Compare commits

...

1103 Commits

a2599dca0f fix missing removal
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-17 11:35:42 -07:00
3fd66b1e73 [Misc] Remove unused virtual engine flag
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-16 23:04:05 -07:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977) 2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] Fix the weights dtype in qwen3_next: it is actually bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test: change has to is (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] Fix ernie45 vl run failure caused by shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC] Disable torch.compile() in top-k/top-p sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporarily Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
0b99f5d302 support flashinfer_fp4 moe for 5090 gpu (#26669)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 15:06:47 -04:00
1f491aa0c8 Vectorize RMS norm variance using vectorize_read_with_alignment (#26234)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 11:54:41 -07:00
de92d916fe [NVIDIA] Add support for cudnn fp4 gemm via flashinfer (#26107)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-15 13:53:00 -04:00
a1063628a4 [Chore] Clean up CODEOWNERS (#26923)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-15 10:52:54 -07:00
d796375258 [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2025-10-15 13:06:17 -04:00
14f8456344 [Feature]: Use pydantic validation in observability.py config (#26637)
Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 16:44:03 +00:00
4794c2bd92 Olmo 3 tool parser and tests (#26143)
Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
2025-10-15 16:36:12 +00:00
d3cbaa08dc Lower severity of log when model info cache misses due to exception (#26917)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 09:01:09 -07:00
828523ad8e [Chore] Separate out vllm.utils.async_utils (#26913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 15:33:00 +00:00
136a17fe6e [Chore] Separate out vllm.utils.func (#26904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 13:03:58 +00:00
f57438338d [BugFix] Patch inductor memory plan logic (#26878)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 12:51:45 +00:00
5d598680e3 chore: remove unused marker (#26890)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-10-15 05:40:33 -07:00
8f4b313c37 [Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 12:11:48 +00:00
f93e348010 [Misc] Remove isort and yapf ignores (#26888)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 12:09:03 +00:00
f54f85129e [Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 11:14:41 +00:00
d4d1a6024f [Lora] Load tuned multi-LoRA kernel configs from JSON files (#26319)
Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-15 09:45:14 +00:00
db1764e4e0 [Platform] allow platform to init dp group (#22243)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 02:32:17 -07:00
7f83b4ee8e [Easy] Get rid of unnecessary parentheses in kv_cache_manager (#26842)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 09:17:43 +00:00
5c3bae1a6a [Fix] Remove divisibility requirement between num_kv_heads and tp_size in bailing_moe (#26876)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-10-15 16:44:04 +08:00
5210dc3940 [Misc] Update TritonLanguagePlaceholder to have attributes that are used by Flash Linear Attention ops. (#26853)
Co-authored-by: Xudong Ma <mxd@meta.com>
2025-10-15 08:37:49 +00:00
650b51f9f9 [doc] add Context Parallel Deployment doc (#26877)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-15 16:33:52 +08:00
6256697997 [Doc] ruff format remaining Python examples (#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 01:25:49 -07:00
71557a5f7c [CI] Fix mypy for vllm/executor (#26845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 01:23:33 -07:00
f3c378ffa7 [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy Tests in CI (#21810)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
2025-10-15 08:09:56 +00:00
f5ed68ef63 [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache gather (#26456)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-10-15 16:05:01 +08:00
efdef57b1f [bugfix] Lazy import cv2 (#26869)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 07:47:50 +00:00
b8a4572157 [Misc] Use helper function to generate dummy messages in OpenAI MM tests (#26875)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 07:17:37 +00:00
302ef403a2 [DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 00:16:44 -07:00
8865da157b [Bugfix][Multi Modal] Fix incorrect Molmo token processing (#26873)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-15 07:13:59 +00:00
f0862eae43 [Graph Partition] pass tests for decorator (#26831)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-15 06:39:48 +00:00
8c851f6d04 [Bugfix] Fix qwen3-omni audio truncation issue (#26815)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-15 05:38:36 +00:00
7cfa420f49 [BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 05:04:32 +00:00
a27b288e4a [Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-15 04:23:44 +00:00
e471d7ca7e [CI/Build][Bugfix] fix qutlass cmake error when set QUTLASS_SRC_DIR (#26773)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 04:09:44 +00:00
c43ca8259e [Docs] Move build.inc into arm.inc (#26862)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-14 20:35:08 -07:00
85a65e7f51 [Model] Add DeepSeek-V3.1 reasoning parser (split from PR #24972) (#25589)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-15 11:09:52 +08:00
a2986b3e33 [Bugfix] Fixes prefix-repetition benchmark script (#26828)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
2025-10-15 02:54:43 +00:00
96b9aa5aa0 [Frontend][torch.compile] CompilationConfig Overhaul (#20283): rename compilation level to compilation mode and deprecate compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 02:51:16 +00:00
e66d787bce Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 02:35:18 +00:00
bfad142e25 [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 02:33:25 +00:00
9354660036 [Bugfix] Fix Qwen3 XML tool parser (#26345)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-10-15 09:50:30 +08:00
07ca70af8d [Core][Easy] Unify all environment variable access to use envs.__getattr__ (#26810)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 01:41:18 +00:00
2dcd12d357 [torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
579d2e5458 [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-14 23:51:54 +00:00
0512c04aee [frontend][gptoss] Add per turn stats into Harmony Context (#25061)
Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
2025-10-14 16:48:13 -07:00
7e0ef4084a [CI Failure] Fix torchao dep failure for Quantization Test (#26824)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 16:41:43 -07:00
4aed506b65 [Core] Streamline some structured output related code (#26737)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 23:27:44 +00:00
a86b4c58e8 remove attn output view kernel (#26680)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 22:53:10 +00:00
ff4810ba73 [Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 14:46:37 -07:00
9d6964926e fix: response_format for completion (#23212)
Signed-off-by: Nan2018 <qinnanjoshua@gmail.com>
2025-10-14 21:23:22 +00:00
0e65818910 Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837)
Signed-off-by: Dhruvil Bhatt <bhattdbh@amazon.com>
2025-10-14 14:21:03 -07:00
380f17527c [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 17:03:21 -04:00
b92ab3deda Notice for deprecation of AutoAWQ (#26820)
Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 13:39:59 -07:00
acaa2c0a4a [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 12:58:43 -07:00
82af928c41 [Attention][Spec Decode] FlashMLA spec decode support (#26541)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-14 19:38:20 +00:00
87efc681db llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-14 11:54:12 -07:00
c3a722fcb2 [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 18:38:59 +00:00
aba48f7db1 [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) 2025-10-14 11:20:39 -07:00
04b5f9802d [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 10:52:05 -07:00
efc8f7d814 Update coveragerc and add codecov.yml for path fixes (#26435)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-10-14 09:45:06 -07:00
6d87a2838c [Config] Remove Unused Environment Variable VLLM_DISABLE_PAD_FOR_CUDAGRAPH (#26743)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 11:47:49 -04:00
e6cdbd6792 Revert "[issues template] Encourage the author implement their own ideas" (#26814) 2025-10-14 08:37:34 -07:00
df850c4912 [Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 08:31:43 -07:00
720394de43 [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046)
Signed-off-by: Qier Li <kevin44036@gmail.com>
2025-10-14 14:38:07 +00:00
88a49745af [issues template] Encourage the author implement their own ideas (#26671)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-14 22:32:36 +08:00
ca683a2a72 use combo kernel to fuse qk-norm and qk-rope (#26682)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-14 09:40:59 -04:00
e9f1b8c9e9 Adjusted the model order of the model registration file (#26798)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-10-14 13:26:11 +00:00
ea97940d6c [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864)
Signed-off-by: yuanyongjie.yyj <yuanyongjie.yyj@antgroup.com>
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2025-10-14 13:07:50 +00:00
fdd32750f0 [CI/Build] Cleanup LoRA test (#26752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-14 12:06:35 +00:00
c715ba3735 [Feature] Change vllm.py with pydantic validation (#26726)
Signed-off-by: Vladislav <vladislav.bronzov@gmail.com>
Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 12:00:54 +00:00
9c4cb68339 [Chore] Remove SupportsV0Only interface and update supported models docs (#26783)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 04:55:10 -07:00
780eb03d9b [CI] Fix test_tool_id_kimi_k2 (#26787)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 10:27:07 +00:00
ef9676a1f1 [Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:21:53 -07:00
70b1b330e1 Don't allow the typos hook to auto-fix by default (#26785)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 03:05:15 -07:00
d1d063a588 [Chore] Use max_transformers_version for Qwen-VL test (#26792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:03:46 -07:00
7e6edb1469 [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-14 09:46:05 +00:00
74704d4553 [Model] Use merge_by_field_config for MM models (O-P) (#26776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:42:45 +00:00
d2f816d6ff [Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:36:21 +00:00
577d498212 [Plugin] Make plugin group clear (#26757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 07:49:59 +00:00
fd85c9f426 [Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
2025-10-14 09:17:39 +02:00
d32c611f45 [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-14 07:04:00 +00:00
01ad27faff [Model][Bugfix] Fix ernie45 load failure due to ernie45 eplb code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-14 06:55:23 +00:00
481545b397 scheduler.py: Update the name of the default scheduler. (#26758)
Signed-off-by: Ryan Li <ryanli@ryanli.org>
2025-10-14 06:52:21 +00:00
d3cc8427c0 [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-13 23:10:23 -07:00
4821ac1b4d [CI] [ROCm] Automate CC list for ROCm related issue (#26753)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-14 13:57:26 +08:00
4497c8f821 Fix LoRA test failures in TPU CI due to the removal of LoRA bias (#26723)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-10-14 13:04:23 +08:00
2e36cdbe2b [Docs] Add a start tag to build.inc.md (#26747)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-13 21:51:55 -07:00
fe3edb4cf0 Add support for the /rerank endpoint in vllm bench serve (#26602)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-14 04:25:43 +00:00
29350922c6 [Feature][Quantization] Add regex support to the auto_round format (#24024)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 03:03:16 +00:00
8ae169286f [torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-14 02:22:16 +00:00
8a0af6a561 [build][torch.compile] upgrade depyf version (#26702)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-14 10:12:09 +08:00
cfded80793 [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 01:46:44 +00:00
b59dd19b55 [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
3e051bda82 [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-13 18:12:52 -07:00
8317f72354 [Misc][DP] support customized aggregated logger for dp (#24354)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-10-13 17:45:59 -07:00
d8bebb008a Add tests for chunked prefill and prefix cache with causal pooling models (#26526)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Ayush Singh <ayush1009208@gmail.com>
2025-10-14 07:45:04 +08:00
35bc22f23c [ResponseAPI] Further polish message serialization and unit tests (#26728)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 23:31:35 +00:00
fa96fb9c70 Pruning kernel Core Tests (#26727)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
2025-10-13 23:08:18 +00:00
e3fdb627d9 [FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 22:47:16 +00:00
7200a21cd1 [Bug] Fix Assertion error DeepEP/csrc/kernels/intranode.cu:928: 'false and Unsupported type' (#26532)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-13 18:26:37 -04:00
577c72a227 [CI Perf] Prune Tests in kernel/mamba (#26538)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-13 18:22:31 -04:00
314285d4f2 [CI] Fix mypy for vllm/distributed (#26593)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 16:02:24 -04:00
d2a7938582 [Frontend][1/N] Improve all pooling task | Support FP16 Embedding Base64 (Still uses fp32 by default). (#26414)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-13 19:06:43 +00:00
89342ce4c0 [Quantization] [Performance] Enable Marlin GEMM kernels for the calibration-free RTN-based quantization (#26051)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Signed-off-by: Alex Kogan <82225080+sakogan@users.noreply.github.com>
2025-10-13 18:52:54 +00:00
f89f599395 [CI][Release][Arm64]: Build arm64 release for gpu arch 8.9 (#26698) 2025-10-13 18:42:12 +00:00
e251e457c5 [Log] Optimize Startup Log (#26601)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 02:06:57 +08:00
afc47e4de7 [Model] Use merge_by_field_config for MM models (M-N) (#26710)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 01:27:01 +08:00
e3b90c1ba2 [Bugfix][Speculative Decoding] Extend Eagle quantization config fix to llama_eagle.py (#26590)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-13 17:17:13 +00:00
134f70b3ed [Bugfix][Rocm] Fix qr error when input shapes differ (#25892)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-13 10:04:21 -07:00
a1b2d658ee [CI/Build] upgrade compressed-tensors to 0.12.2 to address LGPLv3 (#26501)
Signed-off-by: Sangyeon Cho <josang1204@gmail.com>
2025-10-13 12:58:33 -04:00
5c7fe25491 [Misc] Separate prompt logging to debug (#26713)
Signed-off-by: Aleksei Tsvetkov <aitsvet@ya.ru>
2025-10-13 09:04:18 -07:00
53c9a7cee2 [P/D] [NixlConnector] kv load recovery integration (#26171)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-10-13 08:48:04 -07:00
0d21b9b51e [UX] Speedup DeepGEMM warmup with heuristics (#25619)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-13 07:59:27 -07:00
10214b6935 [FEATURE]: Use pydantic validation in multimodal.py config (#26629)
Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 07:56:59 -07:00
4a61950f4d [Hardware][CPU] Disable torch.compile for RISC-V to prevent APIError (#26693)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-13 07:56:01 -07:00
3263799056 [unrevert] Add batch invariant kernel override for FlashInfer backend [2/n] (#26373)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
2025-10-13 10:24:53 -04:00
8e67b2557a [Bugfix] Fix out of bound index issue for Jina-embedding-v3 RoPE with cuda graph (#26687)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-13 03:21:48 -07:00
4073c82c4e [ResponseAPI] Simplify input/output message serialization (#26620)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 09:59:15 +00:00
767c3ab869 [Model][0/N] Improve all pooling task | clean up (#25817)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-13 16:44:50 +08:00
4f207c7174 Ignore large reformatting PRs in git blame (#26690)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 01:20:47 -07:00
782505ed8e [Model] Add reasoning_parser and tool_parser for Ernie45 thinking (#25027)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-13 15:55:20 +08:00
98f30b8cba [Model] Fix Skywork R1V mlp (#26673)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-12 22:42:17 -07:00
3cd36660f7 docs: wrong command in structured_outputs README (#26677)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-10-12 20:59:01 -07:00
46ad73955a [FIX] Throw an exception when the model does not support pooling tasks (#25840) (#25855)
Signed-off-by: zxw <1020938856@qq.com>
Co-authored-by: wang.yuqi <noooop@126.com>
2025-10-12 20:56:21 -07:00
41f3884438 [Bugfix][Core] Fix block table out-of-range issue in priority scheduling (#26661)
Signed-off-by: quanliu <18646313696@163.com>
2025-10-13 01:25:42 +00:00
60e419c1ee [Misc] cache result of disable_inplace (#26666)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-13 00:17:50 +00:00
7ef6052804 [CI/Build] Add tool to build vllm-tpu wheel (#19165)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-12 16:25:40 -06:00
4fca1a1bd2 [easy] Fix pre-commit error on trunk (#26665)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-12 21:25:34 +00:00
a6049be73c [Models][Qwen3VL] Speedup fast_pos_embed_interpolate (#26647)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-13 01:20:07 +08:00
18ed7746ea [Feature] Add support for naver/splade-v3 (BERT-based sparse embedding model) (#26339)
Signed-off-by: gjgjos <gjgjos@naver.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-12 17:00:52 +00:00
8fcaaf6a16 Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-12 09:51:31 -07:00
9bb38130cb [Bugfix] Fix GPU_ID issue in test script (#26442)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-12 11:39:05 +00:00
b91d8db873 [Bugfix][DCP] Set default CUDAGraphMode to PIECEWISE for DCP (#26574)
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
2025-10-12 09:58:38 +00:00
045b396d09 [Bugfix][CI/Build] Fix failing Mteb CI (#26638)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-12 02:42:42 -07:00
76852017ea [MISC] Rename the torch profiler output filename to instance_id+rank_id to allow merging the profiler results of each rank (#25867)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-12 09:29:08 +00:00
82e64c7a20 [PERF] [Qwen3-next] Speed up gated RMSNorm (#26207)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
Signed-off-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-12 08:27:50 +00:00
4ca204055e Add @noooop to codeowner for pooling models (#26652)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-12 14:04:44 +08:00
c5c8f5ea59 [EPLB] Support ernie4.5-moe (#22100)
Signed-off-by: Haisheng Chen <langzs335@outlook.com>
Signed-off-by: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com>
Signed-off-by: Haisheng Chen <hac048@ucsd.edu>
Co-authored-by: Haisheng Chen <langzs335@outlook.com>
2025-10-12 10:40:47 +08:00
01653a917b [compile] Fix inductor partition config (#26645)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-11 21:03:14 +00:00
0cd103e7cb CP: make correct_attn_out robust to 4-D views and fix Triton arg binding (#26509)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-11 20:50:57 +00:00
5be7ca1b99 [Benchmark] Support Infinity API (#26641)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-12 01:45:32 +08:00
f0a30a067b [Bugfix] Fix qwen-moe packed_modules_mapping (#26634)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-11 15:21:33 +00:00
9d6cff3ede [Bugfix][Qwen3VL] fix deepstack in qwen3vl (#26626)
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
2025-10-11 05:58:33 -07:00
a25f2adee9 [compile] Add patched_fused_scaled_matmul_reduce_scatter (#26604)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-11 05:44:43 -07:00
d0bed837ac [Refactor] Reduce duplicate code in serving_chat (#26627)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 12:04:49 +00:00
f7ee69868a [CPU] Fix JSON decode error when the node is '-' (#26562)
Signed-off-by: muzian666 <andylee_2001@163.com>
Co-authored-by: qingan.li <qingan.li@wizpresso.com>
2025-10-11 12:04:04 +00:00
d2a71530c1 Add EAGLE-3 Speculative Decoding Support for Qwen3 MoE (#26485)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-11 10:14:41 +00:00
086609de64 fix(nix): Allow local oneDNN path to fix vLLM CPU build failure (#26401)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-11 09:12:16 +00:00
727144bed1 [Refactor]: Use M-RoPE interface directly while defining model class instead of maintaining model specific M-RoPE implementation in mrope.py (#24172)
Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com>
Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: wwl2755 <wangwenlong2755@gmail.com>
2025-10-11 07:21:04 +00:00
55392bc879 [Bugfix][Multi Modal] Fix incorrect Molmo image processing (#26563)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-10 22:28:23 -07:00
ddaff2938e [MM] Move Qwen3Omni MRoPE impl to model file (#26608)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-10 22:17:24 -07:00
27ed39a347 [XPU] Upgrade NIXL to remove CUDA dependency (#26570)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2025-10-11 05:15:23 +00:00
8f8474fbe3 [CI/Build] Fix ppc64le CPU build and tests (#22443)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-10-11 13:04:42 +08:00
be067861c6 [Frontend] Improve the performance of is_reasoning_end (#25735)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 10:43:39 +08:00
5bc26c438d [BugFix] Make penalties and bad_words work with async scheduling (#26467)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 23:27:04 +00:00
eef921f45e AOT Compilation for torch.compile (Bundled) (#24274)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2025-10-10 19:02:11 -04:00
e317414ce1 Cache the environment variable check for batch invariance (#26510)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-10 22:47:34 +00:00
949cb0170d [BugFix] Fix async scheduling + request preemption (#26385)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 20:29:57 +00:00
e94cfd51da [BUG] Qwen3-next MTP. Fix attn metadata build bug (#26564)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-10 14:59:03 -04:00
7c12763b24 Fix some typing issues found by mypy==1.18.2 (#26596)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-10 18:21:25 +00:00
3b780a4bbb Update CUDA architecture list in build pipeline for 12.9.1 wheels (#26592)
Signed-off-by: Will Eaton <wseaton@users.noreply.github.com>
2025-10-10 11:15:27 -07:00
30f78af147 Update pre-commit hook versions (#26591)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-10 17:03:44 +00:00
19a9b169bf Add Qwen3-Omni moe thinker (#25550)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Xiong Wang <feizi.wx@alibaba-inc.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-10 17:00:56 +00:00
96ad65b7fe [Transform] [Quantization] Add QuTLASS support to vLLM (#24440)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: Andrei Panferov <andrei@panferov.org>
Co-authored-by: Andrei Panferov <andrei@panferov.org>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-10 09:43:40 -07:00
8d2b8c0ff2 [Model] Add FlexOlmo model implementation (#24923)
Signed-off-by: Shane A <shanea@allenai.org>
2025-10-10 09:43:15 -07:00
b2155ed317 [Model][Qwen3VL] Compute cu_seqlens on CPU to remove (#26496)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-10 09:42:17 -07:00
910abdbd08 [Bugfix] fixed top_logprobs: -1 does not appear to work as intended (#26470)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 00:41:17 +08:00
cddce79fda [torch.compile] Make inductor partition rules respect splitting_ops #25691 (#25845)
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-10 16:35:28 +00:00
e519281920 [Metrics] Add test for multi-modal cache stats logging (#26588)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-10 16:00:50 +00:00
7b03584de8 Silu v2 (#25074)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: elvircrn <elvircrn@gmail.com>
Signed-off-by: Elvir Crnčević <elvircrn@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
2025-10-10 15:19:53 +00:00
ae9d0e7da5 [Bugfix] Make DP padding optional in coordinate_batch_across_dp (#26375)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-10 10:53:33 -04:00
0e67102d93 Added test_top_k_per_row to test-pipeline.yaml. (#26569)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-10 10:48:33 -04:00
f4ba2061cf [BugFix][torch.compile] Fix fused_scaled_matmul_reduce_scatter signature for PyTorch 2.8 (#26038)
Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-10 07:42:13 -07:00
1e6848a65d [CI] fix test_run_batch.py::test_completions - AssertionError (#26578)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-10 22:16:28 +08:00
67661375fa [BugFix] Fix noop elimination edge case (#26394)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-10-10 13:33:04 +00:00
213b64452a [Bugfix] Convert untraceable GroupShape to list for AMD impl (#26535)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2025-10-10 13:32:29 +00:00
784c231151 [NIXL] Ignore abort on already-finished request (#25067)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-10 12:21:56 +02:00
606b00e80f [bugfix][DCP] fix block_size of hash in DCP prefix caching (#26296)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-10 03:02:49 -07:00
720d3cd0f0 [CI] fix ruff format (#26579)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-10 03:02:12 -07:00
ab196edefb Remove LoRA bias support (#25807)
Signed-off-by: Ashwin Phadke <ashwinphadke12@rediffmail.com>
Signed-off-by: Ashwin Phadke <23502062+ashwin-phadke@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-10 09:50:33 +00:00
3ee202ea1e [GPT-OSS] Add support for arrays at tool message content (#25593)
Signed-off-by: Luis Tomas Bolivar <ltomasbo@redhat.com>
2025-10-10 09:00:45 +00:00
ad430a67ca [Metrics] Log multi-modal cache stats and fix reset (#26285)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-10 01:45:55 -07:00
6f0f570c43 [deepseek] kernel block size for UniformTypeKVCacheSpecs (#26559)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-10 16:40:41 +08:00
b545a0b207 fix test_simple_inductor_graph_partition (#26522)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-10 06:39:19 +00:00
29255cfc3b [Spec-Decode] Support piecewise cudagraphs for Eagle head (#25109)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-10-10 01:20:31 -04:00
da4455609d [Chore]: One pythonic tool parser test uses the wrong parser (#26515)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-10-10 04:03:55 +00:00
aafb99a4d4 [Core] Small simplification in GPUModelRunner._update_states() (#26508)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 10:53:58 +08:00
757fa4a4da [DP][ray] Support different VLLM_RAY_DP_PACK_STRATEGY (#23849)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-09 19:53:43 -07:00
c6187f55f7 Refactor MistralTokenizer (#26358)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2025-10-09 22:48:58 +00:00
8983e0216f [CI] Fix Pre-commit Issue Cannot determine type of "rank" and "world_size" (#26448)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-09 15:16:48 -07:00
1ee35382cb [Bug] Fix modular_kernel: ZeroDivisionError: integer division or modulo by zero (#26528)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-09 15:13:27 -07:00
6e783bc54b [Bugfix] Fix CUDA graph selection bug in FlashInfer at high concurrency (#26499)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-09 17:12:34 -04:00
c9d33c60dc [UX] Add FlashInfer as default CUDA dependency (#26443)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-09 14:10:02 -07:00
2e54db4d2b [Core] Remove unused prev_sampled_token_ids_invalid_indices input batch field (#26514)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-09 20:22:14 +00:00
44f633dba1 [Flashinfer][gpt-oss] Support FP8-qkv Flashinfer TRTLLM Sinks Attention (#25674)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-09 16:13:39 -04:00
a462331e36 [Bugfix] Disable moe inplace for torch >= 2.9 (#26497)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-09 18:07:38 +00:00
4069db3f2e [Bugfix] Enable padded FP4 quantization (#25947)
Signed-off-by: Roi Koren <roik@nvidia.com>
2025-10-09 10:59:41 -07:00
0d37450eb7 [BUGFIX] Add cu_tokens_across_sp to DPMetadata (#26457)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-09 17:13:56 +00:00
47e66c24e2 [Model] Apply shared experts overlap optimization to all models with shared experts (#26145)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-09 11:31:04 -04:00
3b736e1c38 [Attention][DCP] Support DCP with query length > 1 (MTP) with FA3 (#25049)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-10-09 08:06:29 -07:00
2c1c7dfb35 [Models][Qwen] Replace pad with cat for better performance (#26486)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-09 14:51:26 +00:00
e246ad6f0c Upgrade Pydantic to v2.12.0 and remove hack for Python 3.13 (#26481)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 06:02:40 -07:00
5728da11ea Revert #26113 "[Frontend] CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops" (#26472)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-09 05:43:55 -07:00
92be3f3517 [Feature] Use pydantic validation in parallel.py config (#26417)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 12:41:31 +00:00
d1ddf340c8 [V0 deprecation] Remove QKVCrossParallelLinear implementation (#26475)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-09 10:52:27 +00:00
ec10fd0abc [Bugfix] Move current_platform import to avoid python import cache. (#16601)
Signed-off-by: iwzbi <wzbi@zju.edu.cn>
2025-10-09 10:46:19 +00:00
0426e3c5e1 [Models][Qwen3VL] Optimise _validate_and_reshape_mm_tensor (#26426)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-09 10:25:48 +00:00
4bdf7ac593 [Bugfix] Fix SHM cache initialization (#26427)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-09 02:48:04 -07:00
dc7976dd9f [Misc] Upgrade more code to Python 3.10 (#26463)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-09 10:43:53 +01:00
e4791438ed [Feature] Use pydantic validation in lora.py and load.py configs (#26413)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2025-10-09 02:38:33 -07:00
e6e898f95d [doc] add Volcengine as a compute sponsor (#26477)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-09 17:11:47 +08:00
ddcbc2f334 [Misc] Misc code simplifications (#26450)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-09 02:10:06 -07:00
a83ff278d6 [torchao] Add support for ModuleFqnToConfig using regex (#26001)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-10-09 08:32:32 +00:00
cf4cd6c24f Add: Support for multiple hidden layers in Eagle3 (#26164)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-09 07:30:50 +00:00
b960441812 Enable RMSNorm substitution for Transformers backend (#26353)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 07:28:51 +00:00
1317028aa8 [Model] Gemma3: Fix GGUF loading and quantization (#26189)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-09 07:00:53 +00:00
5e49c3e777 Bump Flashinfer to v0.4.0 (#26326)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-08 23:58:44 -07:00
0d7c3cb51d Update Dockerfile and install runai-model-streamer[gcs] package (#26464)
Signed-off-by: Peter Schuurman <psch@google.com>
2025-10-08 23:48:51 -07:00
1b2c440cd6 [Core] Relax the LoRA max rank (#26461)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-08 23:47:14 -07:00
0f29dca988 [CI/Build] Fix model nightly tests (#26466)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-08 23:44:16 -07:00
d24cf322e1 [Hybrid]: Decouple Kernel Block Size from KV Page Size (#24486)
Signed-off-by: lizhiyuan <uniartisan2017@gmail.com>
Signed-off-by: Zhiyuan Li <uniartisan2017@gmail.com>
2025-10-08 23:43:39 -07:00
d17f0fbf30 [Core][KVConnector] Propagate all tokens on resumed preemptions (#24926)
Signed-off-by: Qier Li <kevin44036@gmail.com>
Co-authored-by: Qier Li <qier@fb.com>
2025-10-09 14:43:31 +08:00
43ab8cfaa5 [MM][Doc] Add documentation for configurable mm profiling (#26200)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-10-08 23:21:20 -07:00
de253d63b7 [Hardware][AMD] Enable FlexAttention backend on ROCm (#26439)
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
2025-10-09 06:20:18 +00:00
8bd696fa53 [Bugfix] Fix another incorrect MM data format in vllm bench throughput (#26462)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-09 05:58:46 +00:00
bb6d8c21f9 [Bugfix] Catch and log invalid token ids in detokenizer #2 (#26445)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-08 21:20:25 -07:00
ebf6ef1a9b [Minor] Change warning->warning_once in preprocess (#26455)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-08 21:09:06 -07:00
0c52d6ef81 [Bugfix] Set the minimum python version for gpt-oss (#26392)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-08 20:35:49 -07:00
467a4f98f1 [Misc] Redact ray runtime env before logging (#26302)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-08 17:43:34 -07:00
e614ab7806 Separate MLAAttention class from Attention (#25103)
Signed-off-by: Naveenraj Kamalakannan <therealnaveenkamal@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-08 17:11:11 -07:00
2a03f93de9 [Attention] Register FLASHMLA_SPARSE (#26441)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-08 22:28:52 +00:00
da364615fc [Kernels] Modular kernel refactor (#24812)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-08 17:51:52 -04:00
f08919b7d1 [Bugfix] Respect min_tokens in scheduler stop check (#26317)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-10-08 14:08:24 -07:00
93f2c0aa08 [Models] Improve iteration over layers (#26425)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 20:48:33 +00:00
4ebc9108a7 [Kernel] Centralize platform kernel import in current_platform.import_kernels (#26286)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-08 20:25:31 +00:00
e1ba235668 [BugFix] Fix failing test quantization/test_compressed_tensors.py::test_compressed_tensors_fp8_block_enabled (#26436)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
2025-10-08 20:04:12 +00:00
b82f4307c9 [Bugfix][Flashinfer] fix VLLM_USE_TRTLLM_ATTENTION issue for models with diff hyperparameters (#25924)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-08 19:54:48 +00:00
76879cc160 [Attention] Implement universal BACKEND_MAP (#25900)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-08 12:00:25 -07:00
b25d7b5657 [Feature] Change cache.py with pydantic validation (#26390)
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 11:12:59 -07:00
e09d1753ec Remove Python 3.9 support ahead of PyTorch 2.9 in v0.11.1 (#26416)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 10:40:42 -07:00
4ba8875749 [Bug] Fix Test in Batch Invariant (#26128)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-08 10:13:47 -07:00
6273fe8d3d [Benchmarks] Fix imports in FP8 tuning script (#26407)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 16:31:59 +00:00
9fb3ae4e6f [Bug] Fix DeepGEMM Attention Test (#26423)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-08 12:23:41 -04:00
76afe4edf8 [Bugfix] Fix vllm bench ... on CPU-only head nodes (#25283)
Signed-off-by: Aydin Abiar <aydin@anyscale.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Aydin Abiar <aydin@anyscale.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-08 16:06:42 +00:00
c1b06fc182 [CI Failure] Fix pre-commit issue for install_nixl_from_source_ubuntu.py (#26424)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-08 07:55:43 -07:00
241b4cfe66 [Refactor] Refactor FP8 & INT8 Quant Folder inside w8a8 (#25293)
Signed-off-by: nicole-lihui <nicole.li@daocloud.io>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: courage17340 <courage17340@163.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: zxw <1020938856@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: chenlang <chen.lang5@zte.com.cn>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: AlonKejzman <alonkeizman@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: zixi-qi <qizixi@meta.com>
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Naman Lalit <nl2688@nyu.edu>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Junhong <liujunhong11@huawei.com>
Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Patrick Toulme <ptoulme@meta.com>
Signed-off-by: Patrick Toulme <pctoulme+1@gmail.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
Signed-off-by: Jialin Ouyang <jialino@meta.com>
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: Juechen Liu <jueliu@meta.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: yingjun-mou <renzomou@gmail.com>
Signed-off-by: zhoukz <me@zhoukz.com>
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Signed-off-by: Lee Nau <lnau@nvidia.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Lucia Fang <fanglu@meta.com>
Signed-off-by: a120092009 <zhaoty0121@gmail.com>
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Signed-off-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
Signed-off-by: Lehua Ding <lehuading@tencent.com>
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
Signed-off-by: anion <1005128408@qq.com>
Signed-off-by: Anion <123177548+Anionex@users.noreply.github.com>
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Salvatore Cena <cena@cenas.it>
Signed-off-by: padg9912 <phone.and.desktop@gmail.com>
Signed-off-by: nadathurv <work.vnadathur@gmail.com>
Signed-off-by: WorldExplored <srreyansh.sethi@gmail.com>
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: billishyahao <bill.he@amd.com>
Signed-off-by: Nathan Scott <nathans@redhat.com>
Signed-off-by: Kenichi Maehashi <maehashi@preferred.jp>
Signed-off-by: Johnny <johnnynuca14@gmail.com>
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnync13@gmail.com>
Signed-off-by: Huamin Li <3ericli@gmail.com>
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
Signed-off-by: Peter Schuurman <psch@google.com>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: huijjj <huijong.jeong@squeezebits.com>
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
Signed-off-by: kyt <eluban4532@gmail.com>
Signed-off-by: Egor <e.a.krivov@gmail.com>
Signed-off-by: Yang <lymailforjob@gmail.com>
Signed-off-by: Paul Pak <paulpak58@gmail.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: Xiang Si <sixiang@google.com>
Signed-off-by: Aleksandr Samarin <astrlrd@nebius.com>
Signed-off-by: Jun Jiang <jasl9187@hotmail.com>
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com>
Co-authored-by: courage17340 <courage17340@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: chenlang <chen.lang5@zte.com.cn>
Co-authored-by: chenlang <10346245@zte.com.cn>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: AlonKejzman <alonkeizman@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com>
Co-authored-by: Iceber Gu <caiwei95@hotmail.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: 阿丹(adan) <47373076+LDLINGLINGLING@users.noreply.github.com>
Co-authored-by: liudan <adan@minicpm.com>
Co-authored-by: liudan <liudan@qq.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Clouddude <kouss.hd@gmail.com>
Co-authored-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: qizixi <22851944+zixi-qi@users.noreply.github.com>
Co-authored-by: Bram Wasti <bwasti@fb.com>
Co-authored-by: Naman Lalit <nl2688@nyu.edu>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Xiaohan Zou <renovamenzxh@gmail.com>
Co-authored-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: Patrick C. Toulme <135739773+patrick-toulme@users.noreply.github.com>
Co-authored-by: Clayton Coleman <smarterclayton@gmail.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Jialin Ouyang <jialino@meta.com>
Co-authored-by: weiliang <weiliangl@nvidia.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: Juechen Liu <grinchcoder@gmail.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yingjun Mou <renzomou@gmail.com>
Co-authored-by: Zhou Jiahao <me@zhoukz.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Lee Nau <lee.nau@gmail.com>
Co-authored-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: acisseJZhong <40467976+acisseJZhong@users.noreply.github.com>
Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Xiaozhu Meng <mxz297@gmail.com>
Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Co-authored-by: a120092009 <33205509+a120092009@users.noreply.github.com>
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
Co-authored-by: Lehua Ding <lehuading@tencent.com>
Co-authored-by: Reza Barazesh <3146276+rzabarazesh@users.noreply.github.com>
Co-authored-by: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: Anion <123177548+Anionex@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: cjackal <44624812+cjackal@users.noreply.github.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Andrew Xia <axia@mit.edu>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Salvatore Cena <cena@cenas.it>
Co-authored-by: Param <psch@cs.unc.edu>
Co-authored-by: Zhewen Li <zhewenli@meta.com>
Co-authored-by: nadathurv <work.vnadathur@gmail.com>
Co-authored-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com>
Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com>
Co-authored-by: billishyahao <bill.he@amd.com>
Co-authored-by: Nathan Scott <natoscott@users.noreply.github.com>
Co-authored-by: Kenichi Maehashi <939877+kmaehashi@users.noreply.github.com>
Co-authored-by: Johnny <johnnync13@gmail.com>
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Co-authored-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Hosang <156028780+hyoon1@users.noreply.github.com>
Co-authored-by: Jerry Zhang <jerryzh168@gmail.com>
Co-authored-by: pwschuurman <psch@google.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: Andrew Xia <axia@meta.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: ahao-anyscale <ahao@anyscale.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Liu-congo <1502632128@qq.com>
Co-authored-by: HUIJONG JEONG <64083281+huijjj@users.noreply.github.com>
Co-authored-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
Co-authored-by: kyt <eluban4532@gmail.com>
Co-authored-by: Egor <e.a.krivov@gmail.com>
Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com>
Co-authored-by: Paul Pak <52512091+paulpak58@users.noreply.github.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: Xiang Si <sixiang@google.com>
Co-authored-by: Aleksandr Samarin <samarin_ad@mail.ru>
Co-authored-by: Jun Jiang <jasl9187@hotmail.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nikhil G <nrghosh@users.noreply.github.com>
2025-10-08 10:20:48 -04:00
9fc983c707 [NIXL][non-cuda] Add install script for nixl with non-cuda ucx (#25959)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-10-08 14:19:53 +00:00
2f99f2f506 Tidy vllm/config/__init__.py to only add classes and functions (#26405)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 07:10:00 -07:00
338b1bf04f [Benchmarks] Add support for Qwen 3 VL MoE tuning (#26419)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 14:01:08 +00:00
e39dc46f8f [CI] Pooling models mteb test disable enforce_eager (#26408)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-08 12:15:36 +00:00
10c75b5439 [Docs] Have mergify leave a comment with the docs preview link (#26412)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 12:04:00 +00:00
f9582fd8f4 [Model] Allow passing custom number of max tiles to Nano 2 VL (#26403)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
2025-10-08 11:19:39 +00:00
f377333bd7 [Misc] add usedforsecurity=False in md5 hash call (#26357)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-10-08 10:18:32 +00:00
f8607863d8 [Feature] Enable E8M0 by Default on Hopper for DeepGEMM, 5% E2E throughput improvement (#26197)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-08 15:33:56 +08:00
335b28f7d1 [TPU] Rename tpu_commons to tpu_inference (#26279)
Signed-off-by: Utkarsh Sharma <utksharma@google.com>
Co-authored-by: Utkarsh Sharma <utksharma@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-10-07 23:30:52 -07:00
5e65d6b2ad fix[DP][v1]: Prevent hangs from mismatched worker configurations (#26218)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
2025-10-07 22:55:08 -07:00
0d4f48fa10 [Bugfix] Incorrect MM data format in vllm bench throughput (#26395)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-08 13:52:19 +08:00
127c8b782a Add gather_indexer_k_quant_cache kernel (#25931)
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-10-08 04:58:57 +00:00
cd9890544b fix(v1/kv_cache): resolve async KV transfer bug in cascade attention (#23485)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
2025-10-08 04:46:33 +00:00
067da2d1df [Core] Simplify setting new_token_ids in CachedRequestData (#26388)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-08 03:32:37 +00:00
046118b938 Add SwigluOAI implementation for CPUFusedMOE (#26347)
Signed-off-by: Sharif Inamdar <sharif.inamdar@arm.com>
2025-10-07 20:17:49 -06:00
b32260ab85 [torchao] safetensors integration (#25969)
Signed-off-by: Angel Li <liangel@meta.com>
2025-10-07 20:12:35 -06:00
f80e7866c0 [Misc] Clean up cruft from previous FlashMLA sparse implementation (#26125)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-08 10:09:34 +08:00
31a4b3e6c4 Revert #24446 and #26168 (#26332)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-07 16:38:19 -06:00
caf8b1c084 [Bugfix] Fix MTP+FlashInfer crash when trtllm kernels are available but disabled (#26361)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-07 22:12:26 +00:00
1b86bd8e18 Add more libraries to rlhf.md (#26374)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-10-07 20:59:41 +00:00
59012df99b [TPU] update TPU benchmark threshold (#25713)
Signed-off-by: Johnny Yang <johnnyyang@google.com>
2025-10-07 13:53:09 -07:00
3d1f67616d [Spec Decode] Enable efficient speculative decoding with FlashInfer-MLA (#25984)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-07 16:05:59 -04:00
6ebaf43ee4 [V1] Logit processors for rejection sampler (#19482)
Signed-off-by: southfreebird <yvorott@gmail.com>
Signed-off-by: Sergei Skvortsov <sergeyskv@nebius.com>
Signed-off-by: Sergei Skvortsov <yvorott@gmail.com>
Co-authored-by: Sergei Skvortsov <sergeyskv@nebius.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-10-07 13:02:49 -07:00
0c824fc46f [Frontend] CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26113)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-07 12:53:43 -07:00
eb577e4655 [Bugfix] Add missing sink tensor into flash attn cascade attn implementation (#26325) 2025-10-07 18:56:39 +00:00
8f36850f73 [Bug] Fix Shape Validation for Fallback while Enabling E8M0 for DeepGEMM (#26322)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-07 13:50:30 -04:00
29fd2662ba [deepseek] add EP8 FusedMOE config for H200 and B200 (#26331)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-07 10:38:54 -07:00
30a3e5af69 [CI] Add Qwen3 MoE NVFP4 to Blackwell lm-eval (#26316)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-07 10:36:15 -07:00
a38c1bfe09 [ci] Rename test_mxfp4_moe.py to test_ocp_mx_moe.py (#26364)
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
2025-10-07 09:52:24 -07:00
320feae6f5 [Model] Lfm2Moe (#26344)
Signed-off-by: Paul Pak <paulpak58@gmail.com>
2025-10-07 16:03:05 +00:00
1e4ecca1d0 [V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 15:42:31 +00:00
c0a7b89d8e [Misc] Move LRUCache into its own file (#26342)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 15:08:40 +00:00
6f59beaf0b [Model] Add support for ModernBertForTokenClassification (#26340)
Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr>
Signed-off-by: antrec <antoine.recanati@gmail.com>
Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-07 14:29:19 +00:00
41f1cf38f2 [Feature][OCP MX] Support mxfp6 and mixed mxfp6-mxfp4 (#21166) 2025-10-07 09:35:26 -04:00
08d26a1b7e [Model] Use merge_by_field_config for MM models (Ovis family) (#26308)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-07 12:54:22 +00:00
63773a6200 [Docs] add docs for cuda graph v1 (#24374)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-07 05:25:05 -07:00
883b42896a Add TRL example notebook to RLHF docs (#26346)
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
2025-10-07 11:31:28 +00:00
e1098ced95 Add topk logits torch op for DS3.2. (#25945)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Signed-off-by: Daniel Cámpora <961215+dcampora@users.noreply.github.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-10-07 10:07:32 +00:00
d100d78eb3 Optimize KV cache distribution for asymmetric pipeline parallelism (#25164)
Signed-off-by: gholmes829 <g.holmes429@gmail.com>
2025-10-07 09:20:30 +00:00
7e4cd070b0 [V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 16:46:44 +08:00
46b0779996 [BugFix] Update KV block hash type from BlockHash to ExternalBlockHash in kv_events_subscriber - #26264 (#26265)
Signed-off-by: atalhens <sneh.lata@nutanix.com>
2025-10-07 08:42:28 +00:00
de342585ff [Model] Define merge_by_field_config MM interface (R-T) (#26260)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 16:10:55 +08:00
185d8ed44f [responsesAPI][bugfix] serialize harmony messages (#26185)
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-07 07:07:53 +00:00
d9836d4517 [Deprecation] Deprecate LLM.set_tokenizer (#26333)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 06:50:57 +00:00
5f7e8a916a [Model] Define merge_by_field_config MM interface (U-Z) (#26261)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 06:45:49 +00:00
4dbdf4a294 [BUG] Fix file parsing for load_format runai_streamer_sharded (#26324)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-07 11:23:07 +08:00
c6873c4e6d [UX] Support nested dicts in hf_overrides (#25727)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-07 11:19:16 +08:00
2111b4643c [Core] Simplify the Dp padding/should ubatch coordination logic (#25768)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-07 01:57:49 +00:00
c50901f3b9 [Docs][DBO] Add initial doc that describes the DBO implementation (#26024)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-07 00:47:28 +00:00
8229280a9c [Misc] Define EP kernel arch list in Dockerfile (#25635)
Signed-off-by: Simon Mo <simon.mo@hey.com>
2025-10-07 00:05:33 +00:00
f77df94647 [Perf] Add decode full-graph support to FlashInfer-MLA backend (#26313)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 23:03:49 +00:00
f231e5bc21 [ROCm] Split AITER unified attention into its own backend (#25507)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-10-06 22:49:23 +00:00
2161efe978 [Bugfix] Allow skipping MoE in NVFP4 (fix for MTP) (#25987)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 16:16:30 -04:00
f23b4c04fd [BugFix] Pad input buffers in _dummy_run (#26209)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-06 16:07:51 -04:00
93540958b8 [Docs] Fix broken table in moe_kernel_features doc (#26314)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-06 15:58:05 -04:00
44b9af5bb2 [Benchmark] Enable MM Embedding benchmarks (#26310)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-06 19:51:58 +00:00
7cd95dc8a3 [Bugfix] Fix gemma3 with transformers backend (#23178)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Raushan Turganbay <raushan@huggingface.co>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-06 18:42:32 +00:00
c02058c222 Add bias handling to CPUFusedMOE kernel (#26289)
Signed-off-by: Crefeda Rodrigues <crefeda.rodrigues@arm.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Crefeda Rodrigues <65665931+cfRod@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Sharif Inamdar <Sharif.Inamdar@arm.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-10-06 18:39:10 +00:00
b2ea5ba677 [Bugfix][Spec Decode] Fix wrong valid_mask for padded speculation when chunked prefill occurs (#26231)
Signed-off-by: seven-mile <i@7li.moe>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 18:24:22 +00:00
824a3f403f [Misc] auto_tune: kill specific vllm process (#26304)
Signed-off-by: Karan Goel <karangoel@google.com>
2025-10-06 18:02:51 +00:00
05f6846ede Support llama3 eagle3 head with llama4 verifier (#25961)
Signed-off-by: rahul-tuli <rtuli@redhat.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-06 13:56:08 -04:00
20db99cc69 [CI Bugfix] Make sure TRTLLM attention is available in test_blackwell_moe (#26188)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-06 13:50:11 -04:00
6431be808f [Tests] conftest: Extending VllmRunner and HfRunner to accept token_ids as input (#26295)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-06 17:19:34 +00:00
4727a8afa7 [Attention] Remove unused reorder_batch method (#24463)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-06 13:13:39 -04:00
b8f603cebe [Model] EVS support for nano_nemotron_vl (#26269)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
2025-10-07 00:23:37 +08:00
fc679696f8 Fix DotsOCR tensor type (#26281)
Signed-off-by: what_in_the_nim <chatcharinsang@gmail.com>
2025-10-06 12:23:43 +00:00
ab5e7d93f4 [Bugfix] Fix mrope in Transformers Backend (#26087)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-06 11:40:50 +00:00
0340f45553 Support expert parallel load balancing in Transformers backend (#26287)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-06 11:20:16 +00:00
19a00eb210 [Model] Use merge_by_field_config for MM models (Llava family) (#26280)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-06 09:45:26 +00:00
391612e78b [Frontend] Consolidate tokenizer init code (#26276)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-06 09:34:52 +00:00
77c95f72f7 [Doc] add KAITO to integrations (#25521)
Signed-off-by: "Abhishek Sheth" <absheth@microsoft.com>
2025-10-06 17:30:03 +08:00
59f30d0448 [Docs] Edit HF Inference Endpoints documentation (#26275)
Signed-off-by: Aritra Roy Gosthipaty <aritra.born2fly@gmail.com>
Signed-off-by: ariG23498 <aritra.born2fly@gmail.com>
2025-10-06 10:13:09 +01:00
43c146ca42 [Misc] Clean up unnecessary E501 ignore (#26274)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-10-06 07:29:18 +00:00
7c2ec0fe87 [Benchmarking] Add disable_shuffle option for dataset loading (#26258)
Signed-off-by: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
2025-10-06 07:05:44 +00:00
039b6bade3 Bump actions/stale from 10.0.0 to 10.1.0 (#26272)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-10-06 07:01:21 +00:00
6c04638214 Fix per file ruff ignores related to line length (#26262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-06 05:12:40 +00:00
91ac7f764d [CI][gpt-oss] Enable python tool tests in CI (#24315)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-10-06 04:20:06 +00:00
4be7d7c1c9 [MISC] Add heheda12345 to CODEOWNERS of vllm/config/cache.py (#26270)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-06 10:58:59 +08:00
59b477645c [Doc] Edited minor typo (#26266)
Signed-off-by: Orange Ng <ngquanhao@outlook.com>
2025-10-05 19:53:09 -07:00
778f554157 [V1] [Hybrid] Some additional clean-up in Mamba2 prefix caching (#26222)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-06 10:40:30 +08:00
d3c84297c3 [CI] Add comment about the single cudagraph capture size that is used (#26252) 2025-10-06 02:35:37 +00:00
f509a20846 [DOC] Update production-stack.md (#26177)
Signed-off-by: Elieser Pereira <elieser.pereiraa@gmail.com>
2025-10-05 21:32:48 +00:00
60bc25e74c [CI] Add Blackwell LM Eval Small Models test to nightly (#26052)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-05 14:59:50 -06:00
b893d661b1 Fix per file ruff ignores related to simplification (#26259)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 20:31:53 +00:00
6b6e98775f [NVIDIA] flashinfer TRTLLM attention prefill token limit (#25998)
Signed-off-by: jasonlizhengjian <jason.li@centml.ai>
Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
2025-10-05 14:24:37 -06:00
9c3c21c519 [CI] fix mamba kernel test (#26250)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-05 18:26:59 +00:00
512b8affa4 Update ruff pre-commit hooks version (#26255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-05 09:50:50 -07:00
1c0c68202c Fix per file ruff ignores related to typing (#26254)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 16:37:55 +00:00
5f317530ec fix(tests): Resolve late binding of loop variable in assert message lambda (#26249)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-05 09:18:22 -07:00
557b2e961d Remove all cases of fmt: on/off (#26253)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 09:18:14 -07:00
4e256cadc2 Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 09:18:11 -07:00
d6953beb91 Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 07:06:22 -07:00
17edd8a807 [Platform][Kernel] platform-specific kernel loading (#25823)
Signed-off-by: Hank <hcc.mayday@gmail.com>
2025-10-05 13:25:15 +02:00
3303cfb4ac [Bugfix][Hardware][RISC-V] Limit supported dtypes to float32 to avoid scheduler segfault (#26228)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-05 10:36:54 +00:00
b7e8e4e6be [Bugfix] Always apply MM processor even when no MM items are passed (#26240)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-05 10:10:20 +00:00
432e1cbc23 [Bugfix]: Assertion error when using FlashInfer backend (#25933)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-05 16:46:36 +08:00
201c971e96 [Perf][Easy] Early stop in request_block_hasher (#26112)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-05 16:46:03 +08:00
e0986ea07b Add documentation for granite 4 tool calling (#26175)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-05 07:35:42 +00:00
a964e5e6c3 [Bugfix] Allow --skip-tokenizer-init with echo and return_token_ids (#26238)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-05 05:38:53 +00:00
78c1d5bfd2 [Easy] Add str repr for IterationStats (#26232)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-05 05:00:21 +00:00
59a85c366e [Model] Use merge_by_field_config for MM models (H-L) (#26230)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-05 11:54:17 +08:00
119f00630b [Renderer] Clean up renderer code (#26216)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-04 17:05:29 +00:00
a42d2df75f [Frontend] Cache chat template kwargs resolution (#26227)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-04 15:32:30 +00:00
5c057e068f [CPU] Refine batch reorder of CPU attention backend (#26096)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-04 21:54:35 +08:00
ed3aeb25a4 [V1] [Hybrid] Remove code to override default CUDA graph configuration (#26226)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-04 13:47:48 +00:00
86ee949128 Fix tensor device and dtype placement in Qwen2VL model (#26219)
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Yuanfeng Li <yuanfengli@meta.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-04 06:41:39 -07:00
4570535ec4 [Model] CLIP Embedding Support (#26010)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-04 06:21:42 -07:00
2a6dc67eb5 [Bugfix] Fix _reqs_to_process leak on abort (#26012)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-04 11:39:31 +00:00
f05fea1f5e [Core] Enable decode of context length equal to max model length (#26168)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
2025-10-04 09:59:26 +00:00
d0df145c2a Add Olmo 3 reasoning parser (#26054)
Signed-off-by: Luca Soldaini <luca@soldaini.net>
2025-10-04 17:48:29 +08:00
1838cd4860 Revert "Add batch invariant kernel override for FlashInfer backend [2/n]" (#26220) 2025-10-04 02:45:08 -07:00
7d6b03381e [CI Failure] fix_test_auto_prefix_cache_support (#26053)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-04 02:44:49 -07:00
7c2e91c4e0 [Misc] Remove unused executor.apply_model (#26215)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-04 01:45:53 -07:00
736fbf4c89 [Misc] Require merge_by_field_config argument (#26214)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-04 01:40:14 -07:00
44ea85137a [Model] Support nested structures for TensorSchema (#26212)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-04 01:20:32 -07:00
d3d649efec Support expert parallel in Transformers backend (#26162)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-04 04:35:04 +00:00
ea507c3a93 [V1] [Hybrid] Mamba2 Automatic Prefix Caching (#25752)
Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com>
Signed-off-by: Thomas Ortner <boh@zurich.ibm.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Thomas Ortner <boh@zurich.ibm.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-04 06:34:22 +02:00
9705fba7b7 [cpu][perf] Accelerate unquantized-linear for AArch64 through oneDNN/ACL and weight prepack (#25948)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-10-04 12:16:38 +08:00
2f7dbc9b42 Add batch invariant kernel override for FlashInfer backend [2/n] (#25769)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-03 19:49:30 -07:00
ea25a76c05 [BugFix] Use async Mistral Tokenizer in Chat Completions (#26134)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-04 09:42:08 +08:00
67bc0c003e [Bugfix] Fix qwen3 vl dummy data generation with overrides (#26193)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-10-04 01:40:20 +00:00
5a05f26603 Fix issue of using only the part of video frame [Nemotron Nano] (#26186)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
2025-10-04 00:21:00 +00:00
7ef40bb983 [GPTOSS][DP/EP][Marlin] Enable GPTOSS DP/EP using Marlin kernels (#25488)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-03 20:13:13 -04:00
767cbb011d [CI] Fix Pre-commit Mypy Error (#26181)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-03 16:08:03 -07:00
7cfa4b24bf [BugFix] Fix de-functionalization pass for rotary_embedding (#23953)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-03 15:44:18 -07:00
b71fcd4905 [Misc] Add penalties sampling parameters to serve tool (#25974)
Signed-off-by: Sergei Skvortsov <sergeyskv@nebius.com>
Co-authored-by: Sergei Skvortsov <sergeyskv@nebius.com>
2025-10-03 15:43:14 -07:00
75003f34e8 [CI] Push multiarch manifests as nightly builds (#25764)
Signed-off-by: Sahithi Chigurupati <chigurupati.sahithi@gmail.com>
2025-10-03 15:42:55 -07:00
78b8015a4d [Bugfix] Relax tokenizer regex for mixtral to include 'tokenizer.model' (#25964)
Signed-off-by: Bowen Bao <bowenbao@amd.com>
2025-10-03 18:31:59 -04:00
831b124151 [responsesAPI] add better error messaging for long prompts (#25724)
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-03 14:33:13 -07:00
c1ffcb55da [Refactor] Optimize FP8 MOE Backend Choice and Log (#26044)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-03 15:23:42 -06:00
0879736aab [Perf] Remove hardcoded num_warps=1 (#26183)
Signed-off-by: Corey Lowman <clowman1993@gmail.com>
2025-10-03 20:38:50 +00:00
a26917332f [Quantization/NVFP4] Speed up TRTLLM NVFP4 MOE weight loading and fix K/V scale loading for MLA Attn (#25968)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-10-03 19:35:06 +00:00
cd9e5b8340 Fix V1 engine serialization error with Ray distributed executor (#26148)
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
2025-10-03 18:39:45 +00:00
300a59c4c3 Avoid division by zero in cache DS MLA kernel (#26174)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-03 17:35:17 +00:00
d76541a6c5 Stop mergify from keeping stale PRs alive (#26169)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-03 16:42:34 +00:00
dd96465fd7 [BugFix][QWEN-VL]fix wrong apply_rotary_emb_torch selection introduced by #24642 (#26123)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-03 08:52:26 -07:00
4f8f47e87e Fix undefined symbol: cutlass_moe_mm_sm100 (#26098)
Signed-off-by: Jun Jiang <jasl9187@hotmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-03 15:48:32 +00:00
d78fda7cda [Renderer] Move Processor out of LLMEngine (#26165)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-03 15:08:22 +00:00
73a99cc2a5 [Model] Fixed stream generator for gpt-oss + spec-decoding (#26027)
Signed-off-by: Aleksandr Samarin <astrlrd@nebius.com>
2025-10-03 13:43:41 +00:00
adae0c1f43 [CI/Build] do not enforce precompilation on tpu ci tests (#25992)
Signed-off-by: Xiang Si <sixiang@google.com>
2025-10-03 13:38:42 +00:00
cbf9221992 [Model] Supplement to PR 24862: Pass param prefix to LLMHead (#25805)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-03 21:34:53 +08:00
5f42fc53b6 [backends][short_conv] CUDA graph piecewise edits (#24215)
Signed-off-by: Paul Pak <paulpak58@gmail.com>
2025-10-03 12:59:48 +00:00
8ee846c27c [Bugfix] Re-enable prefill of max model length (#24446)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
2025-10-03 14:13:34 +02:00
812b7f54a8 [Renderer] Move Processor out of AsyncLLM (#24138)
Signed-off-by: Yang <lymailforjob@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-03 11:29:45 +00:00
5f2cacdb1e Quick fix for IMA with the Prefix Prefill kernel during graph capture (#25983)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-03 11:28:22 +00:00
aa5053e3fe [Doc] Fixed shape description for fused_batched_moe.py (#25668)
Signed-off-by: Egor <e.a.krivov@gmail.com>
2025-10-03 04:00:23 -07:00
79aa244678 [Multi Modal] Configurable MM Profiling (#25631)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-03 03:59:10 -07:00
2ed3f20dba [openai] Fix missing tool usage check (system message) (#24768)
Signed-off-by: kyt <eluban4532@gmail.com>
2025-10-03 18:55:44 +08:00
48f309029a [NIXL][Misc] Expose metrics from NIXL for logging to CLI (#25388)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-03 10:47:59 +00:00
0e93ac0b3a [CI] Fix distributed hybrid tests in CI (#26155)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-03 09:14:18 +00:00
5446ad1d24 [test utils] correct wrong typing (#26159)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
2025-10-03 02:11:49 -07:00
f9a8084e48 [Model] Use merge_by_field_config for MM models (InternVL family) (#26153)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-03 01:59:06 -07:00
3e70e3d4d5 add(v1): RequestStatesStats to RequestOutput (#24947)
Signed-off-by: huijjj <huijong.jeong@squeezebits.com>
2025-10-03 08:56:25 +00:00
eb0fa43868 [Perf] Optimize reshape_and_cache CUDA Kernel (#25955)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Liu-congo <1502632128@qq.com>
2025-10-03 01:33:46 -07:00
0ad9951c41 [Input] Remove unused prompt field (#26097)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-03 00:23:21 -07:00
8c9117181d [Misc] Remove typing.List (#26150)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-03 07:00:33 +00:00
c4b48d3c0f [BUG] Reorder model config creation (#26124)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-03 14:59:36 +08:00
10d765482d FusedMoE support for the Transformers backend (#22650)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-02 23:12:15 -07:00
39b643dc1a [Model] Use merge_by_field_config for MM models (G) (#26117)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-02 22:38:29 -07:00
711f485643 [Bugfix] Fix import gemm_afp4wfp4 failure on AMD (#26068)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-02 22:37:25 -07:00
9c5ee91b2a [ROCm] [VL] [Bugfix] Fix vit flash attn dispatcher logic for ROCm (#26104)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-10-02 22:34:53 -07:00
27edd2aeb4 [Build/CI] Revert back to Ubuntu 20.04, install python 3.12 with uv (#26103)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-10-02 22:21:01 -07:00
e5017cd6d6 [gpt-oss] disable tool server initialization if no tool in request (#25790)
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-03 05:08:35 +00:00
6a7796e871 [Bug]: Limit num_reqs in dummy_run when max_num_seqs is small (#26144)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-03 04:00:20 +00:00
47b9339546 [DeepSeek] Improve performance of DS MLA cache kernel (#26132)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-02 20:35:47 -07:00
5d5146eee3 [CI/Build] Conditionally register cutlass_fp4_group_mm to fix building on Hopper (#26138)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-02 20:32:38 -07:00
2aaa423842 [Attention] Move Backend enum into registry (#25893)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-02 20:32:24 -07:00
ad2d788016 [Bug][Benchmark] Fix duplicate req in oversampling (#26140)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-03 02:55:24 +00:00
36ce76c632 [Log] Optimize DeepGEMM Missing Log (#26106)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-02 20:02:26 -06:00
f1fc2107a3 [Bugfix] Disable cascade attention with FlashInfer (#26130)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-02 16:30:37 -07:00
13cdc02173 Fix MTP with deepep_low_latency (#25904)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-02 21:29:49 +00:00
502640c3f9 [Perf] Fix and reapply move apply w8a8 block fp8 linear to class (#25696)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-10-02 19:35:13 +00:00
3d5f1c8640 [Mamba][KVCacheManager] Simplify kv cache manage logic for mamba + MTP (#25119)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-02 18:48:31 +00:00
1cab2f9cad EAGLE 3: Fix preamble so that measured speedup over Eagle 1 becomes 32% instead of 5% on MTBench (#25916)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2025-10-02 11:29:35 -07:00
1e50f1be70 [Deepseek v3.2] Support indexer prefill chunking (#25999)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-02 10:29:12 -07:00
ad87ba927a [Small] Prevent bypassing media domain restriction via HTTP redirects (#26035)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-10-02 10:27:10 -07:00
decf7f794b [BugFix] Fix FI accuracy issue when used for MLA prefill (#26063)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-02 17:18:13 +00:00
d00d652998 [CI/Build] Replace vllm.entrypoints.openai.api_server entrypoint with vllm serve command (#25967)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-02 10:04:57 -07:00
3b279a84be [CI] Add Blackwell DeepSeek FP8 FlashInfer MoE tests (#26040)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-02 09:07:19 -07:00
5e4a8223c6 [Qwen][ROCm] Flash Attention Rotary Embeddings (#24642)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-02 08:26:08 -07:00
e51de388a2 [Platform][CI] Added OOT platform interface e2e test that running on Ascend NPU (#25470)
Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-10-02 23:19:22 +08:00
cc253b73d3 [Model] Use merge_by_field_config for MM models (D-F) (#26076)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-02 08:17:35 -07:00
7d6fb905d9 [Model] Use merge_by_field_config for MM models (A-C) (#26073)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-02 08:17:31 -07:00
418d111f8c [FA/Chore] Bump vllm-flash-attention (#25537)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-02 11:06:14 -04:00
be8921fbba Change size of single CUDA graph for CI to 4 (#26089)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-02 14:14:28 +00:00
d4e7a1152d Update base image to 22.04 (jammy) (#26065)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-02 05:48:04 -07:00
be22bb6f3d Run:ai model streamer add GCS package support (#24909)
Signed-off-by: Peter Schuurman <psch@google.com>
2025-10-01 20:59:13 -07:00
169313b9f8 [Misc] Make handling of SamplingParams clearer in n>1 case (#26032)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-01 19:31:39 -07:00
0b018d8baf [ROCm][Bugfix] Add missing parameter to ROCm backend (#26029)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-10-01 19:23:14 -07:00
c31246800c Support RL online quantization with torchao (#23014)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-10-01 16:39:29 -07:00
4134312b35 [BugFix] ChunkedLocalAttention is currently not CG compatible (#26034)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-01 16:28:00 -07:00
da554f932e [Bug] Fix Negative Cuda Memory Usage (#25683)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-01 18:16:26 -04:00
aac622e0cd [ROCm][Build] Add support for AMD Ryzen AI MAX / AI 300 Series (#25908)
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
2025-10-01 21:39:49 +00:00
1726e93ef1 [BugFix][DP/EP] Fix CUTLASS MLA hang under load (#26026)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-10-01 12:30:00 -07:00
ee04c0cd04 [CI] Tweaks to GPT-OSS Eval (Blackwell) for stability (#26030)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-01 12:02:17 -07:00
c36f0aa300 Fix test_mamba_ssm_ssd.py due to missing _query_start_loc_to_chunk_indices_offsets (#25995)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-01 18:18:36 +00:00
5234dc7451 [NVIDIA] Blackwell Family (#24673)
Signed-off-by: Johnny <johnnynuca14@gmail.com>
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnync13@gmail.com>
Signed-off-by: Salvatore Cena <cena@cenas.it>
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Co-authored-by: Salvatore Cena <cena@cenas.it>
2025-10-01 10:50:54 -07:00
3b7c20a6b5 [Bugfix] Apply same sampling parameters for both n=1 and n>1 (#26005)
Signed-off-by: Kenichi Maehashi <maehashi@preferred.jp>
2025-10-01 14:37:35 +00:00
f9e714813a [Benchmark] Finish documented v0.11.0 deprecation of --endpoint-type (#26007)
Signed-off-by: Nathan Scott <nathans@redhat.com>
2025-10-01 12:41:57 +00:00
2518230d3e [MISC] Fix misleading batch_size_capture_list when cuda_graph_sizes < 4 (#25829)
Signed-off-by: billishyahao <bill.he@amd.com>
Co-authored-by: Luka Govedic <ProExpertProg@users.noreply.github.com>
2025-10-01 08:39:45 -04:00
a332b84578 [CI] Only capture a single CUDA graph size in CI by default (#25951)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-01 10:03:44 +01:00
1405f0c7ba [Misc] Factor out common _apply_feature_select_strategy (#26003)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-01 01:31:03 -07:00
84d57342b6 [BugFix][MM] Fix Nonetype error when video is cache in qwen2.5-omni-thinker (#26004)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-10-01 08:03:25 +00:00
57b46d769e [Doc] updating torch.compile doc link (#25989)
Signed-off-by: nadathurv <work.vnadathur@gmail.com>
Signed-off-by: WorldExplored <srreyansh.sethi@gmail.com>
Co-authored-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com>
2025-10-01 07:04:56 +00:00
f48b6a03ba [Misc]allow disable pynccl (#25421)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-01 06:04:13 +00:00
2a69ab4899 Update to Transformers v4.56.2 (#24638)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-30 22:07:07 -07:00
8d7da92fd7 [BugFix] Fix default kv-cache-dtype default for DeepseekV3.2 (#25988)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-30 21:58:31 -07:00
e952eee698 [Bugfix] Fix __syncwarp on ROCM (#25996) 2025-09-30 21:15:11 -07:00
66bca9b8bd [MM] Add text-only mode for Qwen3-VL (#26000) 2025-09-30 21:13:42 -07:00
99028fda44 Fix INT8 quantization error on Blackwell GPUs (SM100+) (#25935)
Signed-off-by: padg9912 <phone.and.desktop@gmail.com>
2025-09-30 19:19:53 -07:00
1244948885 [Log] Optimize Log for FP8MOE (#25709)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-30 19:18:43 -07:00
a73f6491c8 Update launch_bounds_utils.h for correct compile on Multiple Cuda Arch - PTXAS out of range Warning (#25843)
Signed-off-by: Salvatore Cena <cena@cenas.it>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-30 19:18:19 -07:00
001e50c92c [Model] MTP fallback to eager for DeepSeek v32 (#25982)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-10-01 01:53:22 +00:00
96ebcaa3ad [Misc] Make EP kernels install script support uv (#25785)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-30 23:38:34 +00:00
5db1870bb9 [gpt-oss] use vLLM instead of openai types for streaming (#25186)
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-09-30 22:47:07 +00:00
2ce26b9b5d [Docs] Remove API Reference from search index (#25949)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-30 22:10:02 +00:00
a388252ac4 Add explicit pooling classes for the Transformers backend (#25322)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-30 23:07:06 +01:00
9a9f48dff7 [V1] [P/D] Add Support for KV Load Failure Recovery (#19330)
Signed-off-by: David Ben-David <davidb@pliops.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
2025-09-30 14:57:08 -07:00
67f3fb0844 [Bench] Add DeepSeekV32 to MoE benchmark (#25962)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-30 14:13:48 -07:00
43b752c325 [Llama4] [multimodal] Fix misplaced dtype cast of cos_sin_cache in Llama4VisionRotaryEmbedding (#25889)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2025-09-30 20:35:15 +00:00
cfd302db9b OffloadingConnector: Fix GPU block tracking bug (#25856)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-30 19:53:04 +00:00
fb610ae684 [Docs] Add moe kernel features doc (#25297)
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-30 19:03:15 +00:00
2f652e6cdf [Doc] Improve MM Pooling model documentation (#25966)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-30 18:58:29 +00:00
e6a226efba [Bug] Fix AttributeError: 'QKVParallelLinear' object has no attribute 'orig_dtype' (#25958)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-30 11:13:03 -07:00
a2e6fa7e03 [bugfix][deepseek] fix flashmla kernel selection (#25956)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-01 00:30:36 +08:00
9f1c4ecaf2 [Bugfix] Token type and position embeddings fail to be applied to inputs_embeds (#25922)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-01 00:23:12 +08:00
ef283548f7 [Bugfix] Fix accuracy issue of TRTLLM FP8 MOE and improve logging (#25895)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-09-30 10:51:31 -04:00
f4db5e6de1 [Bugfix][Model] Fix inference for Hunyuan dense models (#25354)
Signed-off-by: anion <1005128408@qq.com>
Signed-off-by: Anion <123177548+Anionex@users.noreply.github.com>
2025-09-30 14:38:07 +00:00
099aaee536 Add Hugging Face Inference Endpoints guide to Deployment docs (#25886)
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-30 14:35:06 +00:00
35fe398c7c [Kernel][Moe Configs] Add more tuned triton configs for ExpertsInt8 and FP8 (#25858)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-09-30 07:30:44 -07:00
bb6d43047e [Fix] Improve CPU backend compatibility for RISC-V (#25816)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-09-30 13:48:07 +00:00
bc546f76a1 [CI] Move applicable tests to CPU (#24080)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-30 14:45:20 +01:00
80608ba5af [NIXL] Add support for MLA caches with different latent dim (#25902)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-09-30 12:18:29 +00:00
e184c9c510 [perf] Use CPU tensor to reduce GPU->CPU sync (#25884)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
2025-09-30 19:51:16 +08:00
d7e34b4210 [Model] Move vision_feature_select_strategy into resolve_visual_encoder_outputs (#25938)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-30 11:24:57 +00:00
ef6e0e7132 [Bugfix][Model] fix ernie45 moe gate&bias dtype to float32 (#25936)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-09-30 19:11:21 +08:00
1ad3aca682 Updated TRL integration docs (#25684)
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-30 03:10:55 -07:00
8d0afa9b42 [Doc] Add Cambricon MLU support (#25942)
Signed-off-by: a120092009 <zhaoty0121@gmail.com>
2025-09-30 17:59:47 +08:00
fa7e254a7f [New Model] DeepSeek-V3.2 (Rebased to Main) (#25896)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Xiaozhu Meng <mxz297@gmail.com>
Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
2025-09-30 17:14:41 +08:00
e23cacda35 [Bugfix]: Clean up chunked prefill logging when using whisper (#25075)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2025-09-30 08:17:49 +00:00
2e1b8bc2b6 [Model][Bugfix] Fix MiDashengLM audio encoder mask by removing incorrect logical_not (#25925)
Signed-off-by: zhoukz <me@zhoukz.com>
2025-09-30 08:15:23 +00:00
e47433b3c1 [BugFix] Pass config_format via try_get_generation_config (#25912) 2025-09-30 05:09:50 +00:00
23194d83e8 [BugFix] Fix DP/EP hang (#25906)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-30 04:18:59 +00:00
61aedb5ffe Move VllmConfig from config/__init__.py to config/vllm.py (#25271)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-29 19:49:49 -07:00
d3bd171123 [Benchmark] Support benchmark throughput for external launcher DP (#25913)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-30 01:43:57 +00:00
89e4050af4 [Bug] Fix Weight Loading for Block FP8 Cutlass SM90 (#25909)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-30 09:15:19 +08:00
78a47f87ce Test Prompt Embeds/LoRA compatibility and Enable LoRA Support for OPT Models (#25717)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-30 08:10:58 +08:00
6a113d9aed [V0 Deprecation] Remove vllm.worker and update imports accordingly (#25901) 2025-09-29 23:26:11 +00:00
2e4fe48c37 [NIXL] Increase default KV block eviction timeout on P (#25897)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-29 21:35:14 +00:00
8eb0a1d906 [Doc] Polish example for torchrun dp (#25899) 2025-09-29 21:31:34 +00:00
fea3e476aa [Kernel] Chunk-aligned mamba2 (#24683) 2025-09-29 23:18:25 +02:00
61a3431613 [Bugfix][ROCm] Fixing trying to import non-existent symbols from libnccl.so (#25605)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-29 17:01:50 -04:00
9bedac9623 [Doc] Add documentation for vLLM continuous benchmarking and profiling (#25819)
Signed-off-by: Naman Lalit <nl2688@nyu.edu>
2025-09-29 20:49:49 +00:00
c42ff4f4fd [BugFix][torch.compile] KV scale calculation issues with FP8 quantization (#25513)
Signed-off-by: adabeyta <aabeyta@redhat.com>
2025-09-29 15:52:04 -04:00
d5ab28511c [Bugfix] Use correct key "ignore" for config.json non-quantized layers (#25706)
Signed-off-by: Lee Nau <lnau@nvidia.com>
2025-09-29 15:07:29 -04:00
e61eb5e09d [Model] Remove MotifForCausalLM (#25866)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-30 00:36:30 +08:00
0899ba5b42 [CI/Build] Include Transformers backend test in nightly transformers test (#25885)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-29 09:33:39 -07:00
145ac73317 [Bugfix][Speculative Decoding] Fix Eagle3 quantization config issue (#25883)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-09-29 11:37:20 -04:00
d0d138bc55 [Nixl][P/D] Add cuda2cpu support (HD->DH transfer) (#24690)
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
2025-09-29 14:31:51 +00:00
43227236ec [torch.compile] serialize cudagraph_mode as its enum name instead of value (#25868)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-29 13:54:52 +00:00
8616300ae2 [Model][Bugfix] Fix issues in MiDashengLM implementation for quantized models (#25854)
Signed-off-by: zhoukz <me@zhoukz.com>
2025-09-29 10:59:04 +00:00
edbaadd91f [Bugfix] Fix requirements paths in install instructions (#25827)
Signed-off-by: yingjun-mou <renzomou@gmail.com>
2025-09-29 03:49:35 -07:00
9360d34fa1 update to latest deepgemm for dsv3.2 (#25871)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-29 17:51:43 +08:00
1b67b04656 [Misc] Remove more get_input_embeddings_v0 (#25857)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-29 08:03:37 +00:00
bd51f78e39 [V0 Deprecation][Models] Remove all V0 condition for mm embeddings merge (#25331)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: isotr0py <2037008807@qq.com>
2025-09-29 14:09:18 +08:00
65ecb4f134 [Bugfix] Fallback ViT attn backend to SDPA for blackwell (#25851)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-29 06:03:51 +00:00
143844fa43 [XPU]Fix xpu spec decoding UTs, avoid using cuda graph (#25847)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-29 05:15:10 +00:00
219cfbe7f6 Add Phi4FlashForCausalLM to _PREVIOUSLY_SUPPORTED_MODELS (#25832)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-29 05:08:17 +00:00
9b44a7d926 [P/D] NIXL Updates (#25844)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-09-29 04:46:30 +00:00
a3ae45a38c [Misc] fix tests failure by using current_platform (#25825)
Signed-off-by: Juechen Liu <jueliu@meta.com>
2025-09-29 04:18:57 +00:00
0307428d65 Remove redundant cudagraph dispatcher warning (#25841) 2025-09-28 17:12:42 -04:00
471997adf6 [Bugfix] fix Qwen3VLMoe load when pp > 1 (#25838)
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
2025-09-28 17:56:12 +00:00
b1ded114b9 Update GLM-4.5 Doc transformers version (#25830)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-09-28 12:05:51 +00:00
f4e4088c99 Fix random dataset mismatched token length with config. (#24937)
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-28 08:23:44 +00:00
0efd540dbc [VLM] Update Qwen3-VL max_num_video_tokens calculation for configurable video profiling (#25557)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-28 04:21:01 +00:00
6144754014 [Bugfix] Fix Qwen3-VL regression from #24982 (#25814)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-28 03:21:09 +00:00
69311446ba [MM] Optimize memory profiling for scattered multimodal embeddings (#25810)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-28 02:17:58 +00:00
da63274d9f [Bugfix][NIXL] Fix Async Scheduler timeout issue (#25808)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-27 15:17:35 -04:00
c216119d64 [Core] GC Debug callback (#24829)
Signed-off-by: Jialin Ouyang <jialino@meta.com>
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Jialin Ouyang <jialino@meta.com>
2025-09-27 17:53:31 +00:00
5546acb463 [Bug]: Set LD_LIBRARY_PATH to include the 'standard' CUDA location (#25766)
Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
2025-09-27 13:36:28 -04:00
c0ec81836f [torch.compile]: Add VLLM_DEBUG_DUMP_PATH environment variable (#25651)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-27 16:09:00 +00:00
b65e56babe [Core] Refactor self.model() to call a helper for subclassing. (#25084)
Signed-off-by: Patrick Toulme <ptoulme@meta.com>
Signed-off-by: Patrick Toulme <pctoulme+1@gmail.com>
2025-09-27 08:40:59 -07:00
49996cd597 [env] default nixl side port conflicts with kv-event zmq port (#25056)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-09-27 15:02:40 +00:00
ecb37e276a [docs] transcriptions API audio upload (#25446)
Signed-off-by: zxw <1020938856@qq.com>
2025-09-27 15:00:35 +00:00
a5354b3ed2 [Bugfix][WideEP] Apply TP Attn + EP MoE fix to other models (#24982)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-27 14:22:28 +00:00
f9df8b4ad7 [Bugfix] Fix triton import precommit failure (#25803)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-27 07:13:11 -07:00
ec152c8748 Fix GPTQ model loading in Transformers backend (#25770)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-27 12:18:20 +00:00
7977e5027c Add filtering for chat template kwargs (#25794)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-27 10:46:49 +00:00
3f5d902d2a Validate API tokens in constant time (#25781)
Signed-off-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: rentianyue-jk <rentianyue-jk@360shuke.com>
2025-09-27 18:09:26 +08:00
27d7638b94 [Bugfix] Merge MM embeddings by index instead of token IDs (#16229)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-27 08:15:12 +00:00
176173989a [Bugfix] Add missing image_size for phi4_multimodal (#25796) 2025-09-27 07:59:22 +00:00
23b8ee672d [Misc] Update openai client example file for multimodal (#25795)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-27 07:57:07 +00:00
3939152069 [Misc] Fix codeowners override for v1 sample and attention (#25037)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-27 07:47:29 +00:00
cd87bfbf37 [CI/Build] Reorganize root-level V1 tests (#25767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-27 13:51:15 +08:00
b3613e3ace [CI/Build] Add timing to Model Executor Test (#25799)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-26 21:57:27 -07:00
d346ec695e [CI/Build] Consolidate model loader tests and requirements (#25765)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-26 21:45:20 -07:00
c242c98031 [Bugfix] Allow Only SDPA Backend for ViT on B200 for Qwen3-VL (#25788) 2025-09-26 20:44:52 -07:00
f1d53d150c [Multimodal][Speculative Decoding]Eagle Eagle3 mm support, enablement on qwen2.5vl (#22872)
Signed-off-by: Junhong <liujunhong11@huawei.com>
Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
2025-09-27 03:35:47 +00:00
92da847cf5 Add flashinfer-build.sh and register precompiled cu128 wheel in Dockerfile (#25782)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-26 18:54:09 -07:00
3958b96bf5 Add option to restrict media domains (#25783)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
2025-09-27 01:23:52 +00:00
8bf8f45822 [Core] Don't count preempted tokens in prefix cache hit rate (#25787)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-27 00:16:40 +00:00
6f5c0931c1 [Spec decode] automatically disable mm for text-only draft models (#25667)
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
2025-09-27 08:10:21 +08:00
4e33a7ea85 [Bugfix] Optimize CpuGpuBuffer initialization (#25447)
Signed-off-by: Naman Lalit <nl2688@nyu.edu>
2025-09-27 08:07:36 +08:00
dc48ba0c75 Kernel-override Determinism [1/n] (#25603)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-09-26 16:59:09 -07:00
4778b42660 Reduce the Cuda Graph memory footprint when running with DBO (#25779)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-09-26 22:29:56 +00:00
c70ac4b8ff [spec decode] Consolidate speculative decode method name for MTP (#25232)
Signed-off-by: zixi-qi <qizixi@meta.com>
2025-09-26 22:27:05 +00:00
cf89202855 [CI] Fix FlashInfer AOT in release docker image (#25730)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-26 14:11:40 -07:00
f075693da7 [V1] address post issues related to #20059 (part 1) (#23046)
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-26 15:58:19 -04:00
f708bd4904 [CI] Add E2E Blackwell Quantized MoE Test (#25723)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-26 12:23:00 -07:00
0002b7f0d1 [Docs] Add Toronto Meetup (#25773)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-26 12:00:46 -07:00
11aafd9886 [Bugfix] Improve GLM4 MoE Reasoning Parser's is_reasoning_end Condition (#25355)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-09-26 11:54:00 -07:00
b761df963c [Doc]: improve CPU(x86) build-wheel-from-source section (#25617)
Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
2025-09-26 10:26:33 -07:00
33f6aaf972 Eagle3 that supports the Minicpm3 model (#24243)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: liudan <adan@minicpm.com>
Co-authored-by: liudan <liudan@qq.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
2025-09-26 10:04:57 -07:00
56aafa8c0b [Misc] fix unique_filepath (#25732)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-26 16:56:15 +00:00
8d52f2b3a7 [ray][metrics] Replace ':' with '_' for OpenTelemetry compatibility in Ray (#25439)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
2025-09-26 09:43:30 -07:00
984d18498a [BugFix] Fix using dbo_decode_token_threshold always (and ignoring dbo_prefill_token_threshold) (#25622)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-26 16:22:49 +00:00
d4d9899860 [Quantization] Add field to skip unquantized modules for GPTQ config (#25455)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-26 15:47:41 +00:00
db1e42f627 [CI/Build] Fix some V1 tests not being run (#25569)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-26 20:52:36 +08:00
bc9d7b5595 [CI/Build] Split up Distributed Tests (#25572)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-26 14:49:33 +02:00
fe6b19c314 [Bugfix] Properly abort pooling request. (#25734)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-26 05:47:34 -07:00
2827b3f4a3 [CI] Fix test_shared_storage_connector_hashes (#25748)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-26 20:46:17 +08:00
2b6b1d7809 [Model] Mamba2 varlen refactor (#21467)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
2025-09-26 11:31:14 +00:00
633f943e30 [Doc] Update Batch-level DP docs (#25757)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-26 02:37:40 -07:00
b03b1b97f6 Support LongCat-Flash-Chat tool call (#24083)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-09-26 09:25:39 +00:00
dfb9af2014 [Bugfix] Fix Shared Expert/Zero expert code in FusedMoE.process_chunk (#25698)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-26 01:25:28 -07:00
19f76ee68e [misc] refactor speculative config (#25657)
Signed-off-by: zxw <1020938856@qq.com>
2025-09-26 01:22:06 -07:00
dd70437a4f Remove cuda hard-code in compute_causal_conv1d_metadata (#25555)
Signed-off-by: Icey <1790571317@qq.com>
2025-09-26 01:19:20 -07:00
99b3a504c5 [Qwen3-Next][GDN] fixes cuda graph capturing bug in GDN metadata and a stride bug in causal_conv_1d. (#25743)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-09-26 01:18:58 -07:00
6e30010d2f fix: print output in offline_inference/base/chat.py example (#25744)
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
2025-09-26 01:18:24 -07:00
52621c8f5c [Hardware][AMD][Model] Triton MoE tuning configs for GLM-4.5 for MI300X (#25703)
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
2025-09-26 01:18:20 -07:00
d48f4d6daf perf: Avoid copying inputs_embeds tensors to GPU unless prompt_embeds is enabled (#25739)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-26 01:18:09 -07:00
e84e0735c7 fix: revert cast to cpu in MsgpackEncoder._encode_tensor to avoid hidden performance regressions (#25738)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-26 01:18:05 -07:00
3edf87d25f [CI/Build] fix doc build warning: Failed to get 'name: description' pair (#25733)
Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io>
2025-09-26 01:18:02 -07:00
392edee34a EVS Support (Video tokens pruning) (#22980)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-26 11:54:54 +08:00
983056e456 [Misc] Remove unnecessary memoryviews in shm_broadcast.py (#25721)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-26 03:11:44 +00:00
13dd93c667 [Core] Force PIECEWISE CUDAGraph mode for encoder-decoder (#25701)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-25 18:21:56 -07:00
53a30845be Llamas 3.1 405B fp4 changes upstreaming from 355_wip (#25135)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
2025-09-25 19:16:53 -06:00
8b77328ffe [Misc] Don't log shm dequeue delay warning on worker side (#25720)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-26 01:08:30 +00:00
9fe4c2bdb9 [Refactor] Remove DeepGEMM OP Register (#25710)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-25 20:13:41 -04:00
081b5594a2 Fix routing_bias dtype (#25711)
Signed-off-by: Shu Wang. <shuw@nvidia.com>
2025-09-25 23:35:14 +00:00
57329a8c01 [Model] rename NemotronH_Nano_VL -> NemotronH_Nano_VL_V2 (#25708)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
2025-09-25 16:10:29 -07:00
8c435c9bce [Core] Enable command line logging for LLMEngine (#25610)
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-25 15:31:17 -07:00
e71b8e210d [Spec Decode] Add Batch Parallel Ngram. Up to 8x lower overhead. (#24986)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-09-25 15:22:03 -07:00
89fa54e6f7 [Optimization] Use a cheaper cache key in get_model_architecture (#25682)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 17:54:20 -04:00
3d54bdcb73 [Optimization] Streamline InputPreprocessor (#25702)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 21:06:49 +00:00
6b0fcbbf43 [Misc] Simplify test_argsort_mm_positions (#25690)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 18:23:01 +00:00
0fa673af4c [V0 deprecation] Clean up LoRA (#25686)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-25 18:12:33 +00:00
3468f17ebe [V0 deprecation] Remove _VLLM_V1 suffixes from attention backend names (#25489)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
2025-09-25 17:37:50 +00:00
71b25b0d48 [V0 deprecation] Clean up V0 fallback in compilation config (#25675)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-25 17:29:51 +00:00
0ea80c87d9 [Model] Define merge_by_field_config MM interface (#25676)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 17:13:07 +00:00
b8d9e4a326 [Model] Add optional parameter to reasoning parser constructor (#25554)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-26 01:12:50 +08:00
13cc7f5370 [BugFix] Fix DBO hang (#25625)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-25 17:04:48 +00:00
916bd9204d Revert "[Bug] Dynamo Unsupported due to BasevLLMParameter.torch_function calling disabled super()" (#25681)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-25 09:45:06 -07:00
e04a1b6b21 [BUGFIX] Fix crash in Eagle Speculative Decoding models when exceedin… (#24662)
Signed-off-by: AlonKejzman <alonkeizman@gmail.com>
2025-09-25 15:40:14 +00:00
2e5df88c92 [Logging] Remove TORCH_NCCL_AVOID_RECORD_STREAMS to squash a warning (#25532)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-25 15:16:06 +00:00
0754ac4c49 [Misc] Remove cruft file in repo (#25678)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-25 08:05:12 -07:00
03858e6d1c [Bugfix] Fix InternS1 video processing after Transformers v4.56 (#25644)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-25 14:46:04 +00:00
532a6cfccb [ux] Switch a warning to debug about a pytorch fallback (#23750)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-25 14:38:16 +00:00
eb32335e35 [CPU] update torch 2.8 and fix missing fields in TorchSDPAMetadata (#25652)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-25 13:29:11 +00:00
69a8c8e99a [torch.compile] Make Query Quantization Fusable (#24914)
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
2025-09-25 09:25:12 -04:00
6c340da4df [misc] log info messages by default for hanging / busy / idle (#25627)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-25 21:14:57 +08:00
2f17117606 [mypy] Fix wrong type annotations related to tuple (#25660)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 13:00:45 +00:00
1e9a77e037 [Hardware][RISC-V] Add riscv64 support for vLLM with scalar (#22112)
Signed-off-by: chenlang <chen.lang5@zte.com.cn>
Co-authored-by: chenlang <10346245@zte.com.cn>
2025-09-25 20:46:11 +08:00
d2af67441d [XPU][Triton]add xpu config in triton_reshape_and_cache_flash (#25643)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-25 12:38:11 +00:00
0bcc3a160d [CI/Build] Fix flaky entrypoints test (#25663)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 12:19:40 +00:00
70fbdb26e9 Add backward compatibility for guided_... API (#25615)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-25 19:45:25 +08:00
7f570f1caa [V0 deprecation] Remove unreachable model_config.supported_tasks (#25642)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-09-25 11:26:31 +00:00
eaeca3cd7f [Bugfix] Parse SpeculativeConfig Error (#25142)
Signed-off-by: zxw <1020938856@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-25 11:09:39 +00:00
12c1287d64 [mypy] Further improve MM type annotations (#25654)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 10:57:36 +00:00
17b4c6685c [Bugfix] Fix Qwen3-VL max_num_video_tokens calculation for video profiling (#25648)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-25 18:36:01 +08:00
3c2b2ccece [Bugfix] Add triton.language.tensor placeholder (#25649)
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
2025-09-25 10:31:14 +00:00
7be9ffcd9f [Misc] Fix Qwen3-VL video_grid_thw typing (#25646)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-25 10:16:45 +00:00
393de22d2e [fix] Update torch version in cpu-build.txt for AArch64/ppc64le and Darwin (#25579)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-09-25 09:39:18 +00:00
1260180c67 Revert "[Performance] Move apply_w8a8_block_fp8_linear to an op class… (#25607)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-25 08:05:21 +00:00
af4ee63e0e typo: remove duplicate is (#25641)
Signed-off-by: nicole-lihui <nicole.li@daocloud.io>
2025-09-25 00:46:22 -07:00
bc092ea873 Map CwmForCausalLM to llama and LlamaForCausalLM (#25611)
Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-25 07:37:03 +00:00
755ed7b05b [Misc] Simplify PoolerOutput and move to v1/outputs (#25629)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-25 06:47:03 +00:00
a676e668ee [Bugfix] fix apply_temperature to avoid nan in probs (#24734)
Signed-off-by: courage17340 <courage17340@163.com>
2025-09-25 05:32:21 +00:00
c85be1f6dd optimize: eliminate duplicate split_enc_dec_inputs calls (#25573)
Signed-off-by: nicole-lihui <nicole.li@daocloud.io>
2025-09-25 05:03:25 +00:00
845adb3ec6 [Model] Add LongCat-Flash (#23991)
Signed-off-by: yangxurui <yangxurui@meituan.com>
Co-authored-by: yangxurui <yangxurui@meituan.com>
2025-09-24 21:53:40 -07:00
90b139cfff Enable Fbgemm NVFP4 on Dense models (#25609)
Signed-off-by: Saman Keon <samanamp@outlook.com>
2025-09-24 21:12:53 -07:00
4492e3a554 [Bug] Dynamo Unsupported due to BasevLLMParameter.torch_function calling disabled super() (#25613)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-24 18:52:52 -07:00
05c19485a5 [Kernel] Support DCP for Triton backend (#25132)
Signed-off-by: Wei Wei <wwei6@meta.com>
2025-09-24 18:09:34 -07:00
52d0cb8458 [Model] Improve DotsOCRForCausalLM (#25466)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-25 07:58:08 +08:00
5c1e496a75 [MISC] replace c10::optional with std::optional (#25602)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
2025-09-24 16:56:21 -07:00
e7f27ea648 Improve --help for enhanced user experience (#24903)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-24 23:08:18 +00:00
1f29141258 [Refactor] Use DeepGEMM Col Major TMA Aligned Tensor (#25517)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-24 18:52:36 -04:00
6160ba4151 feat: BF16 FlashInfer Fused Cutlass MOE for Hopper and Blackwell Expert Parallel (#25503)
Signed-off-by: Duncan Moss <djm.moss@gmail.com>
2025-09-24 18:50:04 -04:00
fea8006062 [Logging] Improve log for when DeepEP HT disables CUDA Graphs (#25531)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-24 22:43:06 +00:00
e6750d0b18 [V0 Deprecation] Remove unused classes in attention (#25541)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-24 13:24:40 -07:00
8c853050e7 [Docs] Enable fail_on_warning for the docs build in CI (#25580)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-24 19:30:33 +00:00
f84a472a03 Suppress benign cuBLAS warning when capturing cudagraphs with DBO (#25596)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-09-24 19:02:08 +00:00
54e42b72db Support mnnvl all2allv from Flashinfer (#21003)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-09-24 14:38:16 -04:00
2dda3e35d0 [Bugfix] add cache model when from object storage get model (#24764)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-09-24 18:11:16 +00:00
d83f3f7cb3 Fixes and updates to bench_per_token_quant_fp8 (#25591)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 08:30:15 -07:00
302eb941f3 [ROCm][Build][Bugfix] Fix ROCm base docker whls installation order (#25415)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-24 11:25:10 -04:00
487745ff49 [ROCm][Bugfix] Only enable +rms_norm based on aiter if not explicitly disabled (#25275)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-24 11:24:39 -04:00
9313be5017 [Misc] Improve type annotations for jsontree (#25577)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-24 22:49:58 +08:00
8938774c79 Move DeviceConfig, ObservabilityConfig, SpeechToTextConfig to their own files (#25564)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-24 13:59:05 +00:00
e18b714b2e [Bugfix] Fix DeepSeekV31ToolParser to correctly parse multiple tools in non-streaming output (#25405)
Signed-off-by: taohui <taohui3@gmail.com>
2025-09-24 20:58:00 +08:00
b1068903fd [docs] fix nixl kv_connector_extra_config.backends key (#25565)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-24 11:00:27 +00:00
164299500b [Benchmark] Fix regression in structured output benchmark (#25500)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-24 10:40:42 +00:00
58c360d9be [Bug] fix import and unit test (#25558)
Signed-off-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
2025-09-24 10:17:59 +00:00
42488dae69 [Bugfix] Fix dummy video number of frames calculation (#25553)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-24 09:47:30 +00:00
b67dece2d8 [misc] update the warning message (#25566)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-09-24 17:24:35 +08:00
2338daffd3 [BugFix] Potential Fix for FA3 full-cudagraph IMA (#25490)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-24 02:04:04 -07:00
2e19a848d4 [V0 Deprecation] Remove max_seq_len_to_capture (#25543)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-24 01:51:39 -07:00
77a7fce1bb [CI/Build] add nightly prime-rl integration tests (#25207)
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 08:44:22 +00:00
6488f3481b [Misc] Move processing context to multimodal directory (#25548)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-24 08:15:00 +00:00
27ec3c78f3 [CI/Build] Fix v1 OOT registration test (#25547)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-24 08:03:13 +00:00
1cbcfb94de [Bugfix][CPU] Skip unsupported custom op register on CPU (#25534)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-24 06:21:51 +00:00
fed8a9b107 [Misc] Retry HF processing if "Already borrowed" error occurs (#25535)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 22:32:11 -07:00
190c45a6af [TPU][Bugfix] fix the missing apply_model in tpu worker (#25526)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-09-24 05:18:08 +00:00
5caaeb714c [Bugfix] [Frontend] Cleanup gpt-oss non-streaming chat tool calls (#25514)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-09-24 03:20:38 +00:00
d747c2ef18 [Perf] Fix jit compiles at runtime of fla gated delta rule (#25432)
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-24 11:16:13 +08:00
c30b405b8f [Spec Decode] Enable FlashInfer Spec Decoding (#25196)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: lhsjohn <huashuoli@tencent.com>
2025-09-23 22:29:58 -04:00
77d906995c [KV sharing] Re-land Gemma3n model changes from #22628 (#24357)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-09-23 19:25:34 -07:00
359d293006 [fix]: add Arm 4bit fused moe support (#23809)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
2025-09-24 01:32:22 +00:00
9df8da548e [BugFix] Fix MLA assert with CUTLASS MLA (#25478)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 21:09:43 -04:00
bf68fd76a9 [Compile] Fix AMD Compile Error (#25518)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-24 00:42:48 +00:00
de94289a98 [Core] Support weight_loader_v2 for UnquantizedLinearMethod (#23036)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-09-23 18:30:26 -06:00
1983609239 [Bugfix] Use a separate FlashInfer workspace buffer for trtllm-gen (#25520) 2025-09-24 00:19:56 +00:00
d06b5a95cb [V1][Metrics] Add per-request TPOT histogram (#24015)
Signed-off-by: baxingpiaochong <771405853@qq.com>
2025-09-23 18:19:04 -06:00
be0bb568c9 [Model] Support SeedOss Reason Parser (#24263)
Signed-off-by: Yan Lu <luyan@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 18:15:51 -06:00
c8bde93367 [BUG] Allows for RunAI Streamer and Torch.compile cache to be used together (#24922)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-09-23 18:13:32 -06:00
88d7bdbd23 [Bug] Fix AttributeError: 'FusedMoE' object has no attribute 'w13_weight_scale'. Did you mean: 'w13_weight_scale_inv' (#25519)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-24 00:07:51 +00:00
0d235b874a Add CUTLASS FP8 MOE benchmark scripts and kernel config (#25302)
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
2025-09-23 18:07:42 -06:00
7ad5e50adf Improve output when failing json.loads() on structured output test (#25483)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-09-23 18:03:31 -06:00
dc464a3d39 [BugFix] AssertionError: Do not capture num_reqs > max_num_reqs for uniform batch (#25505)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 18:00:29 -06:00
1210e4d95b [Bugfix] [B200] cutlass_mla - ensure kv_split == 1 for batch size > 1 (#25509)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-09-23 16:57:55 -07:00
e0b24ea030 [Perf] Increase default max splits for FA3 full cudagraphs (#25495)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-23 16:53:34 -07:00
bde2a1a8a4 [ROCm] Small functional changes for gptoss (#25201)
Signed-off-by: jpvillam <jpvillam@amd.com>
Co-authored-by: jpvillam <jpvillam@amd.com>
2025-09-23 23:39:50 +00:00
5e25b12236 [Kernel] [Mamba] Remove BLOCK_H=1 from list of tuneable configurations for _chunk_cumsum_fwd_kernel (#25197)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
2025-09-23 23:23:30 +00:00
c85d75cf08 Add VLLM_NVTX_SCOPES_FOR_PROFILING=1 to enable nvtx.annotate scopes (#25501)
Signed-off-by: Corey Lowman <clowman1993@gmail.com>
2025-09-23 22:50:09 +00:00
abad204be6 [BugFix] Fix OOM in vLLM replicas by ensuring consistent NCCL memory accounting (#25359)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-09-23 15:49:09 -07:00
7361ab379f Remove redundant mutates_args and dispatch_key for direct_register_custom_op (#25512)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 22:48:40 +00:00
95bc60e4cb [gpt-oss][bugfix] remove logic to require resp_ in ResponseAPI (#25428)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-23 15:46:46 -07:00
4f2954f724 Fix triton_reshape_and_cache_flash.py triton import (#25522)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 15:26:10 -07:00
eca7be9077 Add VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE & VLLM_ENABLE_INDUCTOR_COORDINA… (#25493)
Signed-off-by: rouchenzi <ruochenwen@gmail.com>
Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com>
2025-09-23 22:17:49 +00:00
969b4da3a6 [V0 Deprecation] Remove placeholder attn (#25510)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-23 22:12:14 +00:00
4f8c4b890a [Core] Use KVCacheBlock as much as possible instead of dict[block_id, KVCacheBlock] (#24830)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-09-23 15:11:14 -07:00
ae002924e9 [CI/Build] Fix and re-enable v1 PP test on CI (#25496)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 21:58:25 +00:00
690f948e4a [Bugfix] Fix for the import error from #24588 (#25481)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-23 21:31:08 +00:00
08275ec0a2 [Build] Update Xgrammar to 0.1.25 (#25467)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-23 21:25:46 +00:00
c828d1bf98 [Bugfix] gpt-oss container tool output bug (#25485)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
2025-09-23 20:43:45 +00:00
8b8a8afc89 [CI] Fix Pre-commit Issue (#25497)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-24 04:09:37 +08:00
8bdd8b5c51 Enable symmetric memory all reduce by default, enabling only for TP (#25070)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 15:53:00 -04:00
a8ffc4f0f2 [Bugfix] Lower gpt-oss max cudagraph size to 992 to be compatible with FA3 (#25508)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 12:49:55 -07:00
d5944d5146 [Speculators][Speculative Decoding] Fix gpt-oss eagle3 accuracy issue (#25406)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
2025-09-23 15:44:35 -04:00
24fab45d96 [Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-23 15:29:26 -04:00
63400259d0 [Performance] Move apply_w8a8_block_fp8_linear to an op class (#24666)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-09-23 12:03:10 -07:00
8c1c81a3de [core] add nccl symmetric memory for all reduce (#24532)
Signed-off-by: Amir Samani <asamani@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 14:33:06 -04:00
a3a7828010 [ROCm] Add skinny gemm bias support for dtypes fp16,bf16,fp8 (#24988)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
Signed-off-by: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com>
2025-09-23 14:31:45 -04:00
5abb117901 [Core] Ensure LoRA linear respect the base_layer's tp_size and tp_rank (#25487)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-23 18:19:25 +00:00
867ecdd1c8 [Spec Decode][CI] Add e2e test for examples/spec_decode.py and prevent breaking Acceptance Length (#24531)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-23 10:46:40 -07:00
24e8222745 [Misc] Reduce initialization time of auto_tune (#23682)
Signed-off-by: Weida Hong <wdhongtw@google.com>
2025-09-23 17:34:58 +00:00
100b630a60 [V1][Kernel] Add triton implementation for reshape_and_cache_flash (#24503)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-23 12:52:40 -04:00
527821d191 Use macro guard CUDA functions for backward compatibility in grouped_topk_kernel.cu (#25346)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-23 09:45:39 -07:00
846197f505 [Log] Optimize kv cache memory log from Bytes to GiB (#25204)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-23 12:44:37 -04:00
2357480b1a [BugFix] Fix UB in per_token_group_quant.cu (#24913)
Signed-off-by: Shreeasish Kumar <shreeasish@rivosinc.com>
2025-09-23 09:14:22 -07:00
f11e3c516b [Kernels] Support blocked fp8 quantization for compressed tensors MoE (#25219)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 16:11:34 +00:00
875d6def90 Add backward compatibility for GuidedDecodingParams (#25422)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-23 17:07:30 +01:00
cc1dc7ed6d [Core/DBO][2/N] Dual-Batch Overlap: add DeepEP High Throughput support and Prefill support (#24845)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-23 16:02:10 +00:00
a903669e10 [V1] Remove V0 code paths for Hybrid models (#25400)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-09-23 08:26:13 -07:00
2c58742dff [UX] Change kv-cache-memory log level to debug (#25479)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-23 08:01:24 -07:00
4c966e440e [XPU] Fix MOE DP accuracy issue on XPU (#25465) 2025-09-23 14:32:57 +00:00
da5e7e4329 [Docs] NixlConnector quickstart guide (#24249)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Peter Pan <peter.pan@daocloud.io>
Signed-off-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-09-23 14:23:22 +00:00
f05a4f0e34 [P/D] Support NIXL connector to disconnect during a clean shutdown (#24423)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2025-09-23 16:08:02 +02:00
61d1b35561 [BugFix] Register expert_map as named buffer for wake_up and sleep (#25458)
Signed-off-by: wuxibin <wuxibin@bytedance.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-09-23 21:49:13 +08:00
b6a136b58c [CI/Build] Fix disabled v1 attention backend selection test (#25471)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 13:05:46 +00:00
0d9fe260dd [docs] Benchmark Serving Incorrect Arg (#25474)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-09-23 06:05:11 -07:00
273690a50a [Core] Optimize LoRA weight loading (#25403)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-23 18:19:45 +08:00
231c2c63e4 [Bugfix] Fix idefics3 tie_word_embeddings (#25454)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 10:06:48 +00:00
4322c553a6 [Test]: Hermes tool parser stream output error in Qwen3 case (#25203)
Signed-off-by: Andreas Hartel <andreas.hartel@aleph-alpha.com>
2025-09-23 17:56:31 +08:00
babad6e5dd [Misc] Move DP for ViT code inside model executor dir (#25459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 09:20:52 +00:00
9383cd6f10 [Frontend] Add a new xml-based tool parser for qwen3-coder (#25028)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-09-23 16:07:27 +08:00
ba8d2165b6 Handle triton kernel import exception (#25319)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-09-23 00:56:00 -07:00
c98be0a232 [Model] Enable DP for ViT in Qwen2-VL (#25445)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-23 05:17:10 +00:00
5774b0a1da [NIXL][OOT platform] support nixl_connector with oot platform and other nixl_backend (#25121)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-23 04:17:42 +00:00
e8db44f883 [DP/EP][GPTOSS] Use triton matmul-ogs kernels for GPTOSS DP/EP (#24588)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-09-22 21:01:09 -07:00
fafbe11af4 [Docs] Fix griffe warnings in vllm/lora/ops (#25369)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-23 03:42:58 +00:00
78237e43bf [Bugfix] Remove contiguous output req for context parallel MLA (#25414)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-22 20:26:32 -07:00
eea1783989 [benchmarks] allow skipping ready check for bench serve (#25420)
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-09-23 03:21:48 +00:00
f225ea7dd9 [XPU] Fix compile_size is None case. (#25433)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-23 03:09:00 +00:00
fc97733da8 [feat] Support MRoPE + YaRN (#25384)
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
2025-09-23 03:04:47 +00:00
4741239db7 [Bug] Fix Long Context OOM Issue (#25290)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-22 22:04:15 -04:00
c625f9043c [V0 deprecation] Remove _set_default_args_v0 function (#25409)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 01:52:09 +00:00
6fa78d8f23 [V0 deprecation] Remove platform v1 controlling interface (#25410)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-23 01:48:12 +00:00
9949aa2ef1 [Perf] Apply torch.compile for per_block_cast_to_fp8 (#24611)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-22 19:42:45 -06:00
0b7bed9c38 [Performance] Remove input pads in cutlass_mla and optimize v_proj output handling (#25184)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-09-22 19:20:53 -06:00
ac0048c0ae [BugFix] [DP/EP] Fix slow execution when BS <= DP (#25407)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Chris Bamford <chrisbam4d@gmail.com>
2025-09-22 17:26:17 -07:00
090197034f [Bugfix] Fix missing clear_connector_metadata (#25397)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-23 08:10:59 +08:00
f31ff87460 [Core] Drop overly aggressive whisper assertion (#25408)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-22 17:09:52 -07:00
d588cd2406 [Bugfix] fix custom op test (#25429)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-09-23 00:07:43 +00:00
45d7d852d3 [Frontend] Responses API MCP tools for built in tools and to pass through headers (#24628)
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Alec S <10566873+alecsolder@users.noreply.github.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-22 23:38:19 +00:00
8bed179109 [TPU] update torch_xla dependency for PyPI compatibility (#25278)
Signed-off-by: Johnny Yang <johnnyyang@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-09-22 16:14:44 -07:00
f552d5e578 [CI/Build] Skip Qwen3-VL initialization tests until models are actually released (#25394)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 13:18:24 -07:00
8db2939289 [KV offload][5/N] Add CPUOffloadingSpec (#24251)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-22 12:30:36 -07:00
d5e0fca264 [torch.compile] Cleanup compilation tests and custom passes, add debug utils, fix DCE bug (#23091), fix test (#24376), and prep for custom op matching (#24604) (#24542)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: luka <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-22 12:30:05 -07:00
8d0ee5a564 [misc] Remove RFC review hours reference (#25416) 2025-09-22 12:16:59 -07:00
922979bfcc [DP] support torchrun external launcher with Data Parallelism (#24899)
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-22 12:06:05 -07:00
239ef0c1ac [CI Failure] Fix fp8 kv cache on <SM90 (#25396)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-22 18:27:51 +00:00
1d7f95b85c [Compiler] Disable Inductor standalone compile by default (#25391)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
2025-09-22 17:37:46 +00:00
cfbee3d0e7 [CLI env var] Add VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH in env variables (#25274)
Signed-off-by: qqma <qqma@amazon.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: qqma <qqma@amazon.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-22 10:37:43 -07:00
06a41334c7 [EPLB] Reduce EPLB Inference Overhead (#24573)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-22 16:31:05 +00:00
175811e3b5 [V1][Attention] Split triton_attn in triton-only and rocm specific backends (#24648)
Signed-off-by: Burkhard Ringlein <ngl@zurich.ibm.com>
2025-09-22 15:20:28 +00:00
c10101a3eb [Bugfix] Fix several issues with p2p xPyD in GET type (#23993)
Signed-off-by: Csrayz <jover@cmbchina.com>
Signed-off-by: ivyilike <pww123@cmbchina.com>
Co-authored-by: ivyilike <pww123@cmbchina.com>
2025-09-22 14:53:13 +00:00
ac243886b0 [Kernel] MI-300X triton moe configs (#23445)
Signed-off-by: Sara Kokkila Schumacher <saraks@ibm.com>
2025-09-22 14:29:54 +00:00
3d2c56b7a9 Make mypy behave like a proper pre-commit hook (#25313)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-22 12:23:45 +00:00
64c824cd78 Make pickle import check fast (#25379)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-22 04:08:25 -07:00
417a164af6 [Misc] Remove unused encoder-decoder error strings (#25374)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 11:04:32 +00:00
b6f01bd9a7 refactor: abstract graph mode support into platform interface (#25161)
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-22 10:22:29 +00:00
4cf71cc88a [TPU] Deprecate xm.mark_step in favor of `torch_xla.sync` (#25254)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-22 10:12:57 +00:00
a66d131381 [TPU][Bugfix][CI] Fix broken tests/build dependency (#25255)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-22 09:55:04 +00:00
21467f9a1c Enable Eagle3 speculative decoding for GPT-OSS model (#25246)
Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com>
2025-09-22 08:50:39 +00:00
f92d952632 [V0 Deprecation] Remove MultiModalPlaceholderMap (#25366)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 08:49:19 +00:00
6d0b827cbd [V0 Deprecation] Remove V0-only methods in multi-modal registry (#25362)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-22 13:58:26 +08:00
0eecb31663 [Bugfix] Fix hermes tool parser handling of non-string argument types (#22002)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-09-22 11:35:39 +08:00
793be8d057 [Docs] GSM8K Accuracy Evaluation doc update (#25360)
Signed-off-by: David Chen <530634352@qq.com>
2025-09-22 02:49:13 +00:00
7b57a433da [Model] Support Dots OCR (#24645)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: yinz-aizip <yinz@aizip.ai>
2025-09-22 02:24:40 +00:00
5aeb925452 Multimodal - audio tests (#25285)
Signed-off-by: Debolina Roy <debroy@redhat.com>
2025-09-22 07:07:11 +08:00
04d3752329 [Bugfix][V0 Deprecation][CI] use async mock and await for async method (#25325)
Signed-off-by: Yang <lymailforjob@gmail.com>
2025-09-22 07:06:16 +08:00
bc6e542d9f Remove V0 attention backends (#25351)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-21 16:03:28 -07:00
af7dfb0d1a [Perf] Further optimization for Qwen3-VL fast_pos_embed_interpolate (#25347)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-21 20:12:45 +00:00
1c3ffdbecc [V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-21 10:37:11 -07:00
c438b2951c feat: Enable engine-level arguments with speculators models (#25250)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
2025-09-21 11:04:45 -06:00
0ff8ebb2d7 [V0 Deprecation] Remove async_output_proc, preemption mode, delay factor (#25334)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-21 08:52:32 -07:00
26e673fe93 [V0 Deprecation] Remove V0 Sequence class & Sampler (#25332)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-21 08:52:15 -07:00
65a5910ce3 [Optimization] Cache chat template result when processor fails to be loaded (#25341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-21 19:41:02 +08:00
9aea7373ff [Bugfix] Typos in error message for missing model config file (#25339)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2025-09-21 04:36:47 -07:00
30d08911f7 [MM][Perf] Minor Optimization on Qwen3-VL fast_pos_embed_interpolate (#25337)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-21 11:05:20 +00:00
cf56cf78b4 [V1] Add sliding window support to Flex Attention backend (#24089)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-21 05:08:07 +00:00
7ed82d1974 [V0 Deprecation] Remove V0 MP executor (#25329)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 21:26:35 -07:00
12dbd834cf [V0 Deprecation] Remove from_seq_group methods (#25330)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 21:10:48 -07:00
035fd2bd2c [Multi Modal][Performance] Fused Q,K's apply_rope in more models (#25005)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-21 03:55:10 +00:00
1cd885bd54 [V0 Deprecation] Remove V0 model runner base & simplify worker base (#25328)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 20:49:09 -07:00
62b38dc832 [Doc] improve test-pipeline.yaml documentation (#25305)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-09-20 20:29:12 -07:00
c99db8c8dd [V0 Deprecation] Remove V0 core (#25321)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:58:26 -07:00
72dd1595b4 [CI] Skip tests failing on main (#25326)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:57:46 -07:00
572ddf83ce [Chore] Remove unused sampler in models (#25324)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 19:53:20 -07:00
86647d1cd0 [V0 Deprecation] Remove V0 Output Processor (#25320)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 17:57:20 -07:00
52c2a8d4ad [V0 Deprecation] Remove LLMEngine (#25033)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-20 17:56:30 -07:00
367a480bd3 [Docs] Fix warnings in vllm/profiler and vllm/transformers_utils (#25220)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-20 16:39:47 -07:00
bef180f009 [V0 Deprecation] Enable the remaining multimodal tests in V1 (#25307)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 17:50:58 +00:00
d88918e4c2 [Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)
Signed-off-by: pengdrumli <pengdrumli@tencent.com>
2025-09-20 21:15:22 +08:00
3c713a9711 [Model] Cleanup InternViT's data parallel implementation (#25306)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-20 05:46:24 -07:00
bf8b26cad1 Generate _ModelInfo properties file when loading to improve loading speed (#23558)
Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-20 11:51:13 +00:00
032d661d27 [Docs] Fix warnings in mkdocs build (continued) (#25042)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-09-20 11:45:18 +00:00
e08a3a3fdb [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-20 08:16:56 +00:00
3d9a1d2de5 [V1] Support LLM.apply_model (#18465)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 07:14:35 +00:00
be874c0201 [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-20 00:04:05 -07:00
9607d5eb44 [Hybrid Allocator] Support full attention with different hidden size (#25101)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-19 23:43:59 -07:00
c60e6137f0 [Optimization] Avoid repeated model architecture conversion for pooling models (#25261)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 13:30:22 +08:00
f91480b2d4 [Bugfix] fix tool call arguments is empty (#25223)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Co-authored-by: xin.li <xin.li@daocloud.io>
2025-09-20 13:29:54 +08:00
6c5f82e5aa [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-20 04:41:23 +00:00
b7f186bbb3 [BugFix] Exclude self when checking for port collision (#25286)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-20 12:28:31 +08:00
3642909617 [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-09-20 11:18:13 +08:00
c308501cb6 Improve weight loading for encoder models in Transformers backend (#25289)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-20 03:11:03 +00:00
535d80056b [Misc] Support more collective_rpc return types (#25294)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-20 02:02:38 +00:00
a25ade5d47 [BugFix] Ensure appropriate guards in destructors (#25284)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-20 09:06:34 +08:00
8945b001db [torch.compile] CUDAGraph Inductor partition integration (#24281)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Signed-off-by: boyuanfeng <boyuan@meta.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-20 01:02:15 +00:00
b8a287a0a8 [docs] Prompt Embedding feature support (#25288)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-19 17:46:23 -07:00
c7e713616a test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-19 17:33:40 -07:00
a36c675817 Don't skip special tokens with hermes-style tool calling (#25281)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-19 17:33:25 -07:00
3da17c2cc2 [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2025-09-19 20:27:21 -04:00
14c1432789 [BugFix] Fix async scheduling CPU tensor race take 2 (#25279)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-19 16:34:07 -07:00
ee7a66dd9a allow disable flashinfer prefill (#25276)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-19 22:59:41 +00:00
431535b522 Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-19 22:40:33 +00:00
711e912946 [Compile] Fix Compile Warning for Ignoring MIN_BLOCK_PER_SM (#25193)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-19 16:23:19 -06:00
e69e0b8b5f [Frontend] Responses API messages out, just harmony for now (#24985)
Signed-off-by: Alec Solder <alecs@fb.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-19 21:40:16 +00:00
ddc9048394 Fix: Correct FusedMoE layer reference in auto_round quantization (#24818)
Signed-off-by: David-Wen <18927700430@163.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-19 20:44:24 +00:00
b1a63d1b3b [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-19 20:36:34 +00:00
48ecb4438b [Perf] Use FlashInfer RoPE for RotaryEmbedding.forward_cuda when available (#21126)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-09-19 14:06:49 -06:00
e57fc15971 Specify platform in pip-compile pre-commit hook so it runs on MacOS (#25273)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 12:43:33 -07:00
4bdf400218 [Bugfix] Fix chunked a2_scales in modular kernels (#25264)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-19 19:42:01 +00:00
7852b82b93 [Bugfix] GPT OSS Attribute error on H100 (#25228)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-09-19 13:14:09 -06:00
a2a5f79e09 Optimize triton unified attention performance for sliding window attention (#24390)
Signed-off-by: zixi-qi <qizixi@meta.com>
2025-09-19 13:07:26 -06:00
c59a0eca42 [KV offload][4/N] Offloading KV connector (#22595)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 19:07:17 +00:00
b716ab93a7 [bugfix] fix structured outputs key missing issue from #24929 (#25195)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-09-19 18:37:57 +00:00
138f0d1e75 [Docs] add __init__.py to vllm/model_executor/layers/quantization/compressed_tensors/transform (#24974)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-19 18:32:27 +00:00
2506ce5189 [Core][Prefix Hash] Fix prefix hash metrics sliding window maintenance (#24990)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-09-19 12:22:53 -06:00
47fd08aaf9 [CI/Build] fix test function_calling (#25072)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-19 12:16:32 -06:00
12aed7e453 Encoder model support for the Transformers backend (#25174)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 19:15:22 +01:00
d90e212a3a Remove Redundant Assignment in Qwen3_VisionPatchMerger (#25224)
Signed-off-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-19 12:15:13 -06:00
2821986450 [Core] Modify the initialization parameters of the lora manager (#25249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-19 18:01:28 +00:00
6c117cff7d [Frontend] Pass API server count to each process (#23717)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-20 01:15:19 +08:00
7ac67ea525 [KV offload][3/N] Add worker-side CPU support (#21448)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 09:53:45 -07:00
ce75e15373 refactor(benchmarks): add type annotations to wait_for_endpoint parameters (#25218)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-19 16:36:52 +00:00
aed16879a9 Move ModelConfig from config/__init__.py to config/model.py (#25252)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 16:22:33 +00:00
cf278ff3b2 Update CODEOWNERS (#25269)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 09:12:55 -07:00
838d7116ba [Qwen] Remove cuda hard-code in qwen3 next (#25243)
Signed-off-by: Icey <1790571317@qq.com>
2025-09-19 12:25:12 +00:00
5089fd749c [V0 Deprecation] Remove V0 logic from get_input_embeddings interface (#25242)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-19 11:10:52 +00:00
a3d087adec [P/D][Nixl] Introduce KVTransferMetrics and aggregation strategy (#22188)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-19 11:09:14 +00:00
058525b997 Move PoolerConfig from config/__init__.py to config/pooler.py (#25181)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-19 11:02:55 +00:00
1dfea5f4a9 [Bugfix][Perf] Misc fixes for Qwen3 VL (#25238)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-19 10:46:16 +00:00
cea91a32f2 [Kernel][Performance] Add Triton kernel for Qwen3-VL interleaved MRoPE (#25055)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 10:27:49 +00:00
a684c0124c [bugfix] fix MHA for models like OpenGVLab/InternVL3_5-38B (#25146)
Signed-off-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 08:45:06 +00:00
f2718d2948 [Misc] Cleanup test conftest for deprecated encoder-decoder models (#25231)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-19 07:44:56 +00:00
825fdb11ad [Bugfix][CPU] Add placeholder to avoid import errors when using fused_moe ops on platforms without triton (#25137)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-19 07:41:12 +00:00
8c1d4acbfe [CPU] Disable oneDNN linear on non-x86 platforms (#25166)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-19 07:27:22 +00:00
486c5599e3 [Build] Update Xgrammar to 0.1.24 to get a CVE fix (#25188)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-19 14:27:17 +08:00
a6149aa587 [OOT] Support sync_model_loading for OOT (#25126)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-09-19 05:41:53 +00:00
6c8a3c099b [Docs] Fix griffe warnings in vllm/multimodal (#25216)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-18 22:10:44 -07:00
31a8a2a7bc [Misc] Clean up MM profiling warnings (#25222)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-19 04:46:57 +00:00
1a0a04dae9 [Perf] Optimize memory peak during EAGLE model loading. (#24585)
Signed-off-by: Chen Ding <candy.dc@alibaba-inc.com>
2025-09-19 03:31:16 +00:00
6d8246aaff [gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-18 19:11:59 -07:00
9d1c50a5ac [KV offload][2/N] Introduce LRU-based CPU offloading management (#20075)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-19 00:20:51 +00:00
9a4600e4dc [CORE] Prompt Embeddings Support for v1 Engine (#24278)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-09-19 08:03:09 +08:00
9fac6aa30b [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-18 14:26:28 -07:00
a53ad626d6 [KV offload][1b/N] rename offloading to kv_offload (#25191)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-18 20:53:52 +00:00
1c3dad22ff [V0 Deprecation] Remove unused async_timeout.py (#25190)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-18 20:35:21 +00:00
d2a30a2d93 [Bug] Fix torch Compilation Cache Hit Error (#25093)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-18 12:38:37 -07:00
75fb112d80 [Bug] Fix returned_lse not Defined issue (#25106)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-09-18 19:32:24 +00:00
38db529f66 [feat]: Create interface for model-specific M-RoPE (#24194)
Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
Signed-off-by: Aziz <azizbenothman76@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-18 19:18:56 +00:00
064cac7bb7 [fix]: remove data type hardcoding from gptoss model implementation (#23807)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
2025-09-18 18:15:23 +00:00
e19bce40a1 [V0 Deprecation] Remove AsyncLLMEngine (#25025)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-18 11:07:42 -07:00
505805b645 [KV offload][1/N] Introduce an offloading component (#19848)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-09-18 10:57:07 -07:00
bbdc0f2366 [ROCm][AITER][Bugfix] Switch AITER to use PIECEWISE_AND_FULL compilation (#25104)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2025-09-18 17:46:47 +00:00
dc34059360 [ROCm][CI/Build] Use ROCm7.0 as the base (#25178)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-18 09:36:55 -07:00
c4cb0af98a [spec decode] Fix MTP inference path for MiMo-7B model (#25136)
Signed-off-by: zixi-qi <qizixi@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-18 09:12:19 -07:00
1c3b1634aa [Misc] Add codeowner for Transformers backend (#25180)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 09:01:50 -07:00
2ea50e977a Enable Allgather/ReduceScatter backend for NaiveAllToAll (#23964)
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-18 15:52:58 +00:00
b419937c78 [Docs] Fix warnings in mkdocs build (continued) (#25163)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-09-18 08:23:26 -07:00
5f696c33b1 [New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-18 23:22:01 +08:00
67244c86f0 feat(api): Return 503 on /health when engine is dead (#24897)
Signed-off-by: dongbo910220 <1275604947@qq.com>
Co-authored-by: Claude <noreply@anthropic.com>
2025-09-18 14:29:40 +00:00
072d7e53e5 [PERF] Add conv1d metadata to GDN attn (#25105)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-09-18 14:27:49 +00:00
01a583fea4 [Kernel] Decouple Tile Size from Block Size in Triton Unified Attention Kernel (#21197)
Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-09-18 14:27:01 +00:00
bc19d75985 [Misc] Add kv-connector label (#25156)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-18 13:56:07 +00:00
fbd6523ac0 Refactor dense FP8 tensor/channel/block utils and add CT FP8 block (#21404) 2025-09-18 08:53:45 -04:00
470484a4f5 [Structured Output][Refactor] Move apply_grammar_bitmask() method from ModelRunner to structured output utils (#21999)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-09-18 20:44:31 +08:00
21da73343a [Misc] Clean up flags in vllm bench serve (#25138)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-09-18 12:43:33 +00:00
66072b36db [Bugfix][Mamba] - Fix Conv State Kernel FP32 Support (#24883)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-09-18 12:21:17 +00:00
3ed1ec4af2 Fix validate-config pre-commit check (#25157)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 12:06:28 +00:00
5a33ae9a3f Fix forward reference warning in documentation (#25150)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 11:41:41 +00:00
c9ff9e6f0c [Docs] add the parallel sampling usage in LLMEngine and AsyncLLM (#24222) 2025-09-18 04:37:08 -07:00
eaffe4486c [Docs] Fix pooling-params doc references in openai_compatible_server.md (#24939) 2025-09-18 04:36:47 -07:00
8ed039d527 Move StructuredOutputsConfig from config/__init__.py to config/structured_outputs.py (#25153)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 11:24:27 +00:00
37970105fe [Model] Improve Pooling Model (#25149)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-18 11:04:21 +00:00
cc935fdd7e [Frontend] Support setting logprobs to -1 (#25031)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-18 10:34:42 +00:00
abdfcd4f3d silu-v1: Fix EPS not being used during max-reduction (#25069)
Signed-off-by: elvircrn <elvircrn@gmail.com>
2025-09-18 10:25:12 +00:00
4f02b77de4 Fix: Add explicit #include <omp.h> for OpenMP compatibility on certain toolchains (#24951)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-09-18 17:43:23 +08:00
29283e8976 [Chore] Cleanup guided namespace, move to structured outputs config (#22772)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 09:20:27 +00:00
05b044e698 [Doc] Fix cross-reference warnings (#25058)
Signed-off-by: Punit Vara <punitvara@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 02:05:16 -07:00
aa3f105c59 Add 'path' option to ImagePrompt data_format (#25081)
Signed-off-by: Gerard Finol <gerard.finol@urv.cat>
2025-09-18 02:02:14 -07:00
ef7eefe17a [Qwen] Add fp8 checkpoint support for qwen3-next. (#25079)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-09-18 08:16:04 +00:00
350c94deb3 [Bugfix] when use s3 model cannot use default load_format (#24435)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-09-18 07:47:43 +00:00
f4cd80f944 Retrieve sliding_window from text config in Gemma3 MM (#25085)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-18 06:29:05 +00:00
349e0e3462 [Docs] Fix API Reference (#25140)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-17 23:23:29 -07:00
81b16a2bc9 [Kernel] Better inf handling for grouped topk cu (#24886)
Signed-off-by: lumina37 <starry.qvq@gmail.com>
2025-09-18 05:53:55 +00:00
e111d5b0ae [CLI] Use streaming in CLI chat and completion commands (#23769)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-17 22:30:26 -07:00
a904ea78ea [benchmark] add peak throughput metrics and plot (#23867)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-17 22:30:02 -07:00
b7433ca1a4 [Spec Decode] Efficient padded speculation (#24539)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-09-18 01:07:24 -04:00
5c65a72bb1 [V0 Deprecation] Remove more V0 tests (#25117)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 22:05:25 -07:00
9d8a2d86d2 [EPLB] Add EPLB support for hunyuan_v1 (#23078) 2025-09-18 04:51:35 +00:00
3bc18127ff [XPU] Whisper model support on XPU Platform (#25123)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2025-09-18 04:30:10 +00:00
bec060fd99 Mark prompt logprobs as incompatible with prompt embeds at API level (#25077)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-17 21:25:07 -07:00
52bc9d5b3e [Model] enable data parallel for InternVL vision encoder (#23909)
Signed-off-by: Yiwen Chen <yiwen66@berkeley.edu>
Signed-off-by: YiwenC <54658925+666even666@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-17 21:11:46 -07:00
dc2979c585 [Kernels] Overlap shared experts with combine instead of dispatch (#24254)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-18 12:10:21 +08:00
027d37df38 [Bugfix][Qwen3-Next] add prefixes to shared_expert in qwen3-next and mlp in qwen2moe to successfully load ignored params in quantized models (#24960)
Signed-off-by: toncao <cpatonn@gmail.com>
Co-authored-by: toncao <cpatonn@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-18 12:08:50 +08:00
b98219670f [Core][MM] Cleanup MultiModalCache (#25006)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-09-17 21:08:41 -07:00
32baf1d036 [Docs] Clean up the contributing README (#25099)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-17 21:05:18 -07:00
3127274d02 [MM Encoder] Apply DP ViT for Qwen3-VL model series (#24955)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-17 21:04:21 -07:00
4ac510f484 [Kernels] Enable DeepGEMM by default (#24462)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 20:19:52 -07:00
7fb2a5be28 [V0 Deprecation] Skip PP test (#25128)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 20:18:36 -07:00
6c036615dc [V0 Deprecation] Remove misc V0 tests (#25118)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 19:41:55 -07:00
2fc24e94f9 [V0 Deprecation] Remove V0 Tracing & Metrics tests (#25115)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 19:40:44 -07:00
2c3c1bd07a [V0 Deprecation] Remove V0 Engine tests (#25114)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 19:38:09 -07:00
5963b98b46 [Kernel] Delegate construction of FusedMoEQuantConfig to FusedMoEMethodBase subclasses (#22537)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-09-17 17:43:31 -06:00
e6585ddb45 [Bugfix] Fix accuracy issue for silu_mul + nvfp4 quant fusion kernel (#24833)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-17 16:37:23 -07:00
2a4d6412e6 Add a batched auto tune script (#25076)
Signed-off-by: Karan Goel <karangoel@google.com>
Signed-off-by: Karan Goel <3261985+karan@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-17 22:41:18 +00:00
e67a79db03 [Bugfix] Refactor Flashinfer TRTLLM attention kernel selection logic (#24600)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-17 15:36:29 -07:00
9f882d8791 Disable failing GPT-OSS Eval (Blackwell) for now (#25107)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-17 15:36:00 -07:00
1a456c7c90 Aiter mha fp8 fix (#24991)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
2025-09-17 22:29:14 +00:00
fedb75fa27 [Bugfix][B200] Fix cutlass_mla hang (#24966)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-17 18:06:38 -04:00
bff2e5f1d6 [gpt-oss][2] fix types for streaming (#24556)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-17 22:04:28 +00:00
3c068c637b [Kernel] Faster pre-processing time for W4A8 (#23972)
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
2025-09-17 14:35:32 -07:00
f20c3b0951 [BUG] Exclude .pth files when pulling remote files (#25092)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-09-17 20:42:09 +00:00
883131544f [Bugfix] Update import path for bc_linter_include (#24766)
Signed-off-by: Mohammad Miadh Angkad <mangkad.bsdsba2027@aim.edu>
2025-09-17 20:33:11 +00:00
ee5fd49150 [Misc] Update owners for KV connector and V1 offloading (#25041)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
2025-09-17 12:37:29 -07:00
7ae9887542 [V1] Logits processor docs (#22919)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Signed-off-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Co-authored-by: Joseph Marinier <Joseph.Marinier@gmail.com>
2025-09-17 11:53:12 -07:00
e3db5ebb66 [CI Bugfix] Fix failing test_model_load_with_params tests due to tokenizer refactor (#25086)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-17 11:15:05 -07:00
9d442b7c48 [V0 Deprecation] Remove V0 tests in test_sequence.py (#25088)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 11:08:45 -07:00
eb68c2dcd9 [CI] Revert back prepare_prompts and check_answers (#25087)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 11:03:16 -07:00
8b32464ac1 Change log level from info to debug for IOProcessor (#24999)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-17 10:21:28 -07:00
99cc41ad50 [V0 Deprecation] Remove unused output processor util (#25023)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-17 09:50:07 -07:00
d6a518fdde Remove unused find_cuda_init helper script (#25044) 2025-09-17 09:47:40 -07:00
4aa8c7b047 cleanup: remove adapter commons (#25045)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-17 16:46:29 +00:00
4b946d693e [V0 Deprecation] Remove V0 Core tests (#25082)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-09-17 09:32:42 -07:00
087c6ffc92 [CI Bugfix] Fix failing test_invalid_env (#25078)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-17 08:28:58 -07:00
4a2d33e371 [Docs] vllm/benchmarks/datasets.py fix docstring param format. (#24970)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-17 08:11:51 -07:00
8f3616f422 Remove old cutlass mla (#23961)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-09-17 14:31:43 +00:00
47f670b03b [Docs] improve code formatting and comments for eliminate griffe build warning. (#25010)
Signed-off-by: samzong <samzong.lu@gmail.com>
2025-09-17 07:31:20 -07:00
dd6a910aac [Bugfix][Qwen3-Next] fixes the varlen issue in qwen3-next's MTP implementation. (#24957)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-09-17 21:59:09 +08:00
1b962e2457 [fix] lora benchmarks pass no_lora_flag_cpu (#23774)
Signed-off-by: Dylan Maloy <34420038+dolpm@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-17 21:22:25 +08:00
bfe9380161 Apply fixes for CUDA 13 (#24599)
Signed-off-by: Aidyn-A <aidyn.b.aitzhan@gmail.com>
2025-09-17 09:15:42 -04:00
9fccd04e30 [Bugfix] Fix Stream usage in CPU model runner and OneDNN kernel check (#25046)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-17 05:54:02 -07:00
252ada5559 Add RADIO Vision Encoder Support to vLLM (#24595)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
Co-authored-by: root <root@cw-dfw-h100-001-305-026.cm.cluster>
2025-09-17 05:53:30 -07:00
e120533d7a [Misc] Avoid use of deprecated AutoModelForVision2Seq (#25065)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-09-17 12:19:15 +00:00
2b85697031 [BugFix] enable DOTALL to match multi-line tool_call parameters in extract_tool_call_required_streaming (#24668)
Signed-off-by: Shijun Yin <shijun.yin@outlook.com>
2025-09-17 09:21:18 +00:00
544fe76b95 [Frontend] Support returning all prompt logprobs (#24956)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-09-17 09:03:52 +00:00
bb58dc8c20 [DP] Create placement groups by ray_device_key (#25026)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-17 08:57:25 +00:00
0fb2551c23 [Docs] Fix griffe warning in base_static_graph.py (#25018)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-17 08:49:19 +00:00
6c47f6bfa4 [Core] Remove tokenizer group in vLLM (#24078)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-09-17 08:42:59 +00:00
c15309a730 [Model] Apply SharedFusedMoE to glm4_moe. (#24849)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-17 16:02:31 +08:00
4a9375fe9d [Model] Pass param prefix to LLMHead (#24862)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-17 16:01:27 +08:00
03191cd8f0 [Core][MultiModalHasher] Hash images without converting image mode (#24969)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-09-17 00:57:34 -07:00
b77bf34e53 [EPLB] Support EPLB for Mixtral Model (#22842)
Signed-off-by: rouchenzi <ruochenwen@gmail.com>
Signed-off-by: rouchenzi <40842833+rouchenzi@users.noreply.github.com>
Co-authored-by: Bowen Wang <abmfy@icloud.com>
2025-09-17 07:27:34 +00:00
dd39baf717 [XPU] Fix xpu model runner call torch.cuda APIs (#25011)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-17 06:45:25 +00:00
43a62c51be Add more documentation and improve usability of lognormal dist (benchmark_serving_multi_turn) (#23255)
Signed-off-by: daniels <daniels@pliops.com>
2025-09-17 05:53:17 +00:00
ca2d1925ef [Rocm] [quantization] Fix quark ptpc moe and add test case (#24649)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
Co-authored-by: Haoyang Li <haoyang.li@amd.com>
2025-09-16 22:15:13 -07:00
0f7acdd73c [Model] Support Qwen3-VL Model Series (#24727)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-17 05:01:04 +00:00
5801e49776 [V0 Deprecation] Remove MQLLMEngine (#25019)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-16 21:29:27 -07:00
58d4c705a8 [Core] Get num_encoder_tokens from scheduler config (#24989)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-09-16 20:59:07 -07:00
ea3de5ef0d [misc] fix typo in value error (#24995)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
2025-09-16 20:58:38 -07:00
67532a1a68 [UX] Remove "quantization is not fully optimized yet" log (#25012)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-16 20:57:51 -07:00
5672ba90bd [Docs] fix invalid doc link (#25017)
Signed-off-by: zxw <1020938856@qq.com>
2025-09-16 20:53:23 -07:00
dd83a157f1 [UX] Enforce valid choices for envs like VLLM_ATTENTION_BACKEND, etc (#24761)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-09-16 20:42:23 -07:00
5a411ef6c4 [Benchmarks] Add MMVU video dataset support and clean up deprecated datasets (#24719)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-17 03:29:43 +00:00
eeb135eb87 [Core] Use CpuGpuBuffer for block table tensors (#24795)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-09-16 19:18:06 -07:00
3059b9cc6b [Doc] Add --force-overwrite option to generate_cmake_presets.py (#24375)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-09-16 18:45:29 -07:00
64ad551878 Removes source compilation of nixl dependency (#24874)
Signed-off-by: bbartels <benjamin@bartels.dev>
Signed-off-by: Benjamin Bartels <benjamin@bartels.dev>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com>
2025-09-17 01:33:18 +00:00
cef32104b4 [FP8] Extend per-token-group quantization support to QuantFP8 (#24342)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
2025-09-16 18:31:06 -07:00
493b10f8bf [CI] GPT-OSS GPQA eval test for Blackwell (#24920)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-16 18:13:21 -07:00
d119fc8614 [CI][Bugfix] Fix failing Blackwell test (#24993)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-16 15:55:02 -07:00
dbebb7f812 [Perf] Reuse workspace for FP8+FP4 Marlin MoE (#20500)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-09-16 15:45:10 -06:00
3053a22b33 fp8 kv cache support fix for torch.compile (#22758)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
2025-09-16 21:27:11 +00:00
02d4b85454 Use kwargs for long lists of EngineCoreRequest arguments in tests and fix extra kwargs (#24987)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-09-16 14:06:56 -07:00
86daa875fe [gpt-oss][1][bugfix] fix streaming final output (#24466)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-16 13:56:16 -06:00
dcf2f3ec06 [ROCm] Add dependencies for ROCm (#24900)
Signed-off-by: Yida Wu <yida.wu@amd.com>
2025-09-16 19:49:06 +00:00
218454b9b2 [MISC] Add code owners of vllm/v1 to vllm/v1/core (#24928)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-16 19:07:34 +00:00
f4d6eb95cf [gpt-oss][1b] streaming add item id, content id (#24788)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-16 18:41:12 +00:00
cd1f885bcf Directly get max encoder len from VLLM config in V1 (#24866)
Signed-off-by: Sugar-zsg <952242923@qq.com>
2025-09-16 17:52:31 +00:00
d593cf28fa [Misc] Add removed encoder-decoder models to previously supported models list (#24961)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-16 10:46:46 -07:00
faa7a5daac [Bugfix] Fix unable to run encoder model when disable_hybrid_kv_cache_manager is true (#24571)
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-09-16 17:36:58 +00:00
567939953b [Core/DBO][1/N] Add Dual-Batch Overlap mechanism to VLLM (#23693)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-16 12:21:48 -04:00
08369289af [Core][MultiModalHasher] Don't convert memoryviews to bytes during hashing (#24925)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-09-16 15:32:47 +00:00
73cfb3c5ee [Model] Clean up and simplify Mamba2 Metadata Usage in both V0 and V1 (#24331)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-09-16 14:53:43 +00:00
4e5affeaa1 [CI] Add Decode Context Parallelism (DCP) test to CI (#24487)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-09-16 21:21:28 +08:00
e4f0b4cd96 (doc): set cmake c++ compatible standard when building on MacOS CPU. (#23483)
Signed-off-by: teekenl <teekenlau@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-16 06:08:46 -07:00
de3e53a75b feat: Add Grafana and Perces monitoring dashboards for vLLM (#23498) 2025-09-16 05:53:40 -07:00
85e0df1392 [Docs] move benchmarks README to contributing guides (#24820) 2025-09-16 05:52:57 -07:00
0faf3cc3e8 Move SpeculativeConfig from config/__init__.py to config/speculative.py (#24904)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-16 12:51:35 +01:00
7ea5c73ad7 [Feat][EPLB] A novel static EPLB placement strategy for MoE models. (#23745)
Signed-off-by: bruceszchen <bruceszchen@tencent.com>
Signed-off-by: Chen Bruce <bruceszchen@tencent.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Chen Bruce <cszwwdz@vip.qq.com>
Co-authored-by: lemon412 <lemon412@foxmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-16 10:55:16 +00:00
27fcfe7bcf [Mamba] Support TP>1 with quantization for mamba2 mixer in case n_groups % tp_size == 0 (#24593)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-16 10:51:01 +00:00
68dbde5dbb [Bugfix] remove duplicate tokens streamed in required tool choice streaming (#23312)
Signed-off-by: Jason Cheng <jasoncky96@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-09-16 15:16:32 +08:00
04ad0dc275 [benchmark] Add triton version in the moe tuned config (#24769)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-16 14:10:54 +08:00
238c4c1705 [QWEN NEXT] Fused MoE kernels Optimization configs (#24924)
Signed-off-by: Saman Keon <samanamp@outlook.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-09-16 13:06:03 +08:00
8c54610265 [Bug] [Spec Dec]: Fix kv_cache dtype mismatch for Eagle3 drafter on FP8 target (#24505)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-09-16 04:45:38 +00:00
17871983a2 [Bugfix] Fix sequence parallelism bug when enable pipeline parallelism (#24021)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-09-16 04:32:32 +00:00
759ef49b15 Remove V0 Encoder-Decoder Support (#24907)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-15 21:17:14 -07:00
5206ab20ba [XPU] Fix circular import error. (#24927)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-09-16 03:35:36 +00:00
0af3ce1355 Upgrade flashinfer to 0.3.1 (#24470)
Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-09-16 02:36:09 +00:00
e1279ef00f [Docs] Update instructions for how to using existing torch binary (#24892)
Signed-off-by: Richard Zou <zou3519@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-16 02:25:50 +00:00
2942970d44 [Metrics] Hide deprecated metrics with gpu_ prefix (#24245)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-09-15 20:15:57 -06:00
3c96e7b8a1 [CI] Small Accuracy Eval Test for Deepseek Model (#24259)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-15 20:14:50 -06:00
b42566f440 [Bug] Fix is_flashmla_supported Check Error (#24774)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-15 20:10:55 -06:00
d96e11167d Add pytest-cov and .coveragerc (#24778)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-09-15 20:08:46 -06:00
2891603efd [ROCm][Bugfix] Fix the case where there's bias (#24895)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-09-15 20:05:12 -06:00
de2cc3d867 [Deprecation] Remove DeepGEMM Old Symbol Wrapper (#24902)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-15 20:03:29 -06:00
e95084308b Updated CODEOWNERS for flashinfer, mla, fused_moe (#24906)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-16 02:01:28 +00:00
7f6f2c1182 HuggingFace -> Hugging Face in Integration with Hugging Face docs (#24889) 2025-09-15 17:28:35 -07:00
5bcc153d7b [Compile] Fix noop_elimination pass and add tests for noop_elimination (#24880)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-09-15 23:33:18 +00:00
45bfa49cb8 [Tests] fix initialization of kv hash in tests (#24273)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
2025-09-15 21:48:27 +00:00
fd2f10546c [ci] fix wheel names for arm wheels (#24898)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-15 14:39:08 -07:00
e757a629e7 [Bug] Fix Cutlass Scaled MM Compilation Error (#24887)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-09-15 17:21:17 -04:00
aae725af7c [Performance] Remove redundant clone() calls in cutlass_mla (#24891) 2025-09-15 20:21:53 +00:00
73df49ef3a [gpt-oss][1a] create_responses stream outputs BaseModel type, api server is SSE still (#24759)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-15 13:08:08 -07:00
25aba2b6a3 [gpt-oss] Add IncompleteDetails to ResponsesRepsonse (#24561)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-09-15 13:07:55 -07:00
94b03f88dd Bump Flashinfer to 0.3.1 (#24868)
Signed-off-by: bbartels <benjamin@bartels.dev>
2025-09-15 12:45:55 -07:00
49bfc538e4 Update num_tokens_across_dp to use nccl instead of gloo (#24105)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-09-15 19:05:48 +00:00
a0b26701c9 [Transform] Deterministic Hadacore Transforms (#24106)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-09-15 12:59:31 -06:00
c4afdb69cc Move MultiModalConfig from config/__init__.py to config/multimodal.py (#24659)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-15 17:43:16 +00:00
b834b4cbf1 [USAGE] Improve error handling for weight initialization in Unquantized… (#20321)
Signed-off-by: Rafael Marcelino Koike <rafael.koike@oracle.com>
Signed-off-by: Rafael Koike <koike.rafael@gmail.com>
2025-09-15 16:45:49 +00:00
740f0647b1 Reinstate existing torch script (#24729)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-15 09:43:40 -07:00
01413e0cf5 Fp8 paged attention update (#22222)
Signed-off-by: Xiao Yu <xiao.yu@amd.com>
Signed-off-by: xiao-llm <xiao.yu.dc@outlook.com>
Co-authored-by: Xiao Yu <xiao.yu@metamaterial.com>
Co-authored-by: Xiao Yu <xiao.yu@amd.com>
Co-authored-by: Bowen Bao <bowenbao@amd.com>
2025-09-15 10:43:26 -04:00
0e219cd50b [Bugfix] Fix GLM4.1V multimodal processor with compatability for Transformers v4.56 (#24822)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-15 20:45:06 +08:00
72c99f2a75 [Model]: support Ling2.0 (#24627)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-15 05:09:30 -07:00
bf214ca226 [Misc] Fix examples openai_pooling_client.py (#24853)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-09-15 11:57:30 +00:00
2e41f5abca [XPU] Set consistent default KV cache layout (#24745)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-15 18:09:34 +08:00
bc0f6059a2 [UT] enhance free kv cache block queue popleft_n (#24220)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-09-15 10:04:37 +00:00
8de261b04a [P/D]kv_output_aggregator support P TP > D TP (#23917)
Signed-off-by: LCAIZJ <leichao139636@163.com>
Co-authored-by: leichao.lc <leichao.lc@antgroup.com>
2025-09-15 11:36:06 +02:00
a0d8b9738d [Misc] Own KVConnectors installation (#24867)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-09-15 02:21:09 -07:00
59e17dd4a0 [Misc] rename interval to max_recent_requests (#24229)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-09-15 09:18:42 +00:00
4979eb79da [Doc]: fix typos in various files (#24821)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
2025-09-15 01:08:52 -07:00
a8c0f59973 [Bugfix] MiDashengLM model contact error under concurrent testing (#24738)
Signed-off-by: chenbing8 <chenbing8@xiaomi.com>
Signed-off-by: bingchen-mi <chenbing8@xiaomi.com>
2025-09-15 06:38:12 +00:00
f4a948f33f [Frontend] Skip stop in reasoning content (#14550)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-09-15 06:04:55 +00:00
3f3313981c [kv cache] update num_free_blocks in the end (#24228)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-09-15 05:15:12 +00:00
78818dd1b0 [Docs] Have a try to improve frameworks/streamlit.md (#24841)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-09-14 21:50:36 -07:00
8e5cdcda4e [Hybrid Allocator] Support Pipeline Parallel (#23974)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-14 15:55:17 -07:00
90f3f7d73e [Spec Decoding]Support Spec Decoding Metrics in DP Mode (#24049)
Signed-off-by: wuhang <wuhang6@huawei.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-14 21:11:09 +00:00
6dc8da5dc1 [Chore] Remove ipex_ops warning (#24835)
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-14 19:41:53 +00:00
79cbcab871 Force use C++17 globally to avoid compilation error (#24823)
Signed-off-by: chenfengjin <1871653365@qq.com>
2025-09-14 19:30:10 +00:00
ff68035932 [Benchmarks] Throw usage error when using dataset-name random and dataset-path together (#24819)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-09-14 17:50:01 +00:00
1177dd53e9 fix type of sampling rate for encode_base64 (#24826)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
2025-09-14 16:17:16 +00:00
fc2dbcda8b [Perf] Fix DeepGEMM Contiguous Layout Issue, 5.5% Throughput Improvement (#24783)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-14 11:20:17 -04:00
fec347dee1 [Misc] Improve s3_utils type hints with BaseClient (#24825)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-09-14 12:11:14 +00:00
cc3173ae98 [Multi Modal][Performance] Fused Q,K's apply_rope into one (#24511)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-09-14 08:10:21 +00:00
3e903b6cb4 [Chore] Minor simplification for non-PP path (#24810)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-13 17:41:36 -07:00
973c9d01da [Minor] Simplify duplicative device check for cuda (#24793)
Signed-off-by: Ziliang Peng <ziliangdotme@gmail.com>
2025-09-13 18:28:38 +00:00
2191 changed files with 221819 additions and 182388 deletions

View File

@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 def print_top_10_largest_files(zip_file):
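A rough sketch of how a size gate built on this limit can be used; the helper name and messages below are illustrative, not necessarily the repository's actual script:

import os

# 500 MiB default after this change; 800 MiB is the PyPI project quota.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))

def check_wheel_size(wheel_path: str) -> int:
    # Fail the build if the wheel exceeds the configured limit.
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.1f} MiB, over the {VLLM_MAX_SIZE_MB} MiB limit")
        return 1
    print(f"{wheel_path} is {size_mb:.1f} MiB, within the limit")
    return 0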

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5
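Baseline YAMLs like this one are consumed by the lm-eval-harness correctness test shown further below (RTOL = 0.08). A minimal sketch of that comparison, assuming a hypothetical filename and made-up measured numbers:

import yaml

RTOL = 0.08  # relative tolerance used by the CI test later in this diff

with open("QQQ-Llama-3-8b-g128.yaml") as f:  # hypothetical path
    eval_config = yaml.safe_load(f)

measured = {"exact_match,strict-match": 0.42, "exact_match,flexible-extract": 0.41}  # made up

for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        name, expected = metric["name"], metric["value"]
        assert measured[name] >= expected * (1 - RTOL), (
            f"{task['name']}/{name}: {measured[name]} below baseline {expected}"
        )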

View File

@@ -0,0 +1,11 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.90
limit: 100
num_fewshot: 0

View File

@@ -0,0 +1,11 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"

View File

@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

View File

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT
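For reference, the same ChartQA baseline can be computed from Python using lm_eval.simple_evaluate, the API the CI test below relies on; the model, tensor-parallel size, and limit here simply mirror the Qwen2.5-VL-7B config above and are otherwise illustrative:

import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1",
    tasks=["chartqa"],
    num_fewshot=0,
    limit=2500,
    batch_size="auto",
    apply_chat_template=True,
)
print(results["results"]["chartqa"])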

View File

View File

@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

View File

@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
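An illustrative call into the updated helper: the backend and batch_size keys are the ones this change introduces, and the values mirror the Qwen2.5-VL-7B ChartQA baseline added above.

eval_config = {  # mirrors Qwen2.5-VL-7B-Instruct.yaml
    "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
    "backend": "vllm-vlm",   # new key: selects the lm-eval model backend
    "batch_size": "auto",    # new key: overrides the default batch size
    "tasks": [
        {"name": "chartqa",
         "metrics": [{"name": "relaxed_accuracy,none", "value": 0.855}]},
    ],
    "num_fewshot": 0,
    "limit": 2500,
}

results = launch_lm_eval(eval_config, tp_size=1)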

View File

@@ -8,7 +8,7 @@ This benchmark aims to:
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 ## Setup

View File

@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
     )
     # get markdown tables


@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
server_command="vllm serve $model \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
server_command="vllm serve $model \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi


@ -365,8 +365,7 @@ run_serving_tests() {
continue
fi
server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
server_command="$server_envs vllm serve \
$server_args"
# run the server
@ -455,11 +454,6 @@ main() {
fi
check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)


@ -1,46 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true


@ -1,24 +1,22 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 12.8 wheel"
key: block-build-cu128-wheel
- label: "Build wheel - CUDA 12.8"
depends_on: block-build-cu128-wheel
depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@ -30,12 +28,8 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 12.6 wheel"
key: block-build-cu126-wheel
depends_on: ~
- label: "Build wheel - CUDA 12.6"
depends_on: block-build-cu126-wheel
depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@ -54,7 +48,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@ -82,7 +76,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
@ -102,8 +96,6 @@ steps:
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@ -158,11 +150,16 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@ -171,3 +168,4 @@ steps:
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"


@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF


@ -8,20 +8,41 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub token from environment
# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
if [ -z "$DOCKERHUB_USERNAME" ]; then
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
exit 1
fi
# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
"https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
echo "Error: Failed to get DockerHub bearer token"
exit 1
fi
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
set +x
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@ -43,7 +64,9 @@ delete_tag() {
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"


@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py


@ -25,25 +25,28 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
timeout 120m bash -c cpu_tests


@ -58,15 +58,11 @@ function cpu_tests() {
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1
pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"


@ -0,0 +1,191 @@
#!/bin/bash
# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi
# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
# The trap set above will be overwritten later; since the cleanup it was responsible for is already
# done at this point, delete the temporary resources manually here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}
# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope
WORKDIR /workspace/vllm
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt
COPY . .
# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton
# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True
WORKDIR /workspace/vllm-ascend
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}
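# Worked example (illustrative agent name, not from this diff): for BUILDKITE_AGENT_NAME="atlas-a2-001-2-4cards",
# the regex yields index=2 and cards_num=4, so dev_idx runs over (2-1)*4+0 .. (2-1)*4+3 and the function prints
# "--device /dev/davinci4 --device /dev/davinci5 --device /dev/davinci6 --device /dev/davinci7".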
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'


@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info


@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info


@ -35,16 +35,14 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_utils.py
pytest -v -s v1/test_metrics_reader.py
'


@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json


@ -0,0 +1,59 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"


@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128


@ -42,7 +42,7 @@ echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

.buildkite/test-amd.yaml (new file, 1267 lines)

File diff suppressed because it is too large


@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belong to an existing group, add it there
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
@ -46,25 +50,28 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- pytest -v -s transformers_utils # transformers_utils
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s transformers_utils
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@ -84,25 +91,12 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- label: Entrypoints Unit Tests # 5min
timeout_in_minutes: 10
@ -127,10 +121,9 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
@ -168,7 +161,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@ -176,28 +168,34 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/test_internal_lb_dp.py
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
# test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@ -230,16 +228,14 @@ steps:
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/metrics
- tests/v1/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing
- pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
@ -300,23 +296,35 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/metrics
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
- pytest -v -s v1/test_request.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics
- label: Examples Test # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
@ -335,12 +343,13 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@ -389,11 +398,12 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@ -422,8 +432,9 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@ -467,32 +478,22 @@ steps:
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
- vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 14min
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test # 7min
timeout_in_minutes: 20
- label: Model Executor Test # 23min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Benchmarks # 11min
timeout_in_minutes: 20
@ -526,8 +527,9 @@ steps:
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@ -548,15 +550,6 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 12min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
@ -564,10 +557,17 @@ steps:
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/mistral_tool_use
commands:
- pytest -v -s tool_use
- pytest -v -s mistral_tool_use
- pytest -v -s -m 'not cpu_test' tool_use
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use
##### models test #####
@ -607,13 +607,19 @@ steps:
- vllm/
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
no_gpu: true
commands:
- pytest -v -s models/test_transformers.py \
models/test_registry.py \
models/test_utils.py \
models/test_vision.py
- pytest -v -s models/test_utils.py models/test_vision.py
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
@ -728,6 +734,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
optional: true
@ -783,11 +799,13 @@ steps:
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
@ -817,17 +835,60 @@ steps:
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/models/llama4.py
- vllm/model_executor/layers/fused_moe
- vllm/model_executor/layers/quantization/compressed_tensors
- vllm/model_executor/layers/quantization/modelopt.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@ -871,47 +932,58 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 110min
timeout_in_minutes: 150
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- tests/v1/distributed
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Model Tests (2 GPUs) # 37min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
@ -954,7 +1026,6 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
@ -1028,9 +1099,36 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: Qwen MoE EP Test # optional
##### H200 test #####
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
##### B200 test #####
- label: Distributed Tests (B200) # optional
gpu: b200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh

.coveragerc (new file, 47 lines)

@ -0,0 +1,47 @@
[run]
# Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
vllm
*/dist-packages/vllm
*/site-packages/vllm
omit =
*/tests/*
*/test_*
*/__pycache__/*
*/build/*
*/dist/*
*/vllm.egg-info/*
*/third_party/*
*/examples/*
*/benchmarks/*
*/docs/*
[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
vllm
/vllm-workspace/src/vllm
/vllm-workspace/vllm
*/site-packages/vllm
*/dist-packages/vllm
[report]
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if settings.DEBUG
raise AssertionError
raise NotImplementedError
if 0:
if __name__ == .__main__.:
class .*\bProtocol\):
@(abc\.)?abstractmethod
[html]
directory = htmlcov
[xml]
output = coverage.xml
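As a hedged illustration of how this configuration is typically exercised (the commands below are standard coverage.py CLI usage and a sample test path from this pipeline, not something added by the diff):

# Illustrative sketch: collect per-process coverage data, then merge and report it.
python -m coverage run -p -m pytest tests/v1/test_utils.py  # -p writes .coverage.<host>.<pid> data files
python -m coverage combine   # merges the data files, remapping paths via the [paths] section above
python -m coverage report    # terminal summary, honoring the [report] exclude_lines rules
python -m coverage xml       # writes coverage.xml as configured in the [xml] section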

.git-blame-ignore-revs (new file, 4 lines)

@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS (vendored, 62 lines changed)

@ -2,72 +2,85 @@
# for more info about CODEOWNERS file
# These lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @heheda12345
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @heheda12345
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
# Docs
/docs @hmellor
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor
# CPU
/vllm/v1/worker/^cpu @bigPYJ1151
/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
/vllm/v1/worker/^xpu @jikunshang
/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
@ -101,4 +114,15 @@ mkdocs.yaml @hmellor
/vllm/v1/worker/tpu* @NickLucche
/vllm/platforms/tpu.py @NickLucche
/vllm/v1/sample/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche
# KVConnector installation files
/requirements/kv_connectors.txt @NickLucche
# Pooling models
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop

View File

@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:

.github/mergify.yml vendored

@ -2,6 +2,7 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@ -10,10 +11,13 @@ pull_request_rules:
label:
add:
- documentation
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@ -30,6 +34,7 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@ -46,6 +51,7 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
- label != stale
- files~=^vllm/entrypoints/
actions:
label:
@ -55,6 +61,7 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
- label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@ -70,6 +77,7 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@ -83,6 +91,7 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
- label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@ -94,6 +103,7 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
- label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@ -107,6 +117,7 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
- label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@ -121,6 +132,7 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
@ -142,6 +154,7 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
- label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@ -162,6 +175,7 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@ -171,7 +185,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@ -181,6 +195,7 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@ -196,6 +211,7 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
- label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@ -208,6 +224,7 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- label != stale
- or:
- files~=tpu.py
- files~=_tpu
@ -223,6 +240,7 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@ -237,9 +255,9 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@ -256,8 +274,9 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
- -closed
- label != stale
- conflict
- -closed
actions:
label:
add:
@ -271,10 +290,12 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
- files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:
users:
@ -282,6 +303,7 @@ pull_request_rules:
- name: assign reviewer for modelopt changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -296,9 +318,27 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
- -conflict
- -closed
actions:
label:
remove:
- needs-rebase
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
- files~=^examples/others/lmcache/
- files~=^tests/v1/kv_connector/
- files~=^vllm/distributed/kv_transfer/
- title~=(?i)\bP/?D\b
- title~=(?i)NIXL
- title~=(?i)LMCache
actions:
label:
add:
- kv-connector

View File

@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
@ -42,7 +43,6 @@ jobs:
searchIn: "body"
},
],
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
@ -89,14 +89,12 @@ jobs:
term: "hip_",
searchIn: "both"
},
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
// Regex patterns - for complex pattern matching
regexPatterns: [
{
@ -107,13 +105,17 @@ jobs:
}
],
},
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
};
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`);
}
}
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
@ -146,21 +145,17 @@ jobs:
description = termConfig.description;
flags = termConfig.flags;
}
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
const termMatches = [];
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
});
}
}
return matches;
}
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
}
});
}
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`);
return false;
}
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
// Process all configured labels
const processLabels = Object.entries(labelConfig)
.map(([labelName, config]) => processLabel(labelName, config));
const labelsAdded = await Promise.all(processLabels);
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const labelsAddedResults = await Promise.all(
Object.entries(labelConfig).map(([labelName, config]) =>
processLabel(labelName, config).then(added => ({ labelName, added }))
)
);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}

View File

@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

View File

@ -6,30 +6,19 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
rev: v0.14.0
hooks:
- id: ruff
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.35.5
rev: v1.38.1
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -46,10 +35,10 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17
rev: 0.9.1
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@ -60,38 +49,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
<<: &mypy_common
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.10"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.11"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.13
entry: python tools/pre_commit/mypy.py 1 "3.13"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@ -155,18 +138,15 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
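
A hedged example of exercising these hooks locally (assumes `pre-commit` is installed; the hook ids come from the config above):

```bash
pre-commit install                                        # run the default stage on every commit
pre-commit run --all-files                                # pre-commit stage hooks, e.g. ruff-check, typos
pre-commit run mypy-3.12 --all-files --hook-stage manual  # CI-only hook, invoked on demand
```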

View File

@ -13,6 +13,7 @@ build:
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:

View File

@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()
#
# Set CUDA include flags for CXX compiler.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
endif()
endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -243,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
"csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -288,14 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu"
"csrc/quantization/fp8/per_token_group_quant.cu")
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -399,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -427,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -457,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -493,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -537,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -556,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@ -578,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -605,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -623,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -644,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -663,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -779,6 +835,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# Hadacore kernels
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${HADACORE_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building hadacore")
endif()
# if CUDA endif
endif()
@ -940,6 +1007,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
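
To illustrate how the `VLLM_TARGET_DEVICE` knob above is consumed, a sketch under the assumption of a standard CUDA toolchain (the exact build commands are not part of this diff):

```bash
# Direct CMake configure/build with an explicit target device:
cmake -S . -B build -DVLLM_TARGET_DEVICE=cuda -DCMAKE_BUILD_TYPE=Release
cmake --build build -j
# Or via the Python build, which is expected to forward the same setting (per the comment above):
VLLM_TARGET_DEVICE=cuda pip install -e .
```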

View File

@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@ -148,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
- Volcengine
Slack Sponsor: Anyscale

View File

@ -1,874 +1,20 @@
# Benchmarking vLLM
# Benchmarks
This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
## Dataset Overview
## Contents
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:10%; text-align: center;">Online</th>
<th style="width:10%; text-align: center;">Offline</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>ShareGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>ShareGPT4V (Image)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
<br>
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
</td>
</tr>
<tr>
<td><strong>ShareGPT4Video (Video)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
</td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
</tr>
<tr>
<td><strong>Sonnet (deprecated)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
</tr>
<tr>
<td><strong>Random</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>RandomMultiModal (Image/Video)</strong></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🚧</td>
<td><code>synthetic</code> </td>
</tr>
<tr>
<td><strong>Prefix Repetition</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>HuggingFace-MTBench</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>philschmid/mt-bench</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Blazedit</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
</tr>
<tr>
<td><strong>Spec Bench</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>
- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
✅: supported
## Usage
🟡: Partial support
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
🚧: to be supported
For full CLI reference see:
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
```
## 🚀 Example - Online Benchmark
<details>
<summary>Show more</summary>
<br/>
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
```
If successful, you will see the following output
```text
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
### Custom Dataset
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format with a "prompt" field per entry, e.g., data.jsonl
```json
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
```
```bash
# run benchmarking script
vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Spec Bench Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
Run all categories:
``` bash
# Download the dataset using:
# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1
```
Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
Run only a specific category like "summarization":
``` bash
vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name spec_bench \
--dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
--num-prompts -1 \
--spec-bench-category "summarization"
```
### Other HuggingFaceDataset Examples
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
`philschmid/mt-bench`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
``` bash
vllm bench serve \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path vdaita/edit_5k_char \
--num-prompts 90 \
--blazedit-min-distance 0.01 \
--blazedit-max-distance 0.99
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
```
### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up (see the example after this list):
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
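
A hedged example combining these flags with the ShareGPT setup from the online benchmark above (the rate values are arbitrary):

```bash
vllm bench serve \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 200 \
--ramp-up-strategy linear \
--ramp-up-start-rps 1 \
--ramp-up-end-rps 10
```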
</details>
## 📈 Example - Offline Throughput Benchmark
<details>
<summary>Show more</summary>
<br/>
```bash
vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
```
If successful, you will see the following output
```text
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
### VisionArena Benchmark for Vision Language Models
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
```
The `num prompt tokens` now includes image token counts
```text
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
vllm bench throughput \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```text
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
`AI-MO/aimo-validation-aime`:
```bash
vllm bench throughput \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
Benchmark with LoRA adapters:
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
</details>
## 🛠️ Example - Structured Output Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
## 📚 Example - Long Document QA Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
## 🗂️ Example - Prefix Caching Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
### Prefix Repetition Dataset
```bash
vllm bench serve \
--backend openai \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-name prefix_repetition \
--num-prompts 100 \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
--prefix-repetition-output-len 128
```
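The prefix-repetition dataset synthesizes prompts that share a small pool of long prefixes, which is what exercises the prefix cache. A rough sketch of the prompt structure implied by the flags above (character lengths stand in for token lengths; this is not the dataset's actual generator):
```python
import random
import string

def make_prompts(num_prompts: int, num_prefixes: int,
                 prefix_len: int, suffix_len: int) -> list[str]:
    rand = lambda n: "".join(random.choices(string.ascii_lowercase + " ", k=n))
    # A small pool of shared prefixes; each prompt reuses one of them.
    prefixes = [rand(prefix_len) for _ in range(num_prefixes)]
    return [prefixes[i % num_prefixes] + rand(suffix_len) for i in range(num_prompts)]

# Mirrors the flags above: 100 prompts, 5 shared prefixes, 512-long prefix, 128-long suffix.
prompts = make_prompts(num_prompts=100, num_prefixes=5, prefix_len=512, suffix_len=128)
```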
</details>
## ⚡ Example - Request Prioritization Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>
## 👁️ Example - Multi-Modal Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of multi-modal requests in vLLM.
### Images (ShareGPT4V)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
```
Send requests with images:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
    --endpoint /v1/chat/completions
```
### Videos (ShareGPT4Video)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
```
Send requests with videos:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
    --endpoint /v1/chat/completions
```
### Synthetic Random Images (random-mm)
Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
Start the server (example):
```bash
vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
--dtype bfloat16 \
--max-model-len 16384 \
--limit-mm-per-prompt '{"image": 3, "video": 0}' \
--mm-processor-kwargs max_pixels=1003520
```
Run the benchmark. Passing `--ignore-eos` is recommended so that responses run to the requested length, which better simulates real workloads; control the output size with `--random-output-len`.
Example 1: a fixed number of items and a single image resolution, generating approximately 40 output tokens per request:
```bash
vllm bench serve \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-3B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name random-mm \
--num-prompts 100 \
--max-concurrency 10 \
--random-prefix-len 25 \
--random-input-len 300 \
--random-output-len 40 \
--random-range-ratio 0.2 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(224, 224, 1): 1.0}' \
--request-rate inf \
--ignore-eos \
--seed 42
```
The number of items per request can be varied with `--random-mm-num-mm-items-range-ratio`, and multiple image buckets can be mixed to sample different resolutions:
```bash
--random-mm-base-items-per-request 2 \
--random-mm-num-mm-items-range-ratio 0.5 \
--random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
```
Flags specific to `random-mm`:
- `--random-mm-base-items-per-request`: base number of multimodal items per request.
- `--random-mm-num-mm-items-range-ratio`: vary the item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep the count fixed; r=1 allows 0 items.
- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
Behavioral notes:
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
How sampling works (a minimal sketch follows this list):
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
This is an edge case; it can be avoided by setting `--random-mm-limit-mm-per-prompt` to large values, but note that doing so may cause request errors if the server's `--limit-mm-per-prompt` limit is exceeded.
- With the OpenAI Chat backend, prompts remain plain text and the synthetic image data is attached via `multi_modal_data` (OpenAI Chat format).
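A minimal sketch of the sampling loop described above (illustrative only; the real implementation lives in the benchmark's dataset code and handles more bookkeeping):
```python
import math
import random

def modality(bucket):
    # Bucket is (H, W, T): T == 1 is an image bucket, T > 1 would be video.
    return "image" if bucket[2] == 1 else "video"

def sample_mm_items(base_items, range_ratio, bucket_probs, limits):
    # Item count k is drawn from [floor(n*(1-r)), ceil(n*(1+r))],
    # then clamped to the sum of per-modality limits.
    lo = math.floor(base_items * (1 - range_ratio))
    hi = math.ceil(base_items * (1 + range_ratio))
    k = min(random.randint(lo, hi), sum(limits.values()))

    counts = {m: 0 for m in limits}
    chosen = []
    for _ in range(k):
        # Exclude buckets whose modality hit its limit; sample among the rest
        # with weights proportional to their (re)normalized probabilities.
        active = {b: p for b, p in bucket_probs.items()
                  if counts.get(modality(b), 0) < limits.get(modality(b), 0)}
        if not active:
            break
        bucket = random.choices(list(active), weights=list(active.values()))[0]
        counts[modality(bucket)] = counts.get(modality(bucket), 0) + 1
        chosen.append(bucket)
    return chosen

# Example: mirrors the bucket config shown earlier.
print(sample_mm_items(2, 0.5, {(256, 256, 1): 0.7, (720, 1280, 1): 0.3},
                      {"image": 4, "video": 0}))
```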
</details>
- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>

View File

@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
## Batched `auto_tune`
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
### Prerequisites
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
### How to Run
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
2. **Execute the script**:
```bash
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
```
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
### Configuration File
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)) and are converted to uppercase environment variables for each run (for example, `max_model_len: 2300` is passed as `MAX_MODEL_LEN=2300`).
Here is an example `runs_config.json` with two benchmark configurations (`system` may be `"TPU"` or `"GPU"`):
```json
[
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 128,
"output_len": 2048,
"max_model_len": 2300,
"num_seqs_list": "128 256",
"num_batched_tokens_list": "8192 16384"
},
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-70B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 4000,
"output_len": 16,
"max_model_len": 4096,
"num_seqs_list": "64 128",
"num_batched_tokens_list": "4096 8192",
"max_latency_allowed_ms": 500
}
]
```
### Output
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
A summary of successful and failed runs is also printed to the console upon completion.
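For illustration, a successful entry in the updated file might look like the following (field values are made up):
```json
{
  "base": "/home/user",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "system": "TPU",
  "tp": 8,
  "input_len": 128,
  "output_len": 2048,
  "max_model_len": 2300,
  "num_seqs_list": "128 256",
  "num_batched_tokens_list": "8192 16384",
  "run_id": "20250101_120000",
  "status": "SUCCESS",
  "results": "best_max_num_seqs: 256, best_num_batched_tokens: 16384, best_throughput: ...",
  "gcs_results": "gs://my-bucket/benchmark-results/20250101_120000"
}
```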

View File

@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
pkill -if vllm
pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@ -96,17 +96,22 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
# Check whether the server process is still alive; we should always
# have permission to send a signal to our own server process.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -118,7 +123,7 @@ start_server() {
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
@ -134,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -if vllm
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@ -227,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if vllm
pkill -if "vllm serve" || true
sleep 10
echo "===================="
return 0
@ -303,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -0,0 +1,128 @@
#!/bin/bash
INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
if [[ -z "$INPUT_JSON" ]]; then
echo "Error: Input JSON file not provided."
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
exit 1
fi
if [[ ! -f "$INPUT_JSON" ]]; then
echo "Error: File not found at '$INPUT_JSON'"
exit 1
fi
if ! command -v jq &> /dev/null; then
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
exit 1
fi
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
exit 1
fi
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)
json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
exit 1
fi
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"
for i in $(seq 0 $(($num_runs - 1))); do
run_object=$(echo "$json_content" | jq ".[$i]")
RUN_START_TIME=$(date +%s)
ENV_VARS_ARRAY=()
# Dynamically create env vars from the JSON object's keys
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
value=$(echo "$run_object" | jq -r ".$key")
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
ENV_VARS_ARRAY+=("${var_name}=${value}")
done
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
# Execute auto_tune.sh and capture output
RUN_OUTPUT_FILE=$(mktemp)
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
STATUS="SUCCESS"
((SUCCESS_COUNT++))
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
rm "$RUN_OUTPUT_FILE"
# Parse results and optionally upload them to GCS
RUN_ID=""
RESULTS=""
GCS_RESULTS_URL=""
if [[ "$STATUS" == "SUCCESS" ]]; then
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
RESULTS=$(cat "$RESULT_FILE_PATH")
if [[ -n "$GCS_PATH" ]]; then
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
echo "Uploading results to GCS..."
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
echo "GCS upload successful."
else
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
fi
fi
else
echo "Warning: Could not find result file for a successful run."
STATUS="WARNING_NO_RESULT_FILE"
fi
fi
# Add the results back into the JSON object for this run
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
RUN_END_TIME=$(date +%s)
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
echo "--------------------------------------------------"
# Save intermediate progress back to the file
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
done
SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="
if [[ $FAILURE_COUNT -gt 0 ]]; then
echo "Details of failed runs (see JSON file for full parameters):"
for failed in "${FAILED_RUNS[@]}"; do
echo " - $failed"
done
fi
echo "Updated results have been saved to '$INPUT_JSON'."

View File

@ -8,7 +8,6 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: Optional[str] = None
request_id: Optional[str] = None
language: str | None = None
request_id: str | None = None
@dataclass
@ -52,7 +51,7 @@ class RequestFuncOutput:
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -133,7 +132,7 @@ async def async_request_tgi(
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -204,7 +203,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -367,7 +366,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):

View File

@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from unittest import mock
import numpy as np
from benchmark_utils import TimeCollector
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.config import (
CacheConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
def main(args):
def benchmark_propose(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
@ -69,10 +83,88 @@ def main(args):
)
def benchmark_batched_propose(args):
NUM_SPECULATIVE_TOKENS_NGRAM = 10
PROMPT_LOOKUP_MIN = 5
PROMPT_LOOKUP_MAX = 15
MAX_MODEL_LEN = int(1e7)
DEVICE = current_platform.device_type
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
method="ngram",
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
prompt_lookup_max=PROMPT_LOOKUP_MAX,
prompt_lookup_min=PROMPT_LOOKUP_MIN,
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig(),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = 1
with mock.patch(
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
):
runner = GPUModelRunner(vllm_config, DEVICE)
# hack max model len
runner.max_model_len = MAX_MODEL_LEN
runner.drafter.max_model_len = MAX_MODEL_LEN
dummy_input_batch = InputBatch(
max_num_reqs=args.num_req,
max_model_len=MAX_MODEL_LEN,
max_num_batched_tokens=args.num_req * args.num_token,
device=DEVICE,
pin_memory=False,
vocab_size=256000,
block_sizes=[16],
)
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
dummy_input_batch.spec_decode_unsupported_reqs = ()
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
dummy_input_batch.token_ids_cpu = np.random.randint(
0, 20, (args.num_req, args.num_token)
)
runner.input_batch = dummy_input_batch
sampled_token_ids = [[0]] * args.num_req
print("Starting benchmark")
# first run is warmup so ignore it
for _ in range(args.num_iteration):
start = time.time()
runner.drafter.propose(
sampled_token_ids,
dummy_input_batch.req_ids,
dummy_input_batch.num_tokens_no_spec,
dummy_input_batch.token_ids_cpu,
dummy_input_batch.spec_decode_unsupported_reqs,
)
end = time.time()
print(f"Iteration time (s): {end - start}")
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
)
parser.add_argument(
"--num-iteration",
type=int,
@ -105,8 +197,17 @@ def invoke_main() -> None:
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
main(args)
if not args.batched:
benchmark_propose(args)
else:
benchmark_batched_propose(args)
"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
""" # noqa: E501
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import PreTrainedTokenizerBase
@ -80,7 +79,7 @@ def sample_requests_from_dataset(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -128,7 +127,7 @@ def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
prefix_len: int,
) -> list[Request]:
requests = []

View File

@ -7,7 +7,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -24,7 +23,7 @@ def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

View File

@ -31,20 +31,19 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Optional
import datasets
import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
@ -317,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@ -437,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
max_concurrency: Optional[int],
max_concurrency: int | None,
structured_output_ratio: float,
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -449,7 +448,8 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
extra_body[request.structure_type] = request.schema
extra_body["structured_outputs"] = {}
extra_body["structured_outputs"][request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")
@ -502,15 +502,9 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)
@ -696,11 +690,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == "guided_json":
if args.structure_type == "json":
return _eval_correctness_json(expected, actual)
elif args.structure_type == "guided_regex":
elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual)
elif args.structure_type == "guided_choice":
elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@ -780,18 +774,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
args.structure_type = "guided_grammar"
args.structure_type = "grammar"
elif args.dataset == "regex":
args.structure_type = "guided_regex"
args.structure_type = "regex"
elif args.dataset == "choice":
args.structure_type = "guided_choice"
args.structure_type = "choice"
else:
args.structure_type = "guided_json"
args.structure_type = "json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
result_file_name = f"{args.structured_output_ratio}guided"
result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
@ -909,13 +903,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",

View File

@ -6,7 +6,7 @@ import math
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any
def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self._max: int | None = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else:
self._max = max(self._max, v)
def avg(self) -> Union[float, str]:
def avg(self) -> float | str:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> Union[float, str]:
def max(self) -> float | str:
return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[Union[float, str]]:
def dump_avg_max(self) -> list[float | str]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
self.collect(time.monotonic_ns() - self.start_time)

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark
@ -17,7 +16,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul,
w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@ -53,7 +52,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
@ -108,7 +107,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
@ -158,7 +157,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@ -183,7 +182,7 @@ def bench(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:

View File

@ -55,9 +55,7 @@ benchmark() {
output_len=$2
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@ -65,9 +63,7 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

View File

@ -38,16 +38,12 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@ -62,18 +58,14 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

View File

@ -3,10 +3,9 @@
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
out, _ = ops.rms_norm_dynamic_per_token_quant(

View File

@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"mxfp4": dict(no_a_quant=False, enabled=True),
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_mxfp4(
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
b, forward_hadamard_matrix, method="abs_max"
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
return weight_hf_e2m1, weight_hf_scale_block
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
b, forward_hadamard_matrix, device
)
alpha = torch.tensor([1.0], device="cuda")
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
def run():
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs MXFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_mxfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_mxfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -3,6 +3,7 @@
import argparse
import copy
import itertools
import os
import torch
from weight_shapes import WEIGHT_SHAPES
@ -23,21 +24,45 @@ PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
}
_needs_fbgemm = any(
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
)
if _needs_fbgemm:
try:
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
triton_scale_nvfp4_quant,
)
except ImportError:
print(
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
"These providers will be skipped. Please install fbgemm_gpu with: "
"'pip install fbgemm-gpu-genai' to run them."
)
# Disable FBGEMM providers so the benchmark can run.
for cfg in PROVIDER_CFGS.values():
if cfg.get("fbgemm"):
cfg["enabled"] = False
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
if "fbgemm" in cfg and cfg["fbgemm"]:
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
else:
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
if "fbgemm" in cfg and cfg["fbgemm"]:
if cfg["no_a_quant"]:
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
def run():
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
else:
def run():
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
if cfg["no_a_quant"]:
# Pre-quantize activation
@ -130,10 +184,13 @@ if __name__ == "__main__":
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
os.makedirs(save_dir, exist_ok=True)
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
save_path=save_dir,
N=N,
K=K,
)

View File

@ -0,0 +1,207 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_nvfp4(
b: torch.Tensor,
forward_hadamard_matrix: torch.Tensor,
global_scale: torch.Tensor,
device: str,
M: int,
N: int,
K: int,
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
b, forward_hadamard_matrix, global_scale
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-1, K // 16
)
return weight_hf_e2m1, weight_hf_scale_block
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
alpha = torch.tensor([1.0], device="cuda")
global_scale = torch.tensor([1.0], device="cuda")
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
b, forward_hadamard_matrix, global_scale, device, M, N, K
)
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
def run():
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs NVFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_nvfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [16, 32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -1,15 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from collections.abc import Callable
from unittest.mock import patch
import pandas as pd
import torch
from vllm import _custom_ops as ops
from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
def with_triton_mode(fn):
"""Temporarily force the Triton fallback path"""
def wrapped(*args, **kwargs):
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
return fn(*args, **kwargs)
return wrapped
# TODO(luka): use standalone_compile utility
@ -21,78 +32,238 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
return inner
torch._dynamo.config.recompile_limit = 8888
compilation_config = CompilationConfig(custom_ops=["none"])
with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
torch_per_token_quant_fp8 = torch.compile(
QuantFP8(False, GroupShape.PER_TOKEN),
fullgraph=True,
dynamic=False, # recompile for different shapes
)
def bench_compile(fn: Callable):
# recompile for different shapes
fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
return with_dyn_arg(fwd, 0, 0)
def cuda_per_token_quant_fp8(
input: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
return ops.scaled_fp8_quant(input)
torch._dynamo.config.recompile_limit = 8888
def calculate_diff(batch_size: int, seq_len: int):
"""Calculate difference between Triton and CUDA implementations."""
def calculate_diff(
batch_size: int,
hidden_size: int,
group_shape: GroupShape,
dtype: torch.dtype,
):
"""Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
torch_out, torch_scale = torch_per_token_quant_fp8(x)
cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
if torch.allclose(
cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
try:
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_eager_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match")
else:
except AssertionError as e:
print("❌ Implementations differ")
print(e)
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
configs = list(itertools.product(batch_size_range, seq_len_range))
configs = []
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size", "seq_len"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda"],
line_names=["Torch", "CUDA"],
styles=[("blue", "-"), ("green", "-")],
ylabel="us",
plot_name="per-token-dynamic-quant-fp8-performance",
args={},
)
)
def benchmark_quantization(batch_size, seq_len, provider):
dtype = torch.float16
def benchmark_quantization(
batch_size,
hidden_size,
provider,
group_shape: GroupShape,
col_major: bool,
dtype: torch.dtype,
):
device = torch.device("cuda")
x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
fn = lambda: torch_per_token_quant_fp8(x.clone())
fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
fn = lambda: cuda_per_token_quant_fp8(x.clone())
fn = lambda: quant_fp8.forward_cuda(x.clone())
elif provider == "triton":
if not group_shape.is_per_group():
# Triton only supported for per-group
return 0, 0, 0
fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
# TODO(luka) extract to utils
def compute_geomean_speedups(
df: pd.DataFrame,
baseline_col: str,
speedup_cols: list[str],
groupby_cols: list[str] | None = None,
) -> pd.DataFrame:
"""
Compute geometric mean speedups over a baseline column.
Args:
df: Input dataframe
baseline_col: Column to use as baseline
speedup_cols: Columns to compute speedups for
groupby_cols: Columns to group by. If None, compute over entire df.
Returns:
pd.DataFrame with geometric mean speedups
"""
from scipy.stats import gmean
def geo_speedup(group: pd.DataFrame) -> pd.Series:
ratios = {
col: (group[baseline_col] / group[col]).values for col in speedup_cols
}
return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
if groupby_cols is None:
result = geo_speedup(df).to_frame().T
else:
result = (
df.groupby(groupby_cols)
.apply(geo_speedup, include_groups=False)
.reset_index()
)
return result
if __name__ == "__main__":
calculate_diff(batch_size=4, seq_len=4096)
benchmark_quantization.run(print_data=True)
parser = FlexibleArgumentParser(
description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
)
parser.add_argument("-c", "--check", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
)
parser.add_argument(
"--hidden-sizes",
type=int,
nargs="+",
default=[896, 1024, 2048, 4096, 7168],
help="Hidden sizes to benchmark",
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 16, 128, 512, 1024],
help="Batch sizes to benchmark",
)
parser.add_argument(
"--group-sizes",
type=int,
nargs="+",
default=None,
help="Group sizes for GroupShape(1,N) to benchmark. "
"Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
)
parser.add_argument(
"--no-column-major",
action="store_true",
help="Disable column-major scales testing",
)
args = parser.parse_args()
assert args
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes
batch_sizes = args.batch_sizes
if args.group_sizes is not None:
group_shapes = []
for size in args.group_sizes:
if size == 0:
group_shapes.append(GroupShape.PER_TENSOR)
elif size == -1:
group_shapes.append(GroupShape.PER_TOKEN)
else:
group_shapes.append(GroupShape(1, size))
else:
group_shapes = [
GroupShape.PER_TENSOR,
GroupShape.PER_TOKEN,
GroupShape(1, 64),
GroupShape(1, 128),
]
column_major_scales = [False] if args.no_column_major else [True, False]
config_gen = itertools.product(
group_shapes,
column_major_scales,
batch_sizes,
hidden_sizes,
)
# filter out column-major scales for non-group, reverse order
configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
print(f"Running {len(configs)} configurations:")
print(f" Hidden sizes: {hidden_sizes}")
print(f" Batch sizes: {batch_sizes}")
print(f" Group shapes: {[str(g) for g in group_shapes]}")
print(f" Column major scales: {column_major_scales}")
print()
if args.check:
for group_shape in group_shapes:
group_size = group_shape[1]
print(f"{group_size=}")
calculate_diff(
batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
)
benchmark = triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda", "triton"],
line_names=["Torch (Compiled)", "CUDA", "Triton"],
styles=[("blue", "-"), ("green", "-"), ("black", "-")],
ylabel="us",
plot_name="QuantFP8 performance",
args={},
)
)(benchmark_quantization)
df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
# Print geomean speedups
geo_table_grouped = compute_geomean_speedups(
df,
baseline_col="Torch (Compiled)",
speedup_cols=["CUDA", "Triton"],
groupby_cols=["col_major", "group_shape"],
)
print("Speedup over Torch (Compiled)")
print(geo_table_grouped.to_string(index=False))

View File

@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@ -140,6 +144,12 @@ def bench_run(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -147,10 +157,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
quant_config=quant_config,
)
def run_cutlass_moe_fp4(
@ -172,25 +179,27 @@ def bench_run(
device: torch.device,
num_repeats: int,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
device=device,
quant_config=quant_config,
)
def run_cutlass_from_graph(
@ -211,26 +220,29 @@ def bench_run(
e: int,
device: torch.device,
):
quant_config = nvfp4_moe_quant_config(
a1_gscale=a1_gs,
a2_gscale=a2_gs,
w1_scale=w1_blockscale,
w2_scale=w2_blockscale,
g1_alphas=w1_gs,
g2_alphas=w2_gs,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
a1_gscale=a1_gs,
w1_fp4=w1_fp4,
w1_blockscale=w1_blockscale,
w1_alphas=w1_alphas,
a2_gscale=a2_gs,
w2_fp4=w2_fp4,
w2_blockscale=w2_blockscale,
w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
device=device,
quant_config=quant_config,
)
def run_triton_from_graph(
@ -246,16 +258,18 @@ def bench_run(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
)
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_fp8_scale,
quant_config=quant_config,
)
def replay_graph(graph, num_repeats):

View File

@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""
import nvtx
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
"mixtral-8x7b": [
[8, 2, 4096, 14336],
],
"deepseek-v2": [
[160, 6, 5120, 12288],
],
"custom-small": [
[8, 2, 2048, 7168],
],
"glm45-fp8": [
[128, 8, 4096, 1408],
],
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
[128, 1, 5120, 8192],
],
}
DEFAULT_MODELS = [
"mixtral-8x7b",
]
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]
FP8_DTYPE = current_platform.fp8_dtype()
def bench_run(
results: list,
model: str,
num_experts: int,
topk: int,
per_act_token: bool,
per_out_ch: bool,
mkn: tuple[int, int, int],
):
(m, k, n) = mkn
dtype = torch.half
device = "cuda"
# Create input activations
a = torch.randn((m, k), device=device, dtype=dtype) / 10
# Create weights
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
# Create FP8 quantized weights and scales for both kernels
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
# Create scales based on quantization strategy
if per_out_ch:
# Per-channel quantization
w1_scale = torch.empty(
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
)
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
else:
# Per-tensor quantization
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
# Quantize weights
for expert in range(num_experts):
if per_out_ch:
# Per-channel quantization - not yet implemented properly
# For now, fall back to per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Expand scalar scales to the expected per-channel shape
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
w2_scale[expert] = w2_scale_temp.expand(k, 1)
else:
# Per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Store scalar scales in [1, 1] tensors
w1_scale[expert, 0, 0] = w1_scale_temp
w2_scale[expert, 0, 0] = w2_scale_temp
# Prepare weights for CUTLASS (no transpose needed)
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
# Create router scores and get topk
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
# Force per-tensor quantization for all cases to match working e2e setup
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
# Force per-tensor quantization for all cases
per_act_token = False
# Create stride tensors for CUTLASS
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
def run_cutlass_moe_fp8(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
cutlass_moe_fp8(
a=a,
w1_q=w1,
w2_q=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
cutlass_moe_fp8(
a=a,
w1_q=w1_fp8q_cutlass,
w2_q=w2_fp8q_cutlass,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
fused_experts(
a,
w1_fp8q,
w2_fp8q,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
# Timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
# Divide by 10 since graph contains 10 calls
return sum(latencies) / (num_iters * 10)
# Benchmark parameters
num_warmup = 5
num_iters = 100
# Benchmark only CUDA graphs (more reliable and faster)
# Benchmark Triton MoE with CUDA graphs
triton_graph_time = bench_cuda_graph(
triton_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Benchmark CUTLASS MoE with CUDA graphs
cutlass_graph_time = bench_cuda_graph(
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Convert ms to us and return results
triton_time_us = triton_graph_time * 1000
cutlass_time_us = cutlass_graph_time * 1000
return {
"batch_size": m,
"triton_time_us": triton_time_us,
"cutlass_time_us": cutlass_time_us,
}
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
all_results = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in args.per_act_token_opts:
for per_out_ch in args.per_out_ch_opts:
print(
f"\n=== {model}, experts={num_experts}, topk={topk},"
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
)
config_results = []
for size_m in args.batch_sizes:
mkn = (size_m, size_k, size_n)
result = bench_run(
[], # Not used anymore
model,
num_experts,
topk,
per_act_token,
per_out_ch,
mkn,
)
if result:
config_results.append(result)
# Print results table for this configuration
if config_results:
print(
f"\n{'Batch Size':<12}"
f"{'Triton (us)':<15}"
f"{'CUTLASS (us)':<15}"
)
print("-" * 45)
for result in config_results:
print(
f"{result['batch_size']:<12}"
f"{result['triton_time_us']:<15.2f}"
f"{result['cutlass_time_us']:<15.2f}"
)
all_results.extend(config_results)
print(f"\nTotal benchmarks completed: {len(all_results)}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
across specified models/shapes/batches
Example usage:
python benchmark_cutlass_moe_fp8.py \
--model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
--tp-sizes 8 \
--batch-size 2 4 8 \
--per-act-token-opts false \
--per-out-ch-opts false
"""
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
parser.add_argument(
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument(
"--per-act-token-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-activation token quantization options (true/false)",
)
parser.add_argument(
"--per-out-ch-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-output channel quantization options (true/false)",
)
args = parser.parse_args()
main(args)

View File

@ -7,6 +7,10 @@ Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).
For NCCL symmetric memory you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1; otherwise NCCL does
not use the fast NVLS implementation for all-reduce.
Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
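A small illustrative pre-flight check, assuming only the variable names listed in the docstring above (it is not part of this script), could warn when those settings are missing:

import os

# Environment variables named in the docstring above; the required values are
# an assumption based on that text.
_NVLS_ENV = {
    "NCCL_NVLS_ENABLE": "1",
    "NCCL_CUMEM_ENABLE": "1",
    "VLLM_USE_NCCL_SYMM_MEM": "1",
}


def warn_if_nvls_env_missing() -> None:
    missing = [k for k, v in _NVLS_ENV.items() if os.environ.get(k) != v]
    if missing:
        print(
            "NCCL symmetric memory may fall back to the slow all-reduce path; "
            f"consider setting: {', '.join(missing)}"
        )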
@ -18,15 +22,21 @@ Example:
import json
import os
import time
from collections.abc import Callable
from contextlib import nullcontext
from typing import Callable, Optional
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
@ -98,6 +108,7 @@ class CommunicatorBenchmark:
)
if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None
@ -194,6 +205,15 @@ class CommunicatorBenchmark:
None, # no env variable needed
)
)
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem
@ -244,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single(
self,
sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
should_use_fn: Callable[[torch.Tensor], bool],
context,
num_warmup: int,
num_trials: int,
) -> Optional[float]:
) -> float | None:
"""Benchmark method with CUDA graph optimization."""
try:
# Create test tensor (2D: sequence_length x hidden_size)
@ -271,7 +291,9 @@ class CommunicatorBenchmark:
# Capture the graph using context manager
with context:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)

View File

@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@ -96,6 +97,11 @@ def bench_run(
a_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
for _ in range(num_repeats):
fused_experts(
a,
@ -103,10 +109,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_config=quant_config,
)
def run_cutlass_moe(
@ -125,6 +128,12 @@ def bench_run(
per_act_token: bool,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@ -132,14 +141,11 @@ def bench_run(
w2,
topk_weights,
topk_ids,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
per_act_token,
a1_scale=None,
quant_config=quant_config,
)
def run_cutlass_from_graph(
@ -156,6 +162,12 @@ def bench_run(
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
per_act_token_quant=per_act_token,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -165,14 +177,11 @@ def bench_run(
w2_q,
topk_weights,
topk_ids,
w1_scale,
w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
per_act_token,
a1_scale=None,
quant_config=quant_config,
)
def run_triton_from_graph(
@ -185,6 +194,11 @@ def bench_run(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
)
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@ -194,10 +208,7 @@ def bench_run(
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale,
quant_config=quant_config,
)
def replay_graph(graph, num_repeats):

View File

@ -6,11 +6,12 @@ import copy
import json
import pickle
import time
from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor(
def make_rand_tensors(
a_shape: tuple[int],
b_shape: tuple[int],
c_shape: tuple[int],
a_shape: tuple[int, ...],
b_shape: tuple[int, ...],
c_shape: tuple[int, ...],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
@ -158,7 +159,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor,
scaling: float,
add_inputs: Optional[bool],
add_inputs: bool | None,
):
"""
Torch group gemm reference implementation to test correctness of
@ -243,7 +244,7 @@ class OpType(Enum):
lora_rank: int,
num_loras: int,
num_slices: int,
) -> tuple[tuple[int], tuple[int], tuple[int]]:
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
@ -316,8 +317,8 @@ class BenchmarkContext:
lora_rank: int
sort_by_lora_id: bool
dtype: torch.dtype
seq_length: Optional[int] = None
num_slices: Optional[int] = None # num_slices for slice based ops
seq_length: int | None = None
num_slices: int | None = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self)
@ -464,7 +465,11 @@ class BenchmarkTensors:
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
setattr(self.lora_kernel_meta, field_name, to_device(field))
setattr(
self.lora_kernel_meta,
field_name,
to_device(field) if field_name != "no_lora_flag_cpu" else field,
)
def metadata(self) -> tuple[int, int, int]:
"""
@ -512,6 +517,7 @@ class BenchmarkTensors:
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"scaling": 1.0,
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@ -552,10 +558,11 @@ class BenchmarkTensors:
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"offset_start": 0,
"add_inputs": add_inputs,
"no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def bench_fn_kwargs(
self, op_type: OpType, add_inputs: Optional[bool] = None
self, op_type: OpType, add_inputs: bool | None = None
) -> dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
@ -569,7 +576,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
self, op_type: OpType, expand_fn_add_inputs: bool | None
) -> bool:
"""
Test correctness of op_type implementation against a grouped gemm
@ -605,8 +612,8 @@ def bench_optype(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: Optional[bool] = None,
cuda_graph_nops: int | None = None,
expand_fn_add_inputs: bool | None = None,
test_correctness: bool = False,
) -> TMeasurement:
assert arg_pool_size >= 1
@ -673,7 +680,7 @@ def bench_torch_mm(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
cuda_graph_nops: int | None = None,
) -> TMeasurement:
"""
Benchmark basic torch.mm as a roofline.
@ -738,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
compare = TBenchmark.Compare(timers)
compare.print()

View File

@ -8,10 +8,9 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import pandas as pd
import torch
@ -63,23 +62,23 @@ class BenchmarkTensors:
a: torch.Tensor
w_q: torch.Tensor
group_size: Optional[int]
group_size: int | None
wtype: ScalarType
w_g_s: torch.Tensor
w_g_zp: Optional[torch.Tensor]
w_ch_s: Optional[torch.Tensor]
w_tok_s: Optional[torch.Tensor]
w_g_zp: torch.Tensor | None
w_ch_s: torch.Tensor | None
w_tok_s: torch.Tensor | None
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
group_zero_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
output_type: torch.dtype | None
group_scale_type: torch.dtype | None
group_zero_type: torch.dtype | None
channel_scale_type: torch.dtype | None
token_scale_type: torch.dtype | None
def rand_data(shape, dtype=torch.float16, scale=1):
@ -93,8 +92,8 @@ def quantize_and_pack(
atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
stype: torch.dtype | None,
group_size: int | None,
zero_points: bool = False,
):
assert wtype.is_integer(), "TODO: support floating point weights"
@ -113,7 +112,7 @@ def quantize_and_pack(
def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
) -> list[BenchmarkTensors]:
m, n, k = shape
@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
def bench(

View File

@ -14,6 +14,10 @@ import ray
import torch
from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
_get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
@ -134,43 +138,36 @@ def benchmark_config(
def run():
from vllm.model_executor.layers.fused_moe import override_config
if use_fp8_w8a8:
quant_dtype = torch.float8_e4m3fn
elif use_int8_w8a16:
quant_dtype = torch.int8
else:
quant_dtype = None
quant_config = FusedMoEQuantConfig.make(
quant_dtype=quant_dtype,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, False
)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
topk_weights, topk_ids, token_expert_indices = fused_topk(
x, input_gating, topk, renormalize=not use_deep_gemm
)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
quant_config=quant_config,
allow_deep_gemm=use_deep_gemm,
)
# JIT compilation & warmup
run()
@ -414,7 +411,7 @@ class BenchmarkWorker:
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@ -547,7 +544,7 @@ def save_configs(
block_quant_shape: list[int],
save_dir: str,
) -> None:
dtype_str = get_config_dtype_str(
dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
@ -560,7 +557,7 @@ def save_configs(
filename = os.path.join(save_dir, filename)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
json.dump(configs, f, indent=4)
json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
f.write("\n")
@ -582,18 +579,22 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
"Glm4MoeForCausalLM",
):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
@ -602,10 +603,18 @@ def main(args: argparse.Namespace):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
else:
# Support for llama4
config = config.get_text_config()
@ -613,6 +622,7 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
@ -621,8 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute

View File

@ -3,7 +3,6 @@
import random
import time
from typing import Optional
import torch
@ -37,7 +36,7 @@ def main(
seed: int,
do_profile: bool,
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
kv_cache_dtype: str | None = None,
) -> None:
current_platform.seed_everything(seed)

View File

@ -3,8 +3,8 @@
import argparse
import math
from collections.abc import Callable
from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch
import torch

View File

@ -0,0 +1,172 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
num_iters: int,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
function_under_test = lambda: ops.reshape_and_cache(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
num_iters=args.iters,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, lat * 1e6]) # convert to microseconds
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 128)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=200)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
@ -9,6 +7,9 @@ import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
@ -31,6 +32,8 @@ def run_benchmark(
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
implementation: str,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
@ -38,6 +41,14 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
if implementation not in ("cuda", "triton"):
raise ValueError(
f"Unsupported implementation: {implementation}. "
"Only 'cuda' and 'triton' are supported."
)
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
current_platform.seed_everything(42)
torch.set_default_device(device)
@ -65,27 +76,49 @@ def run_benchmark(
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
if implementation == "cuda":
function_under_test = lambda: ops.reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
else:
function_under_test = lambda: triton_reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
ops.reshape_and_cache_flash(
key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@ -116,10 +149,16 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
implementation=args.implementation,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(
f"Benchmark results for implementation {args.implementation}"
f" (measuring with {args.mode}):"
)
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@ -151,6 +190,21 @@ if __name__ == "__main__":
)
parser.add_argument("--iters", type=int, default=100)
parser.add_argument(
"--implementation",
type=str,
choices=["cuda", "triton"],
default="cuda",
)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union
import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
residual: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -65,7 +64,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional
import nvtx
import torch
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
rotary_dim: int | None,
dtype: torch.dtype,
seed: int,
device: str,

View File

@ -1,5 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Comprehensive 3-way SiLU Benchmark Suite
This benchmark compares the following SiLU implementations:
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
2. Triton Kernel - Triton-based implementation
The suite generates detailed performance comparisons including:
- Memory bandwidth utilization
- Speedup ratios (baseline vs optimized implementations)
- Performance across different expert configurations and token distributions
"""
from collections.abc import Callable
import matplotlib.pyplot as plt
@ -7,7 +21,7 @@ import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
silu_mul_fp8_quant_deep_gemm_cuda,
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
num_parallel_tokens,
group_size: int = 128,
eps: float = 1e-10,
expert_offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
# Parse generation strategies
strategies = ["uniform", "max_t", "first_t"]
strategies = ["random_imbalanced", "uniform", "max_t"]
def benchmark(
@ -195,15 +210,27 @@ def benchmark(
current_platform.seed_everything(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "uniform":
r = torch.rand(size=(E,), device="cuda")
if gen_strategy == "random_imbalanced":
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
mean = total_tokens // n_e
min_max = mean // ratio
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
e[0] = min_max
r = torch.rand(size=(E - 1,))
r /= r.sum()
r *= total_tokens - min_max
r = r.round().long()
e[1:] = r.to(device=device)
return e
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
elif gen_strategy == "uniform":
r = torch.rand(size=(E,))
r /= r.sum()
r *= total_tokens
tokens_per_expert = r.int()
tokens_per_expert = torch.minimum(
tokens_per_expert,
torch.ones((E,), device=r.device, dtype=torch.int) * T,
)
r = r.round().long()
tokens_per_expert = r
elif gen_strategy == "max_t":
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
tokens_per_expert.fill_(total_tokens / E)
@ -281,40 +308,34 @@ def benchmark(
def create_comparison_plot(
ratio, cuda_times, baseline_times, config_labels, strategy_name, id
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
):
"""Create a comparison plot for a specific generation strategy"""
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
fig, ax = plt.subplots(1, 1, figsize=(18, 6))
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.35
width = 0.25
# Execution Time plot (lower is better)
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
ax.bar(
x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
)
ax.bar(
x + width / 2,
baseline_times,
width,
label="Baseline",
alpha=0.8,
color="orange",
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
)
# Add speedup labels over each bar pair
# Add speedup labels over each bar trio
for i in range(len(x)):
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
triton_v2_speedup = ratios[i][1] # triton/v2
max_height = max(silu_v2_times[i], triton_times[i])
# Triton/V2 speedup
ax.text(
x[i],
x[i] + width / 2,
max_height + max_height * 0.02,
f"{speedup:.2f}x",
f"{triton_v2_speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=9,
fontsize=8,
)
ax.set_xlabel("Configuration")
@ -332,56 +353,75 @@ def create_comparison_plot(
def create_combined_plot(all_results):
"""Create a combined plot with all strategies in one PNG"""
num_strategies = len(all_results)
fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
if num_strategies == 1:
axes = [axes]
for idx, (
strategy_name,
ratio,
cuda_times,
baseline_times,
all_ratios,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
) in enumerate(all_results):
ax = axes[idx]
# Flatten the nested results to get bandwidth percentages for plotting
silu_v2_bandwidths = []
triton_bandwidths = []
flat_ratios = []
for config_results in all_silu_v2_results:
for result in config_results:
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
for config_results in all_triton_results:
for result in config_results:
triton_bandwidths.append(result[3]) # bandwidth percentage
for config_ratios in all_ratios:
for ratio in config_ratios:
flat_ratios.append(ratio)
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.35
width = 0.25
# Execution Time plot (lower is better)
# Bandwidth utilization plot (higher is better)
ax.bar(
x - width / 2,
cuda_times,
x,
silu_v2_bandwidths,
width,
label="CUDA Kernel",
label="SiLU V2 (CUDA)",
alpha=0.8,
color="blue",
)
ax.bar(
x + width / 2,
baseline_times,
x + width,
triton_bandwidths,
width,
label="Baseline",
label="Triton Kernel",
alpha=0.8,
color="orange",
color="green",
)
# Add speedup labels over each bar pair
# Add speedup labels over each bar trio
for i in range(len(x)):
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
triton_v2_speedup = flat_ratios[i] # triton/v2
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
# Triton/V2 speedup
ax.text(
x[i],
x[i] + width / 2,
max_height + max_height * 0.02,
f"{speedup:.2f}x",
f"{triton_v2_speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=9,
fontsize=8,
)
ax.set_xlabel("Configuration")
@ -395,7 +435,7 @@ def create_combined_plot(all_results):
ax.grid(True, alpha=0.3)
plt.tight_layout()
filename = "../../silu_bench/silu_benchmark_combined.png"
filename = "silu_benchmark_combined_3way.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
@ -405,7 +445,9 @@ def create_combined_plot(all_results):
outer_dim = 7168
configs = [
# DeepSeekV3 Configs
# (1, 56, 7168),
(8, 1024, 7168),
# (32, 56, 7168),
# DeepSeekV3 Configs
(32, 1024, 7168),
# DeepSeekV3 Configs
@ -417,6 +459,7 @@ num_warmups = 20
strategy_descriptions = {
"uniform": "Uniform Random",
"random_imbalanced": "Imbalanced Random",
"max_t": "Even Assignment",
"first_t": "experts[0] = T, experts[1:] = 0",
}
@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
print(f"Testing strategy: {strategy_descriptions[strategy]}")
print(f"{'=' * 60}")
# Collect benchmark data for both algorithms
# Collect benchmark data for all three algorithms
config_labels = []
config_x_axis = []
all_cuda_results = []
all_baseline_results = []
all_silu_v2_results = []
all_triton_results = []
all_ratios = []
for E, T, H in configs:
total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
total_tokens_config = []
for i in [8, 16, 32, 64, 128, 256, 512]:
if i <= T:
total_tokens_config.append(i * E)
config_x_axis.append(total_tokens_config)
cuda_results = []
baseline_results = []
silu_v2_results = []
triton_results = []
ratios = []
for total_tokens in total_tokens_config:
config_label = f"E={E},T={T},H={H},TT={total_tokens}"
config_labels.append(config_label)
# CUDA kernel results
time_ms_cuda, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_cuda,
# SiLU V2 (CUDA kernel) results
time_ms_silu_v2, gflops, gbps, perc = benchmark(
persistent_masked_m_silu_mul_quant,
E,
T,
H,
@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
cuda_results.append((time_ms_cuda, gflops, gbps, perc))
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
# Baseline results
# Triton kernel results
time_ms_triton, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_triton,
E,
@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
baseline_results.append((time_ms_triton, gflops, gbps, perc))
ratios.append(time_ms_triton / time_ms_cuda)
triton_results.append((time_ms_triton, gflops, gbps, perc))
print(f"Completed: {config_label}")
all_cuda_results.append(cuda_results)
all_baseline_results.append(baseline_results)
# Calculate speedup ratios (triton baseline / implementation)
triton_v2_ratio = time_ms_triton / time_ms_silu_v2
ratios.append(triton_v2_ratio)
print(
f"Completed: {config_label}:"
f" V2: {time_ms_silu_v2:.3f}ms,"
f" Triton: {time_ms_triton:.3f}ms"
)
all_silu_v2_results.append(silu_v2_results)
all_triton_results.append(triton_results)
all_ratios.append(ratios)
# Store results for combined plotting
@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
(
strategy_descriptions[strategy],
all_ratios,
all_cuda_results,
all_baseline_results,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
)
@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):
# Print summary table for this strategy
print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
print("-" * 60)
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
print("-" * 90)
for i, (E, T, H) in enumerate(configs):
speedup = baseline_results[i][0] / cuda_results[i][0]
# Get the first result for each config (simplifying for summary)
v2_time = silu_v2_results[i][0]
triton_time = triton_results[i][0]
triton_v2_speedup = triton_time / v2_time
config_label = f"E={E:3d},T={T:4d},H={H:4d}"
print(
f"{config_label:<20} {cuda_results[i][0]:8.5f} "
f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
f"{triton_v2_speedup:8.2f}x"
)
@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
num_strategies = len(all_results)
num_configs = len(configs)
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
fig, axs = plt.subplots(
num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
)
# Add main title to the entire figure
fig.suptitle(
"Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
fontsize=16,
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
fontsize=18,
fontweight="bold",
y=0.98,
)
@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
(
strategy_name,
all_ratios,
all_cuda_results,
all_baseline_results,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
) = result
@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
ratios = all_ratios[config_idx]
total_tokens_values = config_x_axis[config_idx]
# Extract CUDA and Triton bandwidth percentages
cuda_bandwidth_percentages = [
result[3] for result in all_cuda_results[config_idx]
# Extract speedup ratios
triton_v2_ratios = [ratio for ratio in ratios]
# Extract bandwidth percentages for all implementations
v2_bandwidth_percentages = [
result[3] for result in all_silu_v2_results[config_idx]
]
triton_bandwidth_percentages = [
result[3] for result in all_baseline_results[config_idx]
result[3] for result in all_triton_results[config_idx]
]
# Plot speedup ratios vs total tokens (left plot)
ax_speedup.plot(
total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
total_tokens_values,
triton_v2_ratios,
"go-",
linewidth=3,
markersize=8,
label="Triton/V2 Speedup",
)
ax_speedup.set_title(
f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
fontsize=12,
fontweight="bold",
)
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
ax_speedup.legend(prop={"weight": "bold"})
ax_speedup.grid(True, alpha=0.3)
# Plot bandwidth utilization (right plot)
ax_bandwidth.plot(
total_tokens_values,
cuda_bandwidth_percentages,
"ro-",
v2_bandwidth_percentages,
"o-",
linewidth=3,
markersize=8,
label="CUDA",
label="SiLU V2",
color="blue",
)
ax_bandwidth.plot(
total_tokens_values,
triton_bandwidth_percentages,
"go-",
"o-",
linewidth=3,
markersize=8,
label="Triton",
color="green",
)
ax_bandwidth.set_title(
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontweight("bold")
# Add value labels on speedup points
for x, y in zip(total_tokens_values, ratios):
# Add value labels on Triton/V2 speedup points
for x, y in zip(total_tokens_values, triton_v2_ratios):
ax_speedup.annotate(
f"{y:.2f}x",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
)
# Add value labels on CUDA bandwidth points
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=9,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
)
# Add value labels on Triton bandwidth points
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, -15),
ha="center",
fontsize=9,
@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):
plt.tight_layout()
plt.subplots_adjust(top=0.93) # Make room for main title
filename = "silu_benchmark_total_tokens.png"
filename = "silu_benchmark_total_tokens_3way.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
return filename
# Create combined plot with all strategies
combined_plot_filename = create_total_tokens_plot(all_results)
# Create comprehensive 3-way comparison plots
combined_plot_filename = create_combined_plot(all_results)
total_tokens_plot_filename = create_total_tokens_plot(all_results)
print(f"\n{'=' * 60}")
print("Benchmark Complete!")
print(f"Generated combined plot: {combined_plot_filename}")
print(f"{'=' * 60}")
print(f"\n{'=' * 80}")
print("3-Way Benchmark Suite Complete!")
print(f"Generated combined comparison plot: {combined_plot_filename}")
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
print("Compared: SiLU V2 (CUDA), and Triton implementations")
print(f"{'=' * 80}")

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_decode(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_prefill(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any
import torch
import triton
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
_w8a8_triton_block_scaled_mm,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
@ -83,7 +83,7 @@ def w8a8_block_matmul(
)
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_block_fp8_matmul
kernel = _w8a8_triton_block_scaled_mm
else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import time
@ -8,27 +7,33 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
w8a8_block_fp8_matmul,
w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
from vllm.utils.deep_gemm import (
calc_diff,
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8,
)
def benchmark_shape(m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False) -> dict:
def benchmark_shape(
m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False,
) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
@ -45,34 +50,39 @@ def benchmark_shape(m: int,
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True)
A, block_size[1], column_major_scales=True
)
# === DeepGEMM Implementation ===
def deepgemm_gemm():
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm),
C_deepgemm)
fp8_gemm_nt(
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
)
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_block_fp8_matmul(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16)
return w8a8_triton_block_scaled_mm(
A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16,
)
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm(A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16)
return ops.cutlass_scaled_mm(
A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16,
)
# Run correctness check first
if verbose:
@ -89,26 +99,23 @@ def benchmark_shape(m: int,
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print("vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
print("vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
print(
"vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
)
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm
"vLLM CUTLASS": vllm_cutlass_gemm,
}
benchmark_results = {
"shape": {
"m": m,
"n": n,
"k": k
},
"implementations": {}
}
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
for name, func in implementations.items():
# Warmup
@ -136,38 +143,36 @@ def benchmark_shape(m: int,
"tflops": tflops,
"gb_s": gb_s,
"diff": {
"DeepGEMM":
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
"Reference":
deepgemm_diff if name == "DeepGEMM" else
(vllm_triton_diff
if name == "vLLM Triton" else vllm_cutlass_diff)
}
"DeepGEMM": 0.0
if name == "DeepGEMM"
else calc_diff(func(), C_deepgemm),
"Reference": deepgemm_diff
if name == "DeepGEMM"
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
},
}
if verbose:
print(
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
)
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name][
"speedup_vs_deepgemm"] = speedup
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
if verbose:
print(f"DeepGEMM is {1/speedup:.2f}x "
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
print(
f"DeepGEMM is {1 / speedup:.2f}x "
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
"time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
"time_ms"]
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"][
"speedup_vs_triton"] = cutlass_vs_triton
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
cutlass_vs_triton
)
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@ -179,8 +184,7 @@ def benchmark_shape(m: int,
def format_table_row(values, widths):
"""Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}"
for val, w in zip(values, widths)) + " |"
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
])
deepgemm_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
print_table(deepgemm_headers,
deepgemm_rows,
title="DeepGEMM Implementation:")
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
# Print vLLM Triton table
triton_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
]
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(speedup)
])
triton_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
print_table(triton_headers,
triton_rows,
title="vLLM Triton Implementation:")
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
"vs Triton"
"m",
"n",
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
]
cutlass_rows = []
for result in all_results:
@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton)
])
cutlass_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
print_table(cutlass_headers,
cutlass_rows,
title="vLLM CUTLASS Implementation:")
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
impl: {
"tflops": 0,
"gb_s": 0,
"time_ms": 0
}
for impl in implementations
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
}
for result in all_results:
@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append([
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
])
avg_rows.append(
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
)
print_table(avg_headers, avg_rows)
@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0
"vLLM CUTLASS vs vLLM Triton": 0,
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
"time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
avg_speedups[
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups[
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups[
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
vllm_triton_time / vllm_cutlass_time
)
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"][
"Reference"]
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable
from typing import Any, Callable, Optional
from collections.abc import Callable, Iterable
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__(
self,
cuda_graph_params: Optional[CudaGraphBenchParams],
cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,

View File

@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` determines the input conversations for the benchmark.<br/>
When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool generates synthetic multi-turn (questions and answers) conversations.
The file `generate_multi_turn.json` is provided as an example.
The file must contain the sections `prompt_input` and `prompt_output` (a minimal sketch is shown after the field descriptions below).
The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
The final value will always be rounded to an even number so each user turn has a reply.
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
* `num_tokens` - Total token length of each **user** message (one turn).
The `prompt_output` section must contain `num_tokens`:
* `num_tokens` - Total token length of each **assistant** message (one turn).
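As a minimal sketch (this assumes `"filetype"` is a top-level field and that no fields beyond those described above are required; the distribution choices and numbers are illustrative placeholders, not recommendations):
```json
{
    "filetype": "generate_conversations",
    "prompt_input": {
        "num_turns": { "distribution": "uniform", "min": 2, "max": 8 },
        "prefix_num_tokens": { "distribution": "lognormal", "average": 1000, "max": 5000 },
        "num_tokens": { "distribution": "uniform", "min": 120, "max": 160 }
    },
    "prompt_output": {
        "num_tokens": { "distribution": "lognormal", "average": 500, "max": 2000 }
    }
}
```
See `generate_multi_turn.json` for the full reference example.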
### Random distributions for synthetic conversations generation
When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
The distribution determines how to randomly sample values for the field.
The available distributions are listed below.
**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
This is useful, for example, for making sure that the total number of tokens in every request does not exceed `--max-model-len`.
#### constant
```json
{
"distribution": "constant",
"value": 500
}
```
* `value` - the fixed integer value (always returns the same number).
#### uniform
```json
{
"distribution": "uniform",
"min": 12,
"max": 18
}
```
* `min` - minimum value (inclusive).
* `max` - maximum value (inclusive); must be greater than or equal to `min`.
#### lognormal
```json
{
"distribution": "lognormal",
"average": 1000,
"max": 5000
}
```
You can parameterize the lognormal distribution in one of two ways:
Using the `average` and optional `median_ratio` fields:
* `average` - target average value of the distribution.
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
Using the `mean` and `sigma` parameters of the underlying normal distribution:
* `mean` - mean of the underlying normal distribution.
* `sigma` - standard deviation of the underlying normal distribution.
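When `average` is used, the underlying normal parameters are derived from the target median `median = average * median_ratio` as `sigma = sqrt(2 * ln(average / median))` and `mu = ln(median)`; the sampled values are then rescaled so that their empirical mean matches `average` before the optional `max` cap is applied.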
#### zipf
```json
{
"distribution": "zipf",
"alpha": 1.2,
"max": 100
}
```
* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
#### poisson
```json
{
"distribution": "poisson",
"alpha": 10,
"max": 50
}
```
* `alpha` - expected value (λ). Also the variance of the distribution.
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple, Optional, Union
from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution):
def __init__(
self,
min_val: Union[int, float],
max_val: Union[int, float],
min_val: int | float,
max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution):
def __init__(self, value: Union[int, float]) -> None:
def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
self, mean: float, sigma: float, max_val: Optional[int] = None
self,
mean: float | None = None,
sigma: float | None = None,
average: int | None = None,
median_ratio: float | None = None,
max_val: int | None = None,
) -> None:
self.average = average
self.median_ratio = median_ratio
self.max_val = max_val
if average is not None:
if average < 1:
raise ValueError("Lognormal average must be positive")
if mean or sigma:
raise ValueError(
"When using lognormal average, you can't provide mean/sigma"
)
if self.median_ratio is None:
# Default value that provides relatively wide range of values
self.median_ratio = 0.85
# Calculate mean/sigma of np.random.lognormal based on the average
mean, sigma = self._generate_lognormal_by_median(
target_average=self.average, median_ratio=self.median_ratio
)
else:
if mean is None or sigma is None:
raise ValueError(
"Must provide both mean and sigma if average is not used"
)
if mean <= 0 or sigma < 0:
raise ValueError(
"Lognormal mean must be positive and sigma must be non-negative"
)
# Mean and standard deviation of the underlying normal distribution
# Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
self.max_val = max_val
@staticmethod
def _generate_lognormal_by_median(
target_average: int, median_ratio: float
) -> tuple[float, float]:
"""
Compute (mu, sigma) for a lognormal distribution given:
- a target average (mean of the distribution)
- a ratio of median / mean (controls skewness), assume mean > median
Background:
If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
* mean(X) = exp(mu + sigma^2 / 2)
* median(X) = exp(mu)
So:
median / mean = exp(mu) / exp(mu + sigma^2 / 2)
= exp(-sigma^2 / 2)
Rearranging:
sigma^2 = 2 * ln(mean / median)
mu = ln(median)
This gives a unique (mu, sigma) for any valid mean and median.
"""
# Check input validity: median must be smaller than mean
if median_ratio <= 0 or median_ratio >= 1:
raise ValueError("median_ratio must be in range (0, 1)")
target_median = target_average * median_ratio
# Solve sigma^2 = 2 * ln(mean / median)
sigma = np.sqrt(2 * np.log(target_average / target_median))
mu = np.log(target_median)
return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.average is not None:
# Scale to average
samples *= self.average / samples.mean()
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
return f"LognormalDistribution[{self.mean}, {self.sigma}]"
if self.average:
return (
f"LognormalDistribution[{self.average}, "
f"{self.median_ratio}, {self.max_val}]"
)
return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
@ -173,10 +257,21 @@ def get_random_distribution(
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
max_val = conf.get("max", None)
if "average" in conf:
# Infer lognormal mean/sigma (numpy) from input average
median_ratio = conf.get("median_ratio", None)
return LognormalDistribution(
average=conf["average"], median_ratio=median_ratio, max_val=max_val
)
# Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
max_val = conf.get("max", None)
return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
return LognormalDistribution(
mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple, Optional, Union
from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple):
seed: int
max_num_requests: Optional[int]
max_num_requests: int | None
skip_first_turn: bool
max_turns: Optional[int]
max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats:
def __init__(self) -> None:
self.min: Optional[float] = None
self.max: Optional[float] = None
self.avg: Optional[float] = None
self.min: float | None = None
self.max: float | None = None
self.avg: float | None = None
self.sum = 0.0
self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0
self.sum = 0.0
self.count = 0
self.avg: Optional[float] = None
self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
@ -198,14 +198,6 @@ class DebugStats:
self.logger.info("-" * 50)
# Must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
@ -220,8 +212,8 @@ async def send_request(
chat_url: str,
model: str,
stream: bool = True,
min_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
min_tokens: int | None = None,
max_tokens: int | None = None,
) -> ServerResponse:
payload = {
"model": model,
@ -250,9 +242,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
ttft: Optional[float] = None
ttft: float | None = None
chunk_delay: list[int] = []
latency: Optional[float] = None
latency: float | None = None
first_chunk = ""
generated_text = ""
@ -269,7 +261,7 @@ async def send_request(
if not chunk_bytes:
continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
@ -364,7 +356,7 @@ async def send_turn(
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
) -> Optional[RequestStats]:
) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
@ -644,7 +636,7 @@ async def client_main(
if args.verbose:
curr_time_sec: float = time.perf_counter()
time_since_last_turn: Union[str, float] = "N/A"
time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
@ -769,7 +761,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients"
)
max_req_per_client: Optional[int] = None
max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
@ -936,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
runtime_left_sec: Union[str, float] = round(
runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:
@ -1032,7 +1024,7 @@ def process_statistics(
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
gen_conv_args: Optional[GenConvArgs] = None,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
) -> None:
if len(client_metrics) == 0:

View File

@ -13,7 +13,7 @@ import argparse
import json
import random
from statistics import mean
from typing import Any, Optional
from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid(
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: Optional[int],
min_content_len: Optional[int] = None,
max_content_len: Optional[int] = None,
min_turns: Optional[int] = None,
max_turns: Optional[int] = None,
model: Optional[str] = None,
max_items: int | None,
min_content_len: int | None = None,
max_content_len: int | None = None,
min_turns: int | None = None,
max_turns: int | None = None,
model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns

View File

@ -15,9 +15,8 @@
},
"prefix_num_tokens": {
"distribution": "lognormal",
"mean": 6,
"sigma": 4,
"max": 1500
"average": 1000,
"max": 5000
},
"num_tokens": {
"distribution": "uniform",

View File

@ -1,49 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
[tool.ruff.format]
docstring-code-format = true

View File

@ -101,6 +101,7 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)
@ -177,8 +178,14 @@ elseif (S390_FOUND)
"-mzvector"
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FATAL_ERROR "RVV is not supported yet.")
else()
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
#
@ -191,13 +198,24 @@ else()
endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@ -206,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@ -258,7 +277,8 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
"csrc/cpu/torch_bindings.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
@ -300,4 +320,4 @@ define_gpu_extension_target(
WITH_SOABI
)
message(STATUS "Enabling C extension.")
message(STATUS "Enabling C extension.")

View File

@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
list(APPEND SUPPORT_ARCHS 9.0a)
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
list(APPEND SUPPORT_ARCHS 10.0a)
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
)
set(FlashMLA_Extension_SOURCES
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc)
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_Extension_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_gpu_extension_target(
_flashmla_C
DESTINATION vllm
@ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
define_gpu_extension_target(
_flashmla_extension_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_Extension_SOURCES}
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_extension_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else()
# Create an empty target for setup.py when not targeting sm90a systems
# Create empty targets for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()

View File

@ -0,0 +1,97 @@
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.0a")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -16,7 +16,7 @@ import shutil
from torch.utils.hipify.hipify_python import hipify
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Project directory where all the source + include files live.
@ -34,15 +34,14 @@ if __name__ == '__main__':
)
# Source files to convert.
parser.add_argument("sources",
help="Source files to hipify.",
nargs="*",
default=[])
parser.add_argument(
"sources", help="Source files to hipify.", nargs="*", default=[]
)
args = parser.parse_args()
# Limit include scope to project_dir only
includes = [os.path.join(args.project_dir, '*')]
includes = [os.path.join(args.project_dir, "*")]
# Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources]
@ -51,25 +50,31 @@ if __name__ == '__main__':
# The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
hipify_result = hipify(project_directory=args.project_dir,
output_directory=args.output_dir,
header_include_dirs=[],
includes=includes,
extra_files=extra_files,
show_detailed=True,
is_pytorch_extension=True,
hipify_extra_files_only=True)
hipify_result = hipify(
project_directory=args.project_dir,
output_directory=args.output_dir,
header_include_dirs=[],
includes=includes,
extra_files=extra_files,
show_detailed=True,
is_pytorch_extension=True,
hipify_extra_files_only=True,
)
hipified_sources = []
for source in args.sources:
s_abs = os.path.abspath(source)
hipified_s_abs = (hipify_result[s_abs].hipified_path if
(s_abs in hipify_result
and hipify_result[s_abs].hipified_path is not None)
else s_abs)
hipified_s_abs = (
hipify_result[s_abs].hipified_path
if (
s_abs in hipify_result
and hipify_result[s_abs].hipified_path is not None
)
else s_abs
)
hipified_sources.append(hipified_s_abs)
assert (len(hipified_sources) == len(args.sources))
assert len(hipified_sources) == len(args.sources)
# Print hipified source files.
print("\n".join(hipified_sources))

View File

@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES _PTX_ARCHS)
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
set(_CUDA_ARCHS)
foreach(_arch ${_SRC_CUDA_ARCHS})
if(_arch MATCHES "\\a$")
if(_arch MATCHES "[af]$")
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
string(REPLACE "a" "" _base "${_arch}")
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)

12
codecov.yml Normal file
View File

@ -0,0 +1,12 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -28,10 +28,10 @@
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
#include "../quantization/fp8/amd/quant_utils.cuh"
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))

Some files were not shown because too many files have changed in this diff.