Compare commits

...

421 Commits

Author SHA1 Message Date
8a81d776ce Fix typo in ValueError message: use kv_role instead of kv_disagg_role (#27166)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-19 19:47:19 +00:00
f6fdacd82c [Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (#26586)
Signed-off-by: southfreebird <yvorott@gmail.com>
2025-10-19 19:24:46 +00:00
d31f7844f8 [Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-19 05:20:55 -07:00
7a6c8c3fa1 [Chore] Separate out vllm.utils.network_utils (#27164)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
2025-10-19 03:06:32 -07:00
221bf72577 output type conversion fix (#27159) 2025-10-19 08:10:07 +00:00
b3aba04e5a [Benchmark] Convenience script for multiple parameter combinations (#27085)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-18 23:57:01 -07:00
8a297115e2 [Chore] Separate out hashing utilities from vllm.utils (#27151)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-19 11:09:38 +08:00
191eed0bb9 [BugFix] Fix lazy imports involving outlines_core (#27158)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-19 02:35:32 +00:00
fb860670da [Minor] Remove unused env variable (#27161) 2025-10-18 18:48:35 -07:00
83e760c57d [V1][Metrics][Plugin] Add plugin support for custom StatLoggerBase implementations (#22456)
Signed-off-by: tovam <tovam@pliops.com>
2025-10-18 15:12:46 -07:00
c2bba69065 [BugFix] Disable fp8 kv-cache by default for DeepSeek V3.2 (#27121)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 22:05:23 +00:00
e133d6d218 [BugFix] fix graph partition signature (#27139)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-18 17:34:36 -04:00
a1946c9f61 [Chore] Separate out profiling utilities from vllm.utils (#27150)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 19:12:01 +00:00
9f020f4f31 [BugFix] Fix failing gemma-3-1b-it test: test_lm_eval_accuracy_v1_engine[google/gemma-3-1b-it] (#27111)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-18 12:44:39 -06:00
3b45075206 [Minor] Add some clarifying comments to recent changes (#27130)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-18 09:52:45 -07:00
168e578efc Fix incorrect string formatting in barrier timeout exceptions (#27149)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-18 09:51:57 -07:00
6ac5e06f7c [Chore] Clean up pytorch helper functions in vllm.utils (#26908)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: isotr0py <2037008807@qq.com>
2025-10-18 09:48:22 -07:00
5c2acb270a [Models][QwenVL] Remove unnecessary .contiguous() calls (#27106)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-18 07:05:05 -07:00
b26b70bec4 [Misc] Refactor get_kv_cache_spec into AttentionLayerBase (#26587)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-18 13:51:21 +00:00
ab4be40fc5 [fix][cpu] fix prefill attention in CPU attention backend (#27035)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-18 13:30:21 +00:00
245e4f2c01 [Feature] Batch Invariant: Support DeepGEMM and Blackwell (#27127)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-18 09:28:05 -04:00
1d165d6d85 [Chore] Separate out vllm.utils.mem_utils (#27143)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 10:06:59 +00:00
83004020fd [Test] Add test for /health endpoint on engine failure (#26074)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 09:59:05 +00:00
12e21701e7 [DOC][FEATURES][CPU]update cpu feature for v1 (#27135)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-18 01:10:45 -07:00
30a33b92ee [Misc] Rev DeepEP (#27122)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-18 14:54:29 +08:00
7c572544e4 [GPT-OSS] Structure_Tag support for gpt-oss tool-call in cot (#25515)
Signed-off-by: Hanchenli <lihanc2002@gmail.com>
Signed-off-by: Hanchenli <61769611+Hanchenli@users.noreply.github.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Wei Wei <wwei6@meta.com>
Co-authored-by: Wei Wei <weiweinpu@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-17 21:55:54 -07:00
c312320764 [CI/Build] tests(v1): feed Triton attention the (num_blocks, 2, …) KV cache layout in backend-correctness tests (#26663)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-17 21:11:26 -07:00
c981f0ea78 [Perf] Add H100 fused MoE config (#25398)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-10-18 02:21:27 +00:00
6367bde739 [BugFix][Core] Fix error when enable async-scheduling in multi-node env (#25887)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
Signed-off-by: Lehua Ding <lehuading@qq.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-10-17 22:16:18 +00:00
f50cc221ea [Test] Make test_failure more stable for batch invariance (#27054) 2025-10-17 16:59:08 -04:00
acedc74b1a [V1][Spec Decode] Fix greedy temperature detection after sampler refactor (#27077)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
2025-10-17 13:27:47 -07:00
d29483b58a [Minor] Remove unnecessary error message (#27115)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-17 20:02:12 +00:00
950cf9e58e [Bugfix] Use PIECEWISE cudagraphs on Blackwell if max_model_len > 131072 (#27114)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-17 19:47:18 +00:00
3125d79950 [Chore] Remove unused PolyNorm layer (#27110)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-17 19:03:43 +00:00
e33ee23ee3 [Bugfix] [AITER] [ROCm] Fix Quark MoE Quant Config and AITER Fused MoE quant type logic (#27029)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-17 12:51:10 -06:00
b10c64c834 [ROCm][Bugfix][Model] Fix illegal memory access when running qwen3_moe models with rms_norm (Qwen3-235B-A22B, Qwen3-30B-A3B, etc.) (#26192)
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 14:17:18 -04:00
0925b28a8e [ROCM] MoE fp4 CK kernel (#26545)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-10-17 14:06:33 -04:00
99722d5f0e [CI] Remove forbidden slash (#27112)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 09:38:00 -07:00
4c91a28e30 [bugfix] Qwen3-VL: fix incorrect video timestamp calculations when do_sample_frames=True (#27104)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
2025-10-17 16:26:33 +00:00
b038d9c40c [Data-parallel] Allow DP>1 for world_size > num_gpus on node (8) (#26367)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-17 08:24:42 -07:00
2ba60ec7fe [CI] Nixl integration tests (#27010)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 07:13:31 -07:00
bd7157a071 [torch.compile] Enable attention and allreduce fusion without custom ops enabled (#24604)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 08:10:23 -06:00
be429d0cfd Fix incorrect docstring for stop_profile() method (#27101)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-17 06:30:23 -07:00
c253745eb8 [Hardware][AMD][Model] Triton MoE tuning configs for GLM-4.5 for MI350 and MI355 (#25586)
Signed-off-by: Reima Karhila <reima.karhila@amd.com>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Co-authored-by: xaguilar <Xavier.AguilarFruto@amd.com>
2025-10-17 04:56:12 -07:00
daec4d2624 [Model]Improve Qwen3VLMoeForConditionalGeneration packed_modules_mapping (#27096)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:47:00 -07:00
6c9fdbf725 [Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:47:34 -07:00
483ea64611 [Docs] Replace all explicit anchors with real links (#27087)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:22:06 -07:00
e20eba753b [VLM][Refactor] Remove useless func get_input_positions in MRotaryEmbedding (#27088)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-17 02:00:30 -07:00
bbc1b29665 Update troubleshooting.md and remind VLLM_TRACE_FUNCTION usage (#27069)
Signed-off-by: cong-meta <prowindy@hotmail.com>
2025-10-17 01:53:06 -07:00
acb1bfa601 [CI] fix docs build failed (#27082)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-17 07:53:40 +00:00
75c7ad9918 [Kernel][Performance] Fuse float cast and renormalize to topk softmax kernel (#26717)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-10-17 07:30:35 +00:00
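
The commit above fuses the float cast and renormalization into the top-k softmax routing kernel. A minimal PyTorch sketch of the equivalent unfused reference computation (tensor shapes and names are illustrative, not vLLM's kernel API):

```python
import torch


def topk_softmax_renorm(router_logits: torch.Tensor, k: int):
    """Unfused reference: cast, softmax over experts, top-k, renormalize."""
    # Cast to float32 before the softmax for numerical stability.
    probs = torch.softmax(router_logits.float(), dim=-1)
    topk_weights, topk_ids = probs.topk(k, dim=-1)
    # Renormalize so the selected expert weights sum to 1 per token.
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


# Example: 4 tokens routed over 8 experts, top-2 routing.
weights, ids = topk_softmax_renorm(torch.randn(4, 8), k=2)
```

A fused kernel performs these steps in a single pass instead of launching separate cast, softmax, top-k and division ops.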
5550ff9c25 [CI/Build] Update compressed tensor test path to fix CPU CI (#27068)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-16 22:34:56 -07:00
3aeb19a39e [Model] Add support for LightOnOCR (#26916)
Signed-off-by: Said Taghadouini <taghadouinisaid@gmail.com>
Signed-off-by: Said Taghadouini <84044788+staghado@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-17 05:05:24 +00:00
8c017b3490 [Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 05:03:35 +00:00
9c2c2287a0 [CI/Build] Update Llama4 eval yaml (#27070)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-17 04:59:47 +00:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977) 2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] fixes the weights dtype in qwen3_next: it is actually a bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
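
The commit above exploits out-of-band buffers in shm_broadcast. As an illustration only (not the shm_broadcast implementation), pickle protocol 5 shows the idea: large buffers are handed to the transport separately instead of being copied into the serialized byte stream.

```python
import pickle

import numpy as np

payload = {"meta": "batch-0", "data": np.zeros(1 << 20, dtype=np.uint8)}

# Serialize with out-of-band buffers: large buffers go to the callback
# rather than being embedded (and copied) into the pickle byte stream.
buffers: list[pickle.PickleBuffer] = []
header = pickle.dumps(payload, protocol=5, buffer_callback=buffers.append)

# A shared-memory transport can place each buffer directly; deserialization
# re-attaches them without an extra copy of the payload bytes.
restored = pickle.loads(header, buffers=buffers)
assert restored["data"].nbytes == payload["data"].nbytes
```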
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test: `has` to `is` (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
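
The commit above standardizes on HF_TOKEN and deprecates HUGGING_FACE_HUB_TOKEN. A minimal sketch of the documented fallback behaviour (the helper name is hypothetical, not vLLM's API):

```python
import os
import warnings


def get_hf_token() -> str | None:
    """Prefer the standardized HF_TOKEN, falling back to the deprecated name."""
    token = os.environ.get("HF_TOKEN")
    if token:
        return token
    legacy = os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if legacy:
        warnings.warn(
            "HUGGING_FACE_HUB_TOKEN is deprecated; use HF_TOKEN instead.",
            DeprecationWarning,
            stacklevel=2,
        )
    return legacy
```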
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] fix ernie45 vl run failed from shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC] Disable torch.compile() in top-k/top-p sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
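
The commit above replaces boolean-mask indexing with index_select / index_copy to cut device-to-host synchronization. A minimal sketch of the pattern (tensor names and shapes are illustrative):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 256, device=device)
mask = torch.rand(1024, device=device) > 0.5

# Boolean-mask indexing: the output shape depends on the mask contents,
# which forces a device-to-host sync every time the mask is applied.
gathered_masked = x[mask]

# Building an index tensor once lets both the gather and the scatter stay
# in index form; the single nonzero() is the only shape-dependent step.
idx = mask.nonzero(as_tuple=False).squeeze(-1)
gathered = x.index_select(0, idx)

out = torch.zeros_like(x)
out.index_copy_(0, idx, gathered)  # scatter rows back without a bool mask
```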
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporarily Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
0b99f5d302 support flashinfer_fp4 moe for 5090 gpu (#26669)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 15:06:47 -04:00
1f491aa0c8 Vectorize RMS norm variance using vectorize_read_with_alignment (#26234)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 11:54:41 -07:00
de92d916fe [NVIDIA] Add support for cudnn fp4 gemm via flashinfer (#26107)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-15 13:53:00 -04:00
a1063628a4 [Chore] Clean up CODEOWNERS (#26923)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-15 10:52:54 -07:00
d796375258 [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2025-10-15 13:06:17 -04:00
14f8456344 [Feature]: Use pydantic validation in observability.py config (#26637)
Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 16:44:03 +00:00
4794c2bd92 Olmo 3 tool parser and tests (#26143)
Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
2025-10-15 16:36:12 +00:00
d3cbaa08dc Lower severity of log when model info cache misses due to exception (#26917)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 09:01:09 -07:00
828523ad8e [Chore] Separate out vllm.utils.async_utils (#26913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 15:33:00 +00:00
136a17fe6e [Chore] Separate out vllm.utils.func (#26904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 13:03:58 +00:00
f57438338d [BugFix] Patch inductor memory plan logic (#26878)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 12:51:45 +00:00
5d598680e3 chore: remove unused marker (#26890)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-10-15 05:40:33 -07:00
8f4b313c37 [Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 12:11:48 +00:00
f93e348010 [Misc] Remove isort and yapf ignores (#26888)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 12:09:03 +00:00
f54f85129e [Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 11:14:41 +00:00
d4d1a6024f [Lora]Load tuned multi-lora kernel configs from json files (#26319)
Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-15 09:45:14 +00:00
db1764e4e0 [Platform] allow platform to init dp group (#22243)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 02:32:17 -07:00
7f83b4ee8e [Easy] Get rid of unnecessary parentheses in kv_cache_manager (#26842)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 09:17:43 +00:00
5c3bae1a6a [Fix] Remove divisibility requirement between num_kv_heads and tp_size in bailing_moe (#26876)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-10-15 16:44:04 +08:00
5210dc3940 [Misc] Update TritonLanguagePlaceholder to have attributes that are used by Flash Linear Attention ops. (#26853)
Co-authored-by: Xudong Ma <mxd@meta.com>
2025-10-15 08:37:49 +00:00
650b51f9f9 [doc] add Context Parallel Deployment doc (#26877)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-15 16:33:52 +08:00
6256697997 [Doc] ruff format remaining Python examples (#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 01:25:49 -07:00
71557a5f7c [CI] Fix mypy for vllm/executor (#26845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 01:23:33 -07:00
f3c378ffa7 [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy Tests in CI (#21810)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
2025-10-15 08:09:56 +00:00
f5ed68ef63 [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache gather (#26456)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-10-15 16:05:01 +08:00
efdef57b1f [bugfix] Lazy import cv2 (#26869)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 07:47:50 +00:00
b8a4572157 [Misc] Use helper function to generate dummy messages in OpenAI MM tests (#26875)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 07:17:37 +00:00
302ef403a2 [DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 00:16:44 -07:00
8865da157b [Bugfix][Multi Modal] Fix incorrect Molmo token processing (#26873)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-15 07:13:59 +00:00
f0862eae43 [Graph Partition] pass tests for decorator (#26831)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-15 06:39:48 +00:00
8c851f6d04 [Bugfix] Fix qwen3-omni audio truncation issue (#26815)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-15 05:38:36 +00:00
7cfa420f49 [BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 05:04:32 +00:00
a27b288e4a [Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-15 04:23:44 +00:00
e471d7ca7e [CI/Build][Bugfix] fix qutlass cmake error when set QUTLASS_SRC_DIR (#26773)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 04:09:44 +00:00
c43ca8259e [Docs] Move build.inc into arm.inc (#26862)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-14 20:35:08 -07:00
85a65e7f51 [Model] Add DeepSeek-V3.1 reasoning parser (split from PR #24972) (#25589)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-15 11:09:52 +08:00
a2986b3e33 [Bugfix] Fixes prefix-repetition benchmark script (#26828)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
2025-10-15 02:54:43 +00:00
96b9aa5aa0 [Frontend][torch.compile] CompilationConfig Overhaul (#20283): rename compilation level to compilation mode, deprecate compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 02:51:16 +00:00
e66d787bce Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 02:35:18 +00:00
bfad142e25 [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 02:33:25 +00:00
9354660036 [Bugfix]fix Qwen3 xml tool parser (#26345)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-10-15 09:50:30 +08:00
07ca70af8d [Core][Easy] Unify environment variable access to use envs.__getattr__ (#26810)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 01:41:18 +00:00
2dcd12d357 [torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
579d2e5458 [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-14 23:51:54 +00:00
0512c04aee [frontend][gptoss] Add per turn stats into Harmony Context (#25061)
Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
2025-10-14 16:48:13 -07:00
7e0ef4084a [CI Failure] Fix torchao dep failure for Quantization Test (#26824)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 16:41:43 -07:00
4aed506b65 [Core] Streamline some structured output related code (#26737)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 23:27:44 +00:00
a86b4c58e8 remove attn output view kernel (#26680)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 22:53:10 +00:00
ff4810ba73 [Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 14:46:37 -07:00
9d6964926e fix: response_format for completion (#23212)
Signed-off-by: Nan2018 <qinnanjoshua@gmail.com>
2025-10-14 21:23:22 +00:00
0e65818910 Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837)
Signed-off-by: Dhruvil Bhatt <bhattdbh@amazon.com>
2025-10-14 14:21:03 -07:00
380f17527c [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 17:03:21 -04:00
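
The commit above caches the result of vllm.envs.__getattr__ so environment variables are not re-parsed on every access. A minimal sketch of the idea using a module-level __getattr__ (PEP 562); the variable names and parsing below are hypothetical, not vLLM's actual envs module:

```python
import functools
import os

# Hypothetical lazy environment-variable accessors.
_ENV_VARS = {
    "MY_MAX_BATCH_SIZE": lambda: int(os.environ.get("MY_MAX_BATCH_SIZE", "256")),
    "MY_ENABLE_FEATURE": lambda: os.environ.get("MY_ENABLE_FEATURE", "0") == "1",
}


@functools.cache
def _cached_lookup(name: str):
    # Parse the environment variable once; later lookups hit the cache.
    return _ENV_VARS[name]()


def __getattr__(name: str):
    # Module-level __getattr__: `import envs; envs.MY_ENABLE_FEATURE`.
    if name in _ENV_VARS:
        return _cached_lookup(name)
    raise AttributeError(name)
```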
b92ab3deda Notice for deprecation of AutoAWQ (#26820)
Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 13:39:59 -07:00
acaa2c0a4a [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 12:58:43 -07:00
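
The commit above reuses empty block lists in KVCacheBlocks instead of allocating fresh ones, reducing garbage-collection pressure. A minimal sketch of the shared-empty-instance pattern involved (the class below is illustrative, not the vLLM data structure):

```python
class BlockList:
    """Illustrative container of KV-cache block ids."""

    _EMPTY: "BlockList | None" = None

    def __init__(self, block_ids: tuple[int, ...] = ()):
        self.block_ids = block_ids

    @classmethod
    def empty(cls) -> "BlockList":
        # Hand out one shared empty instance instead of allocating (and
        # later garbage-collecting) a new object per request with no blocks.
        if cls._EMPTY is None:
            cls._EMPTY = cls()
        return cls._EMPTY


assert BlockList.empty() is BlockList.empty()  # same object, no per-call alloc
```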
82af928c41 [Attention][Spec Decode] FlashMLA spec decode support (#26541)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-14 19:38:20 +00:00
87efc681db llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-14 11:54:12 -07:00
c3a722fcb2 [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 18:38:59 +00:00
aba48f7db1 [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) 2025-10-14 11:20:39 -07:00
04b5f9802d [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 10:52:05 -07:00
efc8f7d814 Update coveragerc and add codecov.yml for path fixes (#26435)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-10-14 09:45:06 -07:00
6d87a2838c [Config] Remove Unused Environment Variable VLLM_DISABLE_PAD_FOR_CUDAGRAPH (#26743)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 11:47:49 -04:00
e6cdbd6792 Revert "[issues template] Encourage the author implement their own ideas" (#26814) 2025-10-14 08:37:34 -07:00
df850c4912 [Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 08:31:43 -07:00
720394de43 [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046)
Signed-off-by: Qier Li <kevin44036@gmail.com>
2025-10-14 14:38:07 +00:00
88a49745af [issues template] Encourage the author implement their own ideas (#26671)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-14 22:32:36 +08:00
ca683a2a72 use combo kernel to fuse qk-norm and qk-rope (#26682)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-14 09:40:59 -04:00
e9f1b8c9e9 Adjusted the model order of the model registration file (#26798)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-10-14 13:26:11 +00:00
ea97940d6c [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864)
Signed-off-by: yuanyongjie.yyj <yuanyongjie.yyj@antgroup.com>
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2025-10-14 13:07:50 +00:00
fdd32750f0 [CI/Build] Cleanup LoRA test (#26752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-14 12:06:35 +00:00
c715ba3735 [Feature] Change vllm.py with pydantic validation (#26726)
Signed-off-by: Vladislav <vladislav.bronzov@gmail.com>
Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 12:00:54 +00:00
9c4cb68339 [Chore] Remove SupportsV0Only interface and update supported models docs (#26783)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 04:55:10 -07:00
780eb03d9b [CI] Fix test_tool_id_kimi_k2 (#26787)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 10:27:07 +00:00
ef9676a1f1 [Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:21:53 -07:00
70b1b330e1 Don't allow typos to fix by default (#26785)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 03:05:15 -07:00
d1d063a588 [Chore] Use max_transformers_version for Qwen-VL test (#26792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:03:46 -07:00
7e6edb1469 [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-14 09:46:05 +00:00
74704d4553 [Model] Use merge_by_field_config for MM models (O-P) (#26776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:42:45 +00:00
d2f816d6ff [Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:36:21 +00:00
577d498212 [Plugin] Make plugin group clear (#26757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 07:49:59 +00:00
fd85c9f426 [Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
2025-10-14 09:17:39 +02:00
d32c611f45 [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-14 07:04:00 +00:00
01ad27faff [Model][Bugfix]fix ernie45 load failed due to ernie45 eplb code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-14 06:55:23 +00:00
481545b397 scheduler.py: Update the name of the default scheduler. (#26758)
Signed-off-by: Ryan Li <ryanli@ryanli.org>
2025-10-14 06:52:21 +00:00
d3cc8427c0 [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-13 23:10:23 -07:00
4821ac1b4d [CI] [ROCm] Automate CC list for ROCm related issue (#26753)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-14 13:57:26 +08:00
4497c8f821 Fix lora tests failure in TPU CI due to the removal of LoRA bias (#26723)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-10-14 13:04:23 +08:00
2e36cdbe2b [Docs] Add a start tag to build.inc.md (#26747)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-13 21:51:55 -07:00
fe3edb4cf0 Add support for the /rerank endpoint in vllm bench serve (#26602)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-14 04:25:43 +00:00
29350922c6 [Feature][Quantization] auto_round format add support for regex (#24024)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 03:03:16 +00:00
8ae169286f [torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-14 02:22:16 +00:00
8a0af6a561 [build][torch.compile] upgrade depyf version (#26702)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-14 10:12:09 +08:00
cfded80793 [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 01:46:44 +00:00
b59dd19b55 [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
3e051bda82 [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-13 18:12:52 -07:00
8317f72354 [Misc][DP] support customized aggregated logger for dp (#24354)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-10-13 17:45:59 -07:00
d8bebb008a Add tests for chunked prefill and prefix cache with causal pooling models (#26526)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Ayush Singh <ayush1009208@gmail.com>
2025-10-14 07:45:04 +08:00
35bc22f23c [ResponseAPI] Further polish message serialization and unit tests (#26728)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 23:31:35 +00:00
fa96fb9c70 Pruning kernel Core Tests (#26727)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
2025-10-13 23:08:18 +00:00
e3fdb627d9 [FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 22:47:16 +00:00
7200a21cd1 [Bug] Fix Assertion error DeepEP/csrc/kernels/intranode.cu:928: 'false and Unsupported type' (#26532)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-13 18:26:37 -04:00
577c72a227 [CI Perf]Prune Tests in kernel/mamba (#26538)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-13 18:22:31 -04:00
314285d4f2 [CI] Fix mypy for vllm/distributed (#26593)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 16:02:24 -04:00
d2a7938582 [Frontend][1/N] Improve all pooling task | Support FP16 Embedding Base64 (Still uses fp32 by default). (#26414)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-13 19:06:43 +00:00
89342ce4c0 [Quantization] [Performance] Enable Marlin GEMM kernels for the calibration-free RTN-based quantization (#26051)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Signed-off-by: Alex Kogan <82225080+sakogan@users.noreply.github.com>
2025-10-13 18:52:54 +00:00
f89f599395 [CI][Release][Arm64]: Build arm64 release for gpu arch 8.9 (#26698) 2025-10-13 18:42:12 +00:00
e251e457c5 [Log] Optimize Startup Log (#26601)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 02:06:57 +08:00
afc47e4de7 [Model] Use merge_by_field_config for MM models (M-N) (#26710)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 01:27:01 +08:00
e3b90c1ba2 [Bugfix][Speculative Decoding] Extend Eagle quantization config fix to llama_eagle.py (#26590)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-13 17:17:13 +00:00
134f70b3ed [Bugfix][Rocm] fix qr error when different inp shape (#25892)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-13 10:04:21 -07:00
a1b2d658ee [CI/Build] upgrade compressed-tensors to 0.12.2 to address LGPLv3 (#26501)
Signed-off-by: Sangyeon Cho <josang1204@gmail.com>
2025-10-13 12:58:33 -04:00
5c7fe25491 [Misc] Separate prompt logging to debug (#26713)
Signed-off-by: Aleksei Tsvetkov <aitsvet@ya.ru>
2025-10-13 09:04:18 -07:00
53c9a7cee2 [P/D] [NixlConnector] kv load recovery integration (#26171)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-10-13 08:48:04 -07:00
0d21b9b51e [UX] Speedup DeepGEMM warmup with heuristics (#25619)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-13 07:59:27 -07:00
10214b6935 [FEATURE]: Use pydantic validation in multimodal.py config (#26629)
Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 07:56:59 -07:00
4a61950f4d [Hardware][CPU] Disable torch.compile for RISC-V to prevent APIError (#26693)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-13 07:56:01 -07:00
3263799056 [unrevert] Add batch invariant kernel override for FlashInfer backend [2/n] (#26373)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
2025-10-13 10:24:53 -04:00
8e67b2557a [Bugfix] Fix out of bound index issue for Jina-embedding-v3 RoPE with cuda graph (#26687)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-13 03:21:48 -07:00
4073c82c4e [ResponseAPI] Simplify input/output message serialization (#26620)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 09:59:15 +00:00
767c3ab869 [Model][0/N] Improve all pooling task | clean up (#25817)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-13 16:44:50 +08:00
4f207c7174 Ignore large reformatting PRs in git blame (#26690)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 01:20:47 -07:00
782505ed8e [Model] Add reasoning_parser and tool_parser for Ernie45 thinking (#25027)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-13 15:55:20 +08:00
98f30b8cba [Model] Fix Skywork R1V mlp (#26673)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-12 22:42:17 -07:00
3cd36660f7 docs: wrong command in structured_outputs README (#26677)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-10-12 20:59:01 -07:00
46ad73955a [FIX] Throw an exception when the model does not support pooling tasks (#25840) (#25855)
Signed-off-by: zxw <1020938856@qq.com>
Co-authored-by: wang.yuqi <noooop@126.com>
2025-10-12 20:56:21 -07:00
41f3884438 [Bugfix][Core]Fix block table out-of-range issue in priority scheduling (#26661)
Signed-off-by: quanliu <18646313696@163.com>
2025-10-13 01:25:42 +00:00
60e419c1ee [Misc] cache result of disable_inplace (#26666)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-13 00:17:50 +00:00
7ef6052804 [CI/Build] Add tool to build vllm-tpu wheel (#19165)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-12 16:25:40 -06:00
4fca1a1bd2 [easy] fix pre commit error on trunk (#26665)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-12 21:25:34 +00:00
a6049be73c [Models][Qwen3VL] Speedup fast_pos_embed_interpolate (#26647)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-13 01:20:07 +08:00
18ed7746ea [Feature] Add support for naver/splade-v3 (BERT-based sparse embedding model) (#26339)
Signed-off-by: gjgjos <gjgjos@naver.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-12 17:00:52 +00:00
8fcaaf6a16 Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-12 09:51:31 -07:00
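
The commit above sweeps the codebase to PEP 604 union syntax. The shape of the change, in brief:

```python
# Before: typing-module constructs.
from typing import Optional, Union


def lookup_old(key: str, default: Optional[int] = None) -> Union[int, str]: ...


# After: PEP 604 syntax, valid at runtime on Python 3.10+.
def lookup_new(key: str, default: int | None = None) -> int | str: ...
```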
9bb38130cb [Bugfix] Fix GPU_ID issue in test script (#26442)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-12 11:39:05 +00:00
b91d8db873 [Bugfix][DCP] Set default CUDAGraphMode to PIECEWISE for DCP (#26574)
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
2025-10-12 09:58:38 +00:00
045b396d09 [Bugfix][CI/Build] Fix failing Mteb CI (#26638)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-12 02:42:42 -07:00
76852017ea [MISC] Rename the torch profiler filename as instance_id+rank_id for merging the Profiler results of each Rank (#25867)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-12 09:29:08 +00:00
82e64c7a20 [PERF] [Qwen3-next] Speed up gated RMSNorm (#26207)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
Signed-off-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-12 08:27:50 +00:00
4ca204055e Add @noooop to codeowner for pooling models (#26652)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-12 14:04:44 +08:00
c5c8f5ea59 [EPLB] Support ernie4.5-moe (#22100)
Signed-off-by: Haisheng Chen <langzs335@outlook.com>
Signed-off-by: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com>
Signed-off-by: Haisheng Chen <hac048@ucsd.edu>
Co-authored-by: Haisheng Chen <langzs335@outlook.com>
2025-10-12 10:40:47 +08:00
01653a917b [compile] Fix inductor partition config (#26645)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-11 21:03:14 +00:00
0cd103e7cb CP: make correct_attn_out robust to 4‑D views and fix Triton arg binding (#26509)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-11 20:50:57 +00:00
5be7ca1b99 [Benchmark] Support Infinity API (#26641)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-12 01:45:32 +08:00
f0a30a067b [Bugfix] Fix qwen-moe packed_modules_mapping (#26634)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-11 15:21:33 +00:00
9d6cff3ede [Bugfix][Qwen3VL] fix deepstack in qwen3vl (#26626)
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
2025-10-11 05:58:33 -07:00
a25f2adee9 [compile] Add patched_fused_scaled_matmul_reduce_scatter (#26604)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-11 05:44:43 -07:00
d0bed837ac [Refactor]Reduce duplicate code in serving_chat (#26627)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 12:04:49 +00:00
f7ee69868a [CPU] fix the JSON decode error caused when the node is '-'. (#26562)
Signed-off-by: muzian666 <andylee_2001@163.com>
Co-authored-by: qingan.li <qingan.li@wizpresso.com>
2025-10-11 12:04:04 +00:00
d2a71530c1 Add EAGLE-3 Speculative Decoding Support for Qwen3 MoE (#26485)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-11 10:14:41 +00:00
086609de64 fix(nix): Allow local oneDNN path to fix vLLM CPU build failure (#26401)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
2025-10-11 09:12:16 +00:00
727144bed1 [Refactor]: Use M-RoPE interface directly while defining model class instead of maintaining model specific M-RoPE implementation in mrope.py (#24172)
Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com>
Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: wwl2755 <wangwenlong2755@gmail.com>
2025-10-11 07:21:04 +00:00
55392bc879 [Bugfix][Multi Modal] Fix incorrect Molmo image processing (#26563)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-10 22:28:23 -07:00
ddaff2938e [MM] Move Qwen3Omni MRoPE impl to model file (#26608)
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-10 22:17:24 -07:00
27ed39a347 [XPU] Upgrade NIXL to remove CUDA dependency (#26570)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2025-10-11 05:15:23 +00:00
8f8474fbe3 [CI/Build] Fix ppc64le CPU build and tests (#22443)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-10-11 13:04:42 +08:00
be067861c6 [Frontend] Improve the performance of is_reasoning_end (#25735)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 10:43:39 +08:00
5bc26c438d [BugFix] Make penalties and bad_words work with async scheduling (#26467)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 23:27:04 +00:00
eef921f45e AOT Compilation for torch.compile (Bundled) (#24274)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2025-10-10 19:02:11 -04:00
e317414ce1 Cache the environment variable check for batch invariance (#26510)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-10 22:47:34 +00:00
949cb0170d [BugFix] Fix async scheduling + request preemption (#26385)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 20:29:57 +00:00
e94cfd51da [BUG] Qwen3-next MTP. Fix attn metadata build bug (#26564)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-10 14:59:03 -04:00
7c12763b24 Fix some typing issues found by mypy==1.18.2 (#26596)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-10 18:21:25 +00:00
3b780a4bbb Update CUDA architecture list in build pipeline for 12.9.1 wheels (#26592)
Signed-off-by: Will Eaton <wseaton@users.noreply.github.com>
2025-10-10 11:15:27 -07:00
30f78af147 Update pre-commit hook versions (#26591)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-10 17:03:44 +00:00
19a9b169bf Add Qwen3-Omni moe thinker (#25550)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Xiong Wang <feizi.wx@alibaba-inc.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-10 17:00:56 +00:00
96ad65b7fe [Transform] [Quantization] Add QuTLASS support to vLLM (#24440)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Signed-off-by: Andrei Panferov <andrei@panferov.org>
Co-authored-by: Andrei Panferov <andrei@panferov.org>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-10 09:43:40 -07:00
8d2b8c0ff2 [Model] Add FlexOlmo model implementation (#24923)
Signed-off-by: Shane A <shanea@allenai.org>
2025-10-10 09:43:15 -07:00
b2155ed317 [Model][Qwen3VL] Compute cu_seqlens on CPU to remove (#26496)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-10 09:42:17 -07:00
910abdbd08 [Bugfix] fixed top_logprobs: -1 does not appear to work as intended (#26470)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-11 00:41:17 +08:00
cddce79fda [torch.compile] Make inductor partition rules respect splitting_ops #25691 (#25845)
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-10 16:35:28 +00:00
e519281920 [Metrics] Add test for multi-modal cache stats logging (#26588)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-10 16:00:50 +00:00
7b03584de8 Silu v2 (#25074)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: elvircrn <elvircrn@gmail.com>
Signed-off-by: Elvir Crnčević <elvircrn@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
2025-10-10 15:19:53 +00:00
ae9d0e7da5 [Bugfix] Make DP padding optional in coordinate_batch_across_dp (#26375)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-10 10:53:33 -04:00
0e67102d93 Added test_top_k_per_row to test-pipeline.yaml. (#26569)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-10 10:48:33 -04:00
f4ba2061cf [BugFix][torch.compile] Fix fused_scaled_matmul_reduce_scatter signature for PyTorch 2.8 (#26038)
Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
Signed-off-by: <>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-10 07:42:13 -07:00
1e6848a65d [CI] fix test_run_batch.py::test_completions - AssertionError (#26578)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-10 22:16:28 +08:00
67661375fa [BugFix] Fix noop elimination edge case (#26394)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-10-10 13:33:04 +00:00
213b64452a [Bugfix] Convert untraceable GroupShape to list for AMD impl (#26535)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2025-10-10 13:32:29 +00:00
784c231151 [NIXL] Ignore abort on already-finished request (#25067)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-10 12:21:56 +02:00
606b00e80f [bugfix][DCP] fix block_size of hash in DCP prefix caching (#26296)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-10 03:02:49 -07:00
720d3cd0f0 [CI] fix ruff format (#26579)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-10 03:02:12 -07:00
ab196edefb Remove LoRA bias support (#25807)
Signed-off-by: Ashwin Phadke <ashwinphadke12@rediffmail.com>
Signed-off-by: Ashwin Phadke <23502062+ashwin-phadke@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-10 09:50:33 +00:00
3ee202ea1e [GPT-OSS] Add support for arrays at tool message content (#25593)
Signed-off-by: Luis Tomas Bolivar <ltomasbo@redhat.com>
2025-10-10 09:00:45 +00:00
ad430a67ca [Metrics] Log multi-modal cache stats and fix reset (#26285)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-10 01:45:55 -07:00
6f0f570c43 [deepseek] kernel block size for UniformTypeKVCacheSpecs (#26559)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-10 16:40:41 +08:00
b545a0b207 fix test_simple_inductor_graph_partition (#26522)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-10 06:39:19 +00:00
29255cfc3b [Spec-Decode] Support piecewise cudagraphs for Eagle head (#25109)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-10-10 01:20:31 -04:00
da4455609d [Chore]: One pythonic tool parser test uses the wrong parser (#26515)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-10-10 04:03:55 +00:00
aafb99a4d4 [Core] Small simplification in GPUModelRunner._update_states() (#26508)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-10 10:53:58 +08:00
757fa4a4da [DP][ray] Support different VLLM_RAY_DP_PACK_STRATEGY (#23849)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-09 19:53:43 -07:00
c6187f55f7 Refactor MistralTokenizer (#26358)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2025-10-09 22:48:58 +00:00
8983e0216f [CI] Fix Pre-commit Issue Cannot determine type of "rank" and "world_size" (#26448)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-09 15:16:48 -07:00
1ee35382cb [Bug] Fix modular_kernel: ZeroDivisionError: integer division or modulo by zero (#26528)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-09 15:13:27 -07:00
6e783bc54b [Bugfix] Fix CUDA graph selection bug in FlashInfer at high concurrency (#26499)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-09 17:12:34 -04:00
c9d33c60dc [UX] Add FlashInfer as default CUDA dependency (#26443)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-09 14:10:02 -07:00
2e54db4d2b [Core] Remove unused prev_sampled_token_ids_invalid_indices input batch field (#26514)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-09 20:22:14 +00:00
44f633dba1 [Flashinfer][gpt-oss] Support FP8-qkv Flashinfer TRTLLM Sinks Attention (#25674)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-09 16:13:39 -04:00
a462331e36 [Bugfix] Disable moe inplace for torch >= 2.9 (#26497)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-09 18:07:38 +00:00
4069db3f2e [Bugfix] Enable padded FP4 quantization (#25947)
Signed-off-by: Roi Koren <roik@nvidia.com>
2025-10-09 10:59:41 -07:00
0d37450eb7 [BUGFIX] Add cu_tokens_across_sp to DPMetadata (#26457)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-09 17:13:56 +00:00
47e66c24e2 [Model] Apply shared experts overlap optimization to all models with shared experts (#26145)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-09 11:31:04 -04:00
3b736e1c38 [Attention][DCP] Support DCP with query length > 1 (MTP) with FA3 (#25049)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-10-09 08:06:29 -07:00
2c1c7dfb35 [Models][Qwen] Replace pad with cat for better performance (#26486)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-09 14:51:26 +00:00
e246ad6f0c Upgrade Pydantic to v2.12.0 and remove hack for Python 3.13 (#26481)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 06:02:40 -07:00
5728da11ea Revert #26113 "[Frontend] CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops" (#26472)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-09 05:43:55 -07:00
92be3f3517 [Feature] Use pydantic validation in parallel.py config (#26417)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 12:41:31 +00:00
d1ddf340c8 [V0 deprecation] Remove QKVCrossParallelLinear implementation (#26475)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-09 10:52:27 +00:00
ec10fd0abc [Bugfix] Move current_platform import to avoid python import cache. (#16601)
Signed-off-by: iwzbi <wzbi@zju.edu.cn>
2025-10-09 10:46:19 +00:00
0426e3c5e1 [Models][Qwen3VL] Optimise _validate_and_reshape_mm_tensor (#26426)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-09 10:25:48 +00:00
4bdf7ac593 [Bugfix] Fix SHM cache initialization (#26427)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-09 02:48:04 -07:00
dc7976dd9f [Misc] Upgrade more code to Python 3.10 (#26463)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-09 10:43:53 +01:00
e4791438ed [Feature] Use pydantic validation in lora.py and load.py configs (#26413)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
2025-10-09 02:38:33 -07:00
e6e898f95d [doc] add Volcengine as a compute sponsor (#26477)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-09 17:11:47 +08:00
ddcbc2f334 [Misc] Misc code simplifications (#26450)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-09 02:10:06 -07:00
a83ff278d6 [torchao] Add support for ModuleFqnToConfig using regex (#26001)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-10-09 08:32:32 +00:00
cf4cd6c24f Add: Support for multiple hidden layers in Eagle3 (#26164)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-09 07:30:50 +00:00
b960441812 Enable RMSNorm substitution for Transformers backend (#26353)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-09 07:28:51 +00:00
1317028aa8 [Model] Gemma3: Fix GGUF loading and quantization (#26189)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-09 07:00:53 +00:00
5e49c3e777 Bump Flashinfer to v0.4.0 (#26326)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-08 23:58:44 -07:00
0d7c3cb51d Update Dockerfile and install runai-model-streamer[gcs] package (#26464)
Signed-off-by: Peter Schuurman <psch@google.com>
2025-10-08 23:48:51 -07:00
1b2c440cd6 [Core] Relax the LoRA max rank (#26461)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-08 23:47:14 -07:00
0f29dca988 [CI/Build] Fix model nightly tests (#26466)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-08 23:44:16 -07:00
d24cf322e1 [Hybrid]: Decouple Kernel Block Size from KV Page Size (#24486)
Signed-off-by: lizhiyuan <uniartisan2017@gmail.com>
Signed-off-by: Zhiyuan Li <uniartisan2017@gmail.com>
2025-10-08 23:43:39 -07:00
d17f0fbf30 [Core][KVConnector] Propagate all tokens on resumed preemptions (#24926)
Signed-off-by: Qier Li <kevin44036@gmail.com>
Co-authored-by: Qier Li <qier@fb.com>
2025-10-09 14:43:31 +08:00
43ab8cfaa5 [MM][Doc] Add documentation for configurable mm profiling (#26200)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
2025-10-08 23:21:20 -07:00
de253d63b7 [Hardware][AMD] Enable FlexAttention backend on ROCm (#26439)
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
2025-10-09 06:20:18 +00:00
8bd696fa53 [Bugfix] Incorrect another MM data format in vllm bench throughput (#26462)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-09 05:58:46 +00:00
bb6d8c21f9 [Bugfix] Catch and log invalid token ids in detokenizer #2 (#26445)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-08 21:20:25 -07:00
ebf6ef1a9b [Minor] Change warning->warning_once in preprocess (#26455)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-08 21:09:06 -07:00
0c52d6ef81 [Bugfix] Set the minimum python version for gpt-oss (#26392)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-08 20:35:49 -07:00
467a4f98f1 [Misc] Redact ray runtime env before logging (#26302)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-08 17:43:34 -07:00
e614ab7806 Separate MLAAttention class from Attention (#25103)
Signed-off-by: Naveenraj Kamalakannan <therealnaveenkamal@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-08 17:11:11 -07:00
2a03f93de9 [Attention] Register FLASHMLA_SPARSE (#26441)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-08 22:28:52 +00:00
da364615fc [Kernels] Modular kernel refactor (#24812)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-08 17:51:52 -04:00
f08919b7d1 [Bugfix] Respect min_tokens in scheduler stop check (#26317)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-10-08 14:08:24 -07:00
93f2c0aa08 [Models] Improve iteration over layers (#26425)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 20:48:33 +00:00
4ebc9108a7 [Kernel] Centralize platform kernel import in current_platform.import_kernels (#26286)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-08 20:25:31 +00:00
e1ba235668 [BugFix] Fix failing test quantization/test_compressed_tensors.py::test_compressed_tensors_fp8_block_enabled (#26436)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
2025-10-08 20:04:12 +00:00
b82f4307c9 [Bugfix][Flashinfer] fix VLLM_USE_TRTLLM_ATTENTION issue for models with diff hyperparameters (#25924)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-10-08 19:54:48 +00:00
76879cc160 [Attention] Implement universal BACKEND_MAP (#25900)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-08 12:00:25 -07:00
b25d7b5657 [Feature] Change cache.py with pydantic validation (#26390)
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 11:12:59 -07:00
e09d1753ec Remove Python 3.9 support ahead of PyTorch 2.9 in v0.11.1 (#26416)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 10:40:42 -07:00
4ba8875749 [Bug] Fix Test in Batch Invariant (#26128)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-08 10:13:47 -07:00
6273fe8d3d [Benchmarks] Fix imports in FP8 tuning script (#26407)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 16:31:59 +00:00
9fb3ae4e6f [Bug] Fix DeepGEMM Attention Test (#26423)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-08 12:23:41 -04:00
76afe4edf8 [Bugfix] Fix vllm bench ... on CPU-only head nodes (#25283)
Signed-off-by: Aydin Abiar <aydin@anyscale.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Aydin Abiar <aydin@anyscale.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-08 16:06:42 +00:00
c1b06fc182 [CI Failure] Fix pre-commit issue for install_nixl_from_source_ubuntu.py (#26424)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-08 07:55:43 -07:00
241b4cfe66 [Refactor] Refactor FP8 & INT8 Quant Folder inside w8a8 (#25293)
Signed-off-by: nicole-lihui <nicole.li@daocloud.io>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: courage17340 <courage17340@163.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Jacob Kahn <jacobkahn1@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: zxw <1020938856@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: chenlang <chen.lang5@zte.com.cn>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Jonas Kuebler <kuebj@amazon.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: AlonKejzman <alonkeizman@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
Signed-off-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: zixi-qi <qizixi@meta.com>
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Naman Lalit <nl2688@nyu.edu>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Junhong <liujunhong11@huawei.com>
Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
Signed-off-by: Patrick Toulme <ptoulme@meta.com>
Signed-off-by: Patrick Toulme <pctoulme+1@gmail.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
Signed-off-by: Jialin Ouyang <jialino@meta.com>
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: Juechen Liu <jueliu@meta.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: yingjun-mou <renzomou@gmail.com>
Signed-off-by: zhoukz <me@zhoukz.com>
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Signed-off-by: Lee Nau <lnau@nvidia.com>
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Lucia Fang <fanglu@meta.com>
Signed-off-by: a120092009 <zhaoty0121@gmail.com>
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Signed-off-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
Signed-off-by: Lehua Ding <lehuading@tencent.com>
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
Signed-off-by: anion <1005128408@qq.com>
Signed-off-by: Anion <123177548+Anionex@users.noreply.github.com>
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Salvatore Cena <cena@cenas.it>
Signed-off-by: padg9912 <phone.and.desktop@gmail.com>
Signed-off-by: nadathurv <work.vnadathur@gmail.com>
Signed-off-by: WorldExplored <srreyansh.sethi@gmail.com>
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: billishyahao <bill.he@amd.com>
Signed-off-by: Nathan Scott <nathans@redhat.com>
Signed-off-by: Kenichi Maehashi <maehashi@preferred.jp>
Signed-off-by: Johnny <johnnynuca14@gmail.com>
Signed-off-by: johnnynunez <johnnynuca14@gmail.com>
Signed-off-by: Johnny <johnnync13@gmail.com>
Signed-off-by: Huamin Li <3ericli@gmail.com>
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
Signed-off-by: Peter Schuurman <psch@google.com>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: ElizaWszola <elizaw.9289@gmail.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: huijjj <huijong.jeong@squeezebits.com>
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
Signed-off-by: kyt <eluban4532@gmail.com>
Signed-off-by: Egor <e.a.krivov@gmail.com>
Signed-off-by: Yang <lymailforjob@gmail.com>
Signed-off-by: Paul Pak <paulpak58@gmail.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: Xiang Si <sixiang@google.com>
Signed-off-by: Aleksandr Samarin <astrlrd@nebius.com>
Signed-off-by: Jun Jiang <jasl9187@hotmail.com>
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Signed-off-by: Nikhil Ghosh <nikhil@anyscale.com>
Co-authored-by: Nicole LiHui 🥜 <nicolelihui@outlook.com>
Co-authored-by: courage17340 <courage17340@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Jacob Kahn <jacobkahn1@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Nicole LiHui 🥜 <nicole.li@daocloud.io>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Co-authored-by: Agata Dobrzyniewicz <160237065+adobrzyn@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: yyzxw <34639446+yyzxw@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: chenlang <chen.lang5@zte.com.cn>
Co-authored-by: chenlang <10346245@zte.com.cn>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Jonas M. Kübler <44084297+jmkuebler@users.noreply.github.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: AlonKejzman <alonkeizman@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Co-authored-by: yitingdc <59356937+yitingdc@users.noreply.github.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: xaguilar-amd <xavier.aguilarfruto@amd.com>
Co-authored-by: Iceber Gu <caiwei95@hotmail.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Xu Wenqing <121550081+Xu-Wenqing@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: RishiAstra <40644327+RishiAstra@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: 阿丹(adan) <47373076+LDLINGLINGLING@users.noreply.github.com>
Co-authored-by: liudan <adan@minicpm.com>
Co-authored-by: liudan <liudan@qq.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: Clouddude <kouss.hd@gmail.com>
Co-authored-by: Frank Wang <41319051+frankwang28@users.noreply.github.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: qizixi <22851944+zixi-qi@users.noreply.github.com>
Co-authored-by: Bram Wasti <bwasti@fb.com>
Co-authored-by: Naman Lalit <nl2688@nyu.edu>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: Junhong <liujunhong11@huawei.com>
Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Xiaohan Zou <renovamenzxh@gmail.com>
Co-authored-by: rentianyue-jk <rentianyue-jk@360shuke.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Peter Pan <peter.pan@daocloud.io>
Co-authored-by: Patrick C. Toulme <135739773+patrick-toulme@users.noreply.github.com>
Co-authored-by: Clayton Coleman <smarterclayton@gmail.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Jialin Ouyang <jialino@meta.com>
Co-authored-by: weiliang <weiliangl@nvidia.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
Co-authored-by: Juechen Liu <grinchcoder@gmail.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yingjun Mou <renzomou@gmail.com>
Co-authored-by: Zhou Jiahao <me@zhoukz.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Lee Nau <lee.nau@gmail.com>
Co-authored-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: acisseJZhong <40467976+acisseJZhong@users.noreply.github.com>
Co-authored-by: Simon Danielsson <70206058+simondanielsson@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucia Fang <fanglu@meta.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Xiaozhu Meng <mxz297@gmail.com>
Co-authored-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Co-authored-by: a120092009 <33205509+a120092009@users.noreply.github.com>
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
Co-authored-by: Lehua Ding <lehuading@tencent.com>
Co-authored-by: Reza Barazesh <3146276+rzabarazesh@users.noreply.github.com>
Co-authored-by: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: Anion <123177548+Anionex@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: cjackal <44624812+cjackal@users.noreply.github.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Andrew Xia <axia@mit.edu>
Co-authored-by: Andrew Xia <axia@fb.com>
Co-authored-by: Salvatore Cena <cena@cenas.it>
Co-authored-by: Param <psch@cs.unc.edu>
Co-authored-by: Zhewen Li <zhewenli@meta.com>
Co-authored-by: nadathurv <work.vnadathur@gmail.com>
Co-authored-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com>
Co-authored-by: Wenlong Wang <wangwenlong2755@gmail.com>
Co-authored-by: billishyahao <bill.he@amd.com>
Co-authored-by: Nathan Scott <natoscott@users.noreply.github.com>
Co-authored-by: Kenichi Maehashi <939877+kmaehashi@users.noreply.github.com>
Co-authored-by: Johnny <johnnync13@gmail.com>
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Co-authored-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Hosang <156028780+hyoon1@users.noreply.github.com>
Co-authored-by: Jerry Zhang <jerryzh168@gmail.com>
Co-authored-by: pwschuurman <psch@google.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: ElizaWszola <ewszola@redhat.com>
Co-authored-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: Andrew Xia <axia@meta.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: ahao-anyscale <ahao@anyscale.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Liu-congo <1502632128@qq.com>
Co-authored-by: HUIJONG JEONG <64083281+huijjj@users.noreply.github.com>
Co-authored-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
Co-authored-by: kyt <eluban4532@gmail.com>
Co-authored-by: Egor <e.a.krivov@gmail.com>
Co-authored-by: Yang Liu <127183760+KKSK-DON@users.noreply.github.com>
Co-authored-by: Paul Pak <52512091+paulpak58@users.noreply.github.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: Xiang Si <sixiang@google.com>
Co-authored-by: Aleksandr Samarin <samarin_ad@mail.ru>
Co-authored-by: Jun Jiang <jasl9187@hotmail.com>
Co-authored-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nikhil G <nrghosh@users.noreply.github.com>
2025-10-08 10:20:48 -04:00
9fc983c707 [NIXL][non-cuda] Add install script for nixl with non-cuda ucx (#25959)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
2025-10-08 14:19:53 +00:00
2f99f2f506 Tidy vllm/config/__init__.py to only add classes and functions (#26405)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 07:10:00 -07:00
338b1bf04f [Benchmarks] Add support for Qwen 3 VL MoE tuning (#26419)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-08 14:01:08 +00:00
e39dc46f8f [CI] Pooling models mteb test disable enforce_eager (#26408)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-08 12:15:36 +00:00
10c75b5439 [Docs] Have mergify leave a comment with the docs preview link (#26412)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-08 12:04:00 +00:00
f9582fd8f4 [Model] Allow passing custom number of max tiles to Nano 2 VL (#26403)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
2025-10-08 11:19:39 +00:00
f377333bd7 [Misc] add usedforsecurity=False in md5 hash call (#26357)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-10-08 10:18:32 +00:00
f8607863d8 [Feature] Enable E8M0 by Default on Hopper for DeepGEMM, 5% E2E throughput improvement (#26197)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-08 15:33:56 +08:00
335b28f7d1 [TPU] Rename tpu_commons to tpu_inference (#26279)
Signed-off-by: Utkarsh Sharma <utksharma@google.com>
Co-authored-by: Utkarsh Sharma <utksharma@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-10-07 23:30:52 -07:00
5e65d6b2ad fix[DP][v1]: Prevent hangs from mismatched worker configurations (#26218)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
2025-10-07 22:55:08 -07:00
0d4f48fa10 [Bugfix] Incorrect MM data format in vllm bench throughput (#26395)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-08 13:52:19 +08:00
127c8b782a Add gather_indexer_k_quant_cache kernel (#25931)
Signed-off-by: Barry Kang <43644113+Barry-Delaney@users.noreply.github.com>
Signed-off-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-10-08 04:58:57 +00:00
cd9890544b fix(v1/kv_cache): resolve async KV transfer bug in cascade attention (#23485)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
2025-10-08 04:46:33 +00:00
067da2d1df [Core] Simplify setting new_token_ids in CachedRequestData (#26388)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-08 03:32:37 +00:00
046118b938 Add SwigluOAI implementation for CPUFusedMOE (#26347)
Signed-off-by: Sharif Inamdar <sharif.inamdar@arm.com>
2025-10-07 20:17:49 -06:00
b32260ab85 [torchao] safetensors integration (#25969)
Signed-off-by: Angel Li <liangel@meta.com>
2025-10-07 20:12:35 -06:00
f80e7866c0 [Misc] Clean up cruft from previous FlashMLA sparse implementation (#26125)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-08 10:09:34 +08:00
31a4b3e6c4 Revert #24446 and #26168 (#26332)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-10-07 16:38:19 -06:00
caf8b1c084 [Bugfix] Fix MTP+FlashInfer crash when trtllm kernels are available but disabled (#26361)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-07 22:12:26 +00:00
1b86bd8e18 Add more libraries to rlhf.md (#26374)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-10-07 20:59:41 +00:00
59012df99b [TPU] update TPU benchmark threshold (#25713)
Signed-off-by: Johnny Yang <johnnyyang@google.com>
2025-10-07 13:53:09 -07:00
3d1f67616d [Spec Decode] Enable efficient speculative decoding with FlashInfer-MLA (#25984)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-07 16:05:59 -04:00
6ebaf43ee4 [V1] Logit processors for rejection sampler (#19482)
Signed-off-by: southfreebird <yvorott@gmail.com>
Signed-off-by: Sergei Skvortsov <sergeyskv@nebius.com>
Signed-off-by: Sergei Skvortsov <yvorott@gmail.com>
Co-authored-by: Sergei Skvortsov <sergeyskv@nebius.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-10-07 13:02:49 -07:00
0c824fc46f [Frontend] CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26113)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-07 12:53:43 -07:00
eb577e4655 [Bugfix] Add missing sink tensor into flash attn cascade attn implementation (#26325) 2025-10-07 18:56:39 +00:00
8f36850f73 [Bug] Fix Shape Validation for Fallback while Enabling E8M0 for DeepGEMM (#26322)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-07 13:50:30 -04:00
29fd2662ba [deepseek] add EP8 FusedMOE config for H200 and B200 (#26331)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-07 10:38:54 -07:00
30a3e5af69 [CI] Add Qwen3 MoE NVFP4 to Blackwell lm-eval (#26316)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-07 10:36:15 -07:00
a38c1bfe09 [ci] Rename test_mxfp4_moe.py to test_ocp_mx_moe.py (#26364)
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
2025-10-07 09:52:24 -07:00
320feae6f5 [Model] Lfm2Moe (#26344)
Signed-off-by: Paul Pak <paulpak58@gmail.com>
2025-10-07 16:03:05 +00:00
1e4ecca1d0 [V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 15:42:31 +00:00
c0a7b89d8e [Misc] Move LRUCache into its own file (#26342)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 15:08:40 +00:00
6f59beaf0b [Model] Add support for ModernBertForTokenClassification (#26340)
Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr>
Signed-off-by: antrec <antoine.recanati@gmail.com>
Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-07 14:29:19 +00:00
41f1cf38f2 [Feature][OCP MX] Support mxfp6 and mixed mxfp6-mxfp4 (#21166) 2025-10-07 09:35:26 -04:00
08d26a1b7e [Model] Use merge_by_field_config for MM models (Ovis family) (#26308)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-07 12:54:22 +00:00
63773a6200 [Docs] add docs for cuda graph v1 (#24374)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-07 05:25:05 -07:00
883b42896a Add TRL example notebook to RLHF docs (#26346)
Signed-off-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
2025-10-07 11:31:28 +00:00
e1098ced95 Add topk logits torch op for DS3.2. (#25945)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Signed-off-by: Daniel Cámpora <961215+dcampora@users.noreply.github.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-10-07 10:07:32 +00:00
d100d78eb3 Optimize KV cache distribution for asymmetric pipeline parallelism (#25164)
Signed-off-by: gholmes829 <g.holmes429@gmail.com>
2025-10-07 09:20:30 +00:00
7e4cd070b0 [V0 Deprecation] Remove VLLM_USE_V1 from docs and scripts (#26336)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 16:46:44 +08:00
46b0779996 [BugFix] Update KV block hash type from BlockHash to ExternalBlockHash in kv_events_subscriber - #26264 (#26265)
Signed-off-by: atalhens <sneh.lata@nutanix.com>
2025-10-07 08:42:28 +00:00
de342585ff [Model] Define merge_by_field_config MM interface (R-T) (#26260)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 16:10:55 +08:00
185d8ed44f [responsesAPI][bugfix] serialize harmony messages (#26185)
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-07 07:07:53 +00:00
d9836d4517 [Deprecation] Deprecate LLM.set_tokenizer (#26333)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 06:50:57 +00:00
5f7e8a916a [Model] Define merge_by_field_config MM interface (U-Z) (#26261)
Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 06:45:49 +00:00
4dbdf4a294 [BUG] Fix file parsing for load_format runai_streamer_sharded (#26324)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-07 11:23:07 +08:00
c6873c4e6d [UX] Support nested dicts in hf_overrides (#25727)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-07 11:19:16 +08:00
2111b4643c [Core] Simplify the Dp padding/should ubatch coordination logic (#25768)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-07 01:57:49 +00:00
c50901f3b9 [Docs][DBO] Add initial doc that describes the DBO implementation (#26024)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-10-07 00:47:28 +00:00
8229280a9c [Misc] Define EP kernel arch list in Dockerfile (#25635)
Signed-off-by: Simon Mo <simon.mo@hey.com>
2025-10-07 00:05:33 +00:00
f77df94647 [Perf] Add decode full-graph support to FlashInfer-MLA backend (#26313)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 23:03:49 +00:00
f231e5bc21 [ROCm] Split AITER unified attention into its own backend (#25507)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-10-06 22:49:23 +00:00
2161efe978 [Bugfix] Allow skipping MoE in NVFP4 (fix for MTP) (#25987)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 16:16:30 -04:00
f23b4c04fd [BugFix] Pad input buffers in _dummy_run (#26209)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-06 16:07:51 -04:00
93540958b8 [Docs] Fix broken table in moe_kernel_features doc (#26314)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-06 15:58:05 -04:00
44b9af5bb2 [Benchmark] Enable MM Embedding benchmarks (#26310)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-06 19:51:58 +00:00
7cd95dc8a3 [Bugfix] Fix gemma3 with transformers backend (#23178)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Raushan Turganbay <raushan@huggingface.co>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-06 18:42:32 +00:00
c02058c222 Add bias handling to CPUFusedMOE kernel (#26289)
Signed-off-by: Crefeda Rodrigues <crefeda.rodrigues@arm.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Crefeda Rodrigues <65665931+cfRod@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Sharif Inamdar <Sharif.Inamdar@arm.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-10-06 18:39:10 +00:00
b2ea5ba677 [Bugfix][Spec Decode] Fix wrong valid_mask for padded speculation when chunked prefill occurs (#26231)
Signed-off-by: seven-mile <i@7li.moe>
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-06 18:24:22 +00:00
824a3f403f [Misc] auto_tune: kill specific vllm process (#26304)
Signed-off-by: Karan Goel <karangoel@google.com>
2025-10-06 18:02:51 +00:00
05f6846ede Support llama3 eagle3 head with llama4 verifier (#25961)
Signed-off-by: rahul-tuli <rtuli@redhat.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-06 13:56:08 -04:00
20db99cc69 [CI Bugfix] Make sure TRTLLM attention is available in test_blackwell_moe (#26188)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-06 13:50:11 -04:00
6431be808f [Tests] conftest: Extending VllmRunner and HfRunner to accept token_ids as input (#26295)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-06 17:19:34 +00:00
4727a8afa7 [Attention] Remove unused reorder_batch method (#24463)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-06 13:13:39 -04:00
1480 changed files with 57767 additions and 31494 deletions


@ -5,11 +5,11 @@ import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
def print_top_10_largest_files(zip_file):
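
For context on the hunk above: the wheel-size gate reads its limit from the VLLM_MAX_SIZE_MB environment variable (the default here is raised from 450 to 500 MiB against the stated 800 MiB PyPI quota) and is meant to fail the build when the built wheel exceeds that limit, listing the largest files to help debugging. The Python sketch below illustrates that pattern only; the environment-variable line and the print_top_10_largest_files signature come from the diff, while check_wheel_size, its messages, and the command-line handling are assumptions for illustration.

import os
import sys
import zipfile

# Default raised from 450 to 500 MiB in this change; the PyPI quota is 800 MiB.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def print_top_10_largest_files(zip_file: str) -> None:
    # List the ten largest members of the wheel to aid debugging oversized builds.
    with zipfile.ZipFile(zip_file, "r") as zf:
        entries = [(info.file_size, info.filename) for info in zf.infolist()]
    for size, name in sorted(entries, reverse=True)[:10]:
        print(f"{name}: {size / (1 << 20):.2f} MiB")


def check_wheel_size(wheel_path: str) -> int:
    # Hypothetical driver: return 0 if the wheel fits under the limit, 1 otherwise.
    size_mb = os.path.getsize(wheel_path) / (1 << 20)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.2f} MiB, exceeding the {VLLM_MAX_SIZE_MB} MiB limit")
        print_top_10_largest_files(wheel_path)
        return 1
    print(f"{wheel_path} is {size_mb:.2f} MiB, within the {VLLM_MAX_SIZE_MB} MiB limit")
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))

A typical (hypothetical) invocation after the build step would be: python check_wheel_size.py dist/vllm-*.whl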


@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5


@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value: 0.80
limit: 100
num_fewshot: 0


@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5


@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"


@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0


@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml


@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml


@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml


@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT


@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto


@ -19,21 +19,27 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
batch_size = eval_config.get("batch_size", "auto")
backend = eval_config.get("backend", "vllm")
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}"
f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
model="vllm",
model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto",
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm.
apply_chat_template=backend == "vllm-vlm",
batch_size=batch_size,
)
return results
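
Putting the new pieces together: the multimodal baseline configs added above (for example the Qwen2.5-VL-7B-Instruct ChartQA entry) may carry optional backend and batch_size keys, and the updated launch_lm_eval reads them with the defaults shown ("vllm" and "auto"), enabling the chat template only when the backend is vllm-vlm. The Python sketch below only illustrates how such a config maps onto those arguments; the inline YAML mirrors the hunk shown earlier, while the loader code itself is illustrative and not part of the harness.

import yaml  # PyYAML is assumed to be available, as in the CI harness

# Values mirror the Qwen2.5-VL-7B-Instruct ChartQA config added in this change.
config_text = """
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
  - name: "chartqa"
    metrics:
      - name: "relaxed_accuracy,none"
        value: 0.855
limit: 2500
num_fewshot: 0
"""

eval_config = yaml.safe_load(config_text)

# Defaults match the updated launch_lm_eval: "vllm" backend and "auto" batch size
# unless the config overrides them.
backend = eval_config.get("backend", "vllm")
batch_size = eval_config.get("batch_size", "auto")
apply_chat_template = backend == "vllm-vlm"  # chat template only for multimodal runs

print(backend, batch_size, apply_chat_template)  # -> vllm-vlm auto True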


@ -454,11 +454,6 @@ main() {
fi
check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)


@ -8,7 +8,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@ -48,7 +48,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@ -76,7 +76,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest


@ -25,25 +25,28 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
timeout 120m bash -c cpu_tests


@ -70,7 +70,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test


@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info


@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info


@ -44,6 +44,5 @@ docker run \
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py
'


@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128


@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

.buildkite/test-amd.yaml (1267 lines): file diff suppressed because it is too large.


@ -296,6 +296,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
@ -317,7 +318,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@ -399,11 +400,10 @@ steps:
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
- pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@ -416,8 +416,8 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -425,6 +425,7 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@ -432,8 +433,9 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
commands:
- pytest -v -s kernels/core
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@ -526,8 +528,9 @@ steps:
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@ -732,6 +735,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
optional: true
@ -795,8 +808,8 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@ -809,8 +822,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@ -827,13 +838,32 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -867,7 +897,7 @@ steps:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 75
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
@ -990,6 +1020,11 @@ steps:
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@ -1054,6 +1089,17 @@ steps:
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
@ -1086,12 +1132,16 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

View File

@ -1,5 +1,10 @@
[run]
source = vllm
# Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
vllm
*/dist-packages/vllm
*/site-packages/vllm
omit =
*/tests/*
*/test_*
@ -12,6 +17,16 @@ omit =
*/benchmarks/*
*/docs/*
[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
vllm
/vllm-workspace/src/vllm
/vllm-workspace/vllm
*/site-packages/vllm
*/dist-packages/vllm
[report]
exclude_lines =
pragma: no cover
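The [paths] block above is what lets coverage.combine fold data recorded against site-packages or dist-packages installs back onto the canonical vllm tree. A minimal sketch of that flow using the coverage.py API; the .coverage.* data-file names here are illustrative only:

import coverage

# Combine per-run data files produced in different install locations;
# the [paths] mapping in .coveragerc canonicalizes them to "vllm/...".
cov = coverage.Coverage(config_file=".coveragerc")
cov.combine([".coverage.host", ".coverage.container"], strict=False)
cov.save()
cov.report(show_missing=False)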

4 .git-blame-ignore-revs Normal file
View File

@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

13 .github/CODEOWNERS vendored
View File

@ -5,9 +5,7 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
@ -60,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
# Docs
@ -121,3 +118,11 @@ mkdocs.yaml @hmellor
# KVConnector installation files
/requirements/kv_connectors.txt @NickLucche
# Pooling models
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop

2 .github/mergify.yml vendored
View File

@ -11,6 +11,8 @@ pull_request_rules:
label:
add:
- documentation
comment:
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label

View File

@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
@ -42,7 +43,6 @@ jobs:
searchIn: "body"
},
],
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
@ -89,14 +89,12 @@ jobs:
term: "hip_",
searchIn: "both"
},
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
// Regex patterns - for complex pattern matching
regexPatterns: [
{
@ -107,13 +105,17 @@ jobs:
}
],
},
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
};
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`);
}
}
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
@ -146,21 +145,17 @@ jobs:
description = termConfig.description;
flags = termConfig.flags;
}
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
const termMatches = [];
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
});
}
}
return matches;
}
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
}
});
}
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`);
return false;
}
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
// Process all configured labels
const processLabels = Object.entries(labelConfig)
.map(([labelName, config]) => processLabel(labelName, config));
const labelsAdded = await Promise.all(processLabels);
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const labelsAddedResults = await Promise.all(
Object.entries(labelConfig).map(([labelName, config]) =>
processLabel(labelName, config).then(added => ({ labelName, added }))
)
);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}

View File

@ -4,7 +4,6 @@ MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false

View File

@ -7,17 +7,18 @@ default_stages:
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.3
rev: v0.14.0
hooks:
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: v1.35.5
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@ -34,7 +35,7 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17
rev: 0.9.1
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
@ -55,11 +56,6 @@ repos:
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@ -75,6 +71,11 @@ repos:
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.13
entry: python tools/pre_commit/mypy.py 1 "3.13"
<<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh

View File

@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
"csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/quantization/fp8/per_token_group_quant.cu")
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -1006,6 +1007,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)

View File

@ -149,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
- Volcengine
Slack Sponsor: Anyscale

View File

@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
pkill -if vllm
pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@ -96,11 +96,11 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
@ -139,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -if vllm
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@ -232,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if vllm
pkill -if "vllm serve" || true
sleep 10
echo "===================="
return 0
@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -8,7 +8,6 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: Optional[str] = None
request_id: Optional[str] = None
language: str | None = None
request_id: str | None = None
@dataclass
@ -52,7 +51,7 @@ class RequestFuncOutput:
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -133,7 +132,7 @@ async def async_request_tgi(
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -204,7 +203,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -367,7 +366,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):
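The edits above (and in several of the benchmark files that follow) are the mechanical conversion of Optional[X] to X | None and Union[X, Y] to X | Y, which assumes Python 3.10 or newer. A small illustrative snippet of the resulting style, independent of the benchmark code:

def head(values: list[int] | None, default: int | str = "empty") -> int | str:
    # PEP 604 unions replace typing.Optional / typing.Union in annotations.
    if not values:
        return default
    return values[0]

print(head(None))       # -> "empty"
print(head([3, 1, 2]))  # -> 3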

View File

@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import PreTrainedTokenizerBase
@ -80,7 +79,7 @@ def sample_requests_from_dataset(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -128,7 +127,7 @@ def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
prefix_len: int,
) -> list[Request]:
requests = []

View File

@ -7,7 +7,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -24,7 +23,7 @@ def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

View File

@ -31,8 +31,8 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Optional
import datasets
import numpy as np
@ -316,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@ -436,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
max_concurrency: Optional[int],
max_concurrency: int | None,
structured_output_ratio: float,
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -502,15 +502,9 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)
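The hunk above drops the None-sentinel semaphore in favor of contextlib.nullcontext(), which supports async with on Python 3.10+, so the limited request wrapper no longer needs a separate branch. A standalone sketch of the same pattern, not the benchmark code itself:

import asyncio
from contextlib import nullcontext

async def worker(i: int, sem) -> int:
    # nullcontext works under "async with" (Python 3.10+), so the unlimited
    # and limited concurrency cases share one code path.
    async with sem:
        await asyncio.sleep(0)
        return i

async def main(max_concurrency: int | None = None) -> list[int]:
    sem = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
    return await asyncio.gather(*(worker(i, sem) for i in range(8)))

print(asyncio.run(main(max_concurrency=2)))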

View File

@ -6,7 +6,7 @@ import math
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any
def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self._max: int | None = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else:
self._max = max(self._max, v)
def avg(self) -> Union[float, str]:
def avg(self) -> float | str:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> Union[float, str]:
def max(self) -> float | str:
return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[Union[float, str]]:
def dump_avg_max(self) -> list[float | str]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
self.collect(time.monotonic_ns() - self.start_time)
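For context, the TimeCollector pattern touched above boils down to recording monotonic_ns deltas via a context manager and reporting them scaled; below is a self-contained sketch of that idea, not the benchmark class itself:

import time
from types import TracebackType

class TinyTimer:
    def __init__(self, scale: int = 1_000) -> None:  # report ns samples as microseconds
        self.samples: list[int] = []
        self.scale = scale
        self._start = 0

    def __enter__(self) -> None:
        self._start = time.monotonic_ns()

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.samples.append(time.monotonic_ns() - self._start)

    def avg(self) -> float | str:
        return sum(self.samples) / len(self.samples) / self.scale if self.samples else "N/A"

timer = TinyTimer()
with timer:
    sum(range(100_000))
print(timer.avg())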

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark
@ -53,7 +52,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
@ -108,7 +107,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
@ -183,7 +182,7 @@ def bench(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:

View File

@ -3,10 +3,9 @@
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
out, _ = ops.rms_norm_dynamic_per_token_quant(

View File

@ -0,0 +1,191 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"mxfp4": dict(no_a_quant=False, enabled=True),
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_mxfp4(
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
b, forward_hadamard_matrix, method="abs_max"
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
return weight_hf_e2m1, weight_hf_scale_block
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
b, forward_hadamard_matrix, device
)
alpha = torch.tensor([1.0], device="cuda")
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
def run():
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
a, forward_hadamard_matrix, method="abs_max"
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
return matmul_mxf4_bf16_tn(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs MXFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_mxfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_mxfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -0,0 +1,207 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import copy
import itertools
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
return (
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
* group_size**-0.5
)
def _quant_weight_nvfp4(
b: torch.Tensor,
forward_hadamard_matrix: torch.Tensor,
global_scale: torch.Tensor,
device: str,
M: int,
N: int,
K: int,
):
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
b, forward_hadamard_matrix, global_scale
)
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
-1, K // 16
)
return weight_hf_e2m1, weight_hf_scale_block
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
alpha = torch.tensor([1.0], device="cuda")
global_scale = torch.tensor([1.0], device="cuda")
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
b, forward_hadamard_matrix, global_scale, device, M, N, K
)
if cfg["no_a_quant"]:
# Pre-quantize activation
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
def run():
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
# Quantize activation on-the-fly
def run():
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
a, forward_hadamard_matrix, global_scale
)
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
-1, K // 16
)
return ops.cutlass_scaled_fp4_mm(
input_hf_e2m1,
weight_hf_e2m1,
input_hf_scale_block,
weight_hf_scale_block,
alpha,
torch.bfloat16,
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[
1,
4,
8,
16,
32,
64,
128,
256,
512,
1024,
2048,
4096,
8192,
16384,
24576,
32768,
],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs NVFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K, had_size):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_nvfp4_runner(
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), rep=200, quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.3-70B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
for had_size in [16, 32, 64, 128]:
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
had_size=had_size,
)
print("Benchmark finished!")

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from collections.abc import Callable
from unittest.mock import patch
import pandas as pd
@ -10,7 +10,8 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
def with_triton_mode(fn):

View File

@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]

View File

@ -22,8 +22,8 @@ Example:
import json
import os
import time
from collections.abc import Callable
from contextlib import nullcontext
from typing import Callable, Optional
import torch
import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single(
self,
sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
should_use_fn: Callable[[torch.Tensor], bool],
context,
num_warmup: int,
num_trials: int,
) -> Optional[float]:
) -> float | None:
"""Benchmark method with CUDA graph optimization."""
try:
# Create test tensor (2D: sequence_length x hidden_size)

View File

@ -7,7 +7,8 @@ import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -6,11 +6,12 @@ import copy
import json
import pickle
import time
from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -158,7 +159,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor,
scaling: float,
add_inputs: Optional[bool],
add_inputs: bool | None,
):
"""
Torch group gemm reference implementation to test correctness of
@ -316,8 +317,8 @@ class BenchmarkContext:
lora_rank: int
sort_by_lora_id: bool
dtype: torch.dtype
seq_length: Optional[int] = None
num_slices: Optional[int] = None # num_slices for slice based ops
seq_length: int | None = None
num_slices: int | None = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self)
@ -561,7 +562,7 @@ class BenchmarkTensors:
}
def bench_fn_kwargs(
self, op_type: OpType, add_inputs: Optional[bool] = None
self, op_type: OpType, add_inputs: bool | None = None
) -> dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
@ -575,7 +576,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
self, op_type: OpType, expand_fn_add_inputs: bool | None
) -> bool:
"""
Test correctness of op_type implementation against a grouped gemm
@ -611,8 +612,8 @@ def bench_optype(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: Optional[bool] = None,
cuda_graph_nops: int | None = None,
expand_fn_add_inputs: bool | None = None,
test_correctness: bool = False,
) -> TMeasurement:
assert arg_pool_size >= 1
@ -679,7 +680,7 @@ def bench_torch_mm(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
cuda_graph_nops: int | None = None,
) -> TMeasurement:
"""
Benchmark basic torch.mm as a roofline.
@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
compare = TBenchmark.Compare(timers)
compare.print()

View File

@ -8,10 +8,9 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import pandas as pd
import torch
@ -63,23 +62,23 @@ class BenchmarkTensors:
a: torch.Tensor
w_q: torch.Tensor
group_size: Optional[int]
group_size: int | None
wtype: ScalarType
w_g_s: torch.Tensor
w_g_zp: Optional[torch.Tensor]
w_ch_s: Optional[torch.Tensor]
w_tok_s: Optional[torch.Tensor]
w_g_zp: torch.Tensor | None
w_ch_s: torch.Tensor | None
w_tok_s: torch.Tensor | None
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
group_zero_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
output_type: torch.dtype | None
group_scale_type: torch.dtype | None
group_zero_type: torch.dtype | None
channel_scale_type: torch.dtype | None
token_scale_type: torch.dtype | None
def rand_data(shape, dtype=torch.float16, scale=1):
@ -93,8 +92,8 @@ def quantize_and_pack(
atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
stype: torch.dtype | None,
group_size: int | None,
zero_points: bool = False,
):
assert wtype.is_integer(), "TODO: support floating point weights"
@ -113,7 +112,7 @@ def quantize_and_pack(
def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
) -> list[BenchmarkTensors]:
m, n, k = shape
@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
def bench(

View File

@ -579,10 +579,12 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
@ -592,6 +594,7 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] in (
"Qwen2MoeForCausalLM",
"Qwen3MoeForCausalLM",
@ -600,10 +603,18 @@ def main(args: argparse.Namespace):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
hidden_size = config.hidden_size
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
text_config = config.get_text_config()
E = text_config.num_experts
topk = text_config.num_experts_per_tok
intermediate_size = text_config.moe_intermediate_size
hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
hidden_size = config.hidden_size
else:
# Support for llama4
config = config.get_text_config()
@ -611,6 +622,7 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
@ -619,8 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)
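The branches above all extract the same four quantities (E, topk, intermediate_size, hidden_size) from different Hugging Face config layouts, then read the model dtype from config.dtype rather than the older torch_dtype attribute. A rough sketch of that lookup; Qwen2MoeConfig is used here only because it ships usable defaults, and the getattr fallback covers transformers versions that still expose torch_dtype:

from transformers import Qwen2MoeConfig

cfg = Qwen2MoeConfig().get_text_config()  # returns the config itself for text-only models
E = cfg.num_experts
topk = cfg.num_experts_per_tok
intermediate_size = cfg.moe_intermediate_size
hidden_size = cfg.hidden_size
dtype = getattr(cfg, "dtype", None) or getattr(cfg, "torch_dtype", None)
print(E, topk, intermediate_size, hidden_size, dtype)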

View File

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute

View File

@ -3,16 +3,15 @@
import random
import time
from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
@ -37,7 +36,7 @@ def main(
seed: int,
do_profile: bool,
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
kv_cache_dtype: str | None = None,
) -> None:
current_platform.seed_everything(seed)

View File

@ -3,8 +3,8 @@
import argparse
import math
from collections.abc import Callable
from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch
import torch

View File

@ -1,155 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton
def polynorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
def norm(x, eps: float):
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
x = x.float()
return (
(
weight[0] * norm(x**3, eps)
+ weight[1] * norm(x**2, eps)
+ weight[2] * norm(x, eps)
+ bias
)
.to(weight.dtype)
.view(orig_shape)
)
def polynorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
out = torch.empty_like(x)
vllm_ops.poly_norm(out, x, weight, bias, eps)
output = out
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_dim):
dtype = torch.bfloat16
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
output_naive = polynorm_naive(x, weight, bias)
output_vllm = polynorm_vllm(x, weight, bias)
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
def get_benchmark():
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["dim", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["naive", "vllm"],
line_names=["Naive", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name="polynorm-perf",
args={},
)
)
def benchmark(dim, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_dim = dim * 4
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "naive":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_naive(x, weight, bias),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_vllm(x, weight, bias),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=8192,
help="Intermediate size of MLP",
)
parser.add_argument(
"--save-path",
type=str,
default="./configs/polnorm/",
help="Path to save polnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(
batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_dim=args.hidden_dim,
)
benchmark = get_benchmark()
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@ -7,7 +7,8 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
@ -11,9 +9,9 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
@ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union
import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
residual: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -65,7 +64,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional
import nvtx
import torch
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
rotary_dim: int | None,
dtype: torch.dtype,
seed: int,
device: str,

View File

@ -1,5 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Comprehensive 3-way SiLU Benchmark Suite
This benchmark compares the following SiLU implementations:
1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
2. Triton Kernel - Triton-based implementation
The suite generates detailed performance comparisons including:
- Memory bandwidth utilization
- Speedup ratios (baseline vs optimized implementations)
- Performance across different expert configurations and token distributions
"""
from collections.abc import Callable
import matplotlib.pyplot as plt
@ -7,7 +21,7 @@ import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
silu_mul_fp8_quant_deep_gemm_cuda,
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
num_parallel_tokens,
group_size: int = 128,
eps: float = 1e-10,
expert_offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton(
# Parse generation strategies
strategies = ["uniform", "max_t", "first_t"]
strategies = ["random_imbalanced", "uniform", "max_t"]
def benchmark(
@ -195,15 +210,27 @@ def benchmark(
current_platform.seed_everything(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "uniform":
r = torch.rand(size=(E,), device="cuda")
if gen_strategy == "random_imbalanced":
def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
mean = total_tokens // n_e
min_max = mean // ratio
e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
e[0] = min_max
r = torch.rand(size=(E - 1,))
r /= r.sum()
r *= total_tokens - min_max
r = r.round().long()
e[1:] = r.to(device=device)
return e
tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
elif gen_strategy == "uniform":
r = torch.rand(size=(E,))
r /= r.sum()
r *= total_tokens
tokens_per_expert = r.int()
tokens_per_expert = torch.minimum(
tokens_per_expert,
torch.ones((E,), device=r.device, dtype=torch.int) * T,
)
r = r.round().long()
tokens_per_expert = r
elif gen_strategy == "max_t":
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
tokens_per_expert.fill_(total_tokens / E)
@ -281,40 +308,34 @@ def benchmark(
def create_comparison_plot(
ratio, cuda_times, baseline_times, config_labels, strategy_name, id
ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
):
"""Create a comparison plot for a specific generation strategy"""
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
fig, ax = plt.subplots(1, 1, figsize=(18, 6))
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.35
width = 0.25
# Execution Time plot (lower is better)
ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
ax.bar(
x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
)
ax.bar(
x + width / 2,
baseline_times,
width,
label="Baseline",
alpha=0.8,
color="orange",
x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
)
# Add speedup labels over each bar pair
# Add speedup labels over each bar trio
for i in range(len(x)):
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
triton_v2_speedup = ratios[i][1] # triton/v2
max_height = max(silu_v2_times[i], triton_times[i])
# Triton/V2 speedup
ax.text(
x[i],
x[i] + width / 2,
max_height + max_height * 0.02,
f"{speedup:.2f}x",
f"{triton_v2_speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=9,
fontsize=8,
)
ax.set_xlabel("Configuration")
@ -332,56 +353,75 @@ def create_comparison_plot(
def create_combined_plot(all_results):
"""Create a combined plot with all strategies in one PNG"""
num_strategies = len(all_results)
fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
if num_strategies == 1:
axes = [axes]
for idx, (
strategy_name,
ratio,
cuda_times,
baseline_times,
all_ratios,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
) in enumerate(all_results):
ax = axes[idx]
# Flatten the nested results to get bandwidth percentages for plotting
silu_v2_bandwidths = []
triton_bandwidths = []
flat_ratios = []
for config_results in all_silu_v2_results:
for result in config_results:
silu_v2_bandwidths.append(result[3]) # bandwidth percentage
for config_results in all_triton_results:
for result in config_results:
triton_bandwidths.append(result[3]) # bandwidth percentage
for config_ratios in all_ratios:
for ratio in config_ratios:
flat_ratios.append(ratio)
# Configure x-axis positions
x = np.arange(len(config_labels))
width = 0.35
width = 0.25
# Execution Time plot (lower is better)
# Bandwidth utilization plot (higher is better)
ax.bar(
x - width / 2,
cuda_times,
x,
silu_v2_bandwidths,
width,
label="CUDA Kernel",
label="SiLU V2 (CUDA)",
alpha=0.8,
color="blue",
)
ax.bar(
x + width / 2,
baseline_times,
x + width,
triton_bandwidths,
width,
label="Baseline",
label="Triton Kernel",
alpha=0.8,
color="orange",
color="green",
)
# Add speedup labels over each bar pair
# Add speedup labels over each bar trio
for i in range(len(x)):
speedup = ratio[i]
max_height = max(cuda_times[i], baseline_times[i])
triton_v2_speedup = flat_ratios[i] # triton/v2
max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
# Triton/V2 speedup
ax.text(
x[i],
x[i] + width / 2,
max_height + max_height * 0.02,
f"{speedup:.2f}x",
f"{triton_v2_speedup:.2f}x",
ha="center",
va="bottom",
fontweight="bold",
fontsize=9,
fontsize=8,
)
ax.set_xlabel("Configuration")
@ -395,7 +435,7 @@ def create_combined_plot(all_results):
ax.grid(True, alpha=0.3)
plt.tight_layout()
filename = "../../silu_bench/silu_benchmark_combined.png"
filename = "silu_benchmark_combined_3way.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
@ -405,7 +445,9 @@ def create_combined_plot(all_results):
outer_dim = 7168
configs = [
# DeepSeekV3 Configs
# (1, 56, 7168),
(8, 1024, 7168),
# (32, 56, 7168),
# DeepSeekV3 Configs
(32, 1024, 7168),
# DeepSeekV3 Configs
@ -417,6 +459,7 @@ num_warmups = 20
strategy_descriptions = {
"uniform": "Uniform Random",
"random_imbalanced": "Imbalanced Random",
"max_t": "Even Assignment",
"first_t": "experts[0] = T, experts[1:] = 0",
}
@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies):
print(f"Testing strategy: {strategy_descriptions[strategy]}")
print(f"{'=' * 60}")
# Collect benchmark data for both algorithms
# Collect benchmark data for all three algorithms
config_labels = []
config_x_axis = []
all_cuda_results = []
all_baseline_results = []
all_silu_v2_results = []
all_triton_results = []
all_ratios = []
for E, T, H in configs:
total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
total_tokens_config = []
for i in [8, 16, 32, 64, 128, 256, 512]:
if i <= T:
total_tokens_config.append(i * E)
config_x_axis.append(total_tokens_config)
cuda_results = []
baseline_results = []
silu_v2_results = []
triton_results = []
ratios = []
for total_tokens in total_tokens_config:
config_label = f"E={E},T={T},H={H},TT={total_tokens}"
config_labels.append(config_label)
# CUDA kernel results
time_ms_cuda, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_cuda,
# SiLU V2 (CUDA kernel) results
time_ms_silu_v2, gflops, gbps, perc = benchmark(
persistent_masked_m_silu_mul_quant,
E,
T,
H,
@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
cuda_results.append((time_ms_cuda, gflops, gbps, perc))
silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
# Baseline results
# Triton kernel results
time_ms_triton, gflops, gbps, perc = benchmark(
silu_mul_fp8_quant_deep_gemm_triton,
E,
@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies):
num_warmups=num_warmups,
gen_strategy=strategy,
)
baseline_results.append((time_ms_triton, gflops, gbps, perc))
ratios.append(time_ms_triton / time_ms_cuda)
triton_results.append((time_ms_triton, gflops, gbps, perc))
print(f"Completed: {config_label}")
all_cuda_results.append(cuda_results)
all_baseline_results.append(baseline_results)
# Calculate speedup ratios (triton baseline / implementation)
triton_v2_ratio = time_ms_triton / time_ms_silu_v2
ratios.append(triton_v2_ratio)
print(
f"Completed: {config_label}:"
f" V2: {time_ms_silu_v2:.3f}ms,"
f" Triton: {time_ms_triton:.3f}ms"
)
all_silu_v2_results.append(silu_v2_results)
all_triton_results.append(triton_results)
all_ratios.append(ratios)
# Store results for combined plotting
@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies):
(
strategy_descriptions[strategy],
all_ratios,
all_cuda_results,
all_baseline_results,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
)
@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies):
# Print summary table for this strategy
print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
print("-" * 60)
print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
print("-" * 90)
for i, (E, T, H) in enumerate(configs):
speedup = baseline_results[i][0] / cuda_results[i][0]
# Get the first result for each config (simplifying for summary)
v2_time = silu_v2_results[i][0]
triton_time = triton_results[i][0]
triton_v2_speedup = triton_time / v2_time
config_label = f"E={E:3d},T={T:4d},H={H:4d}"
print(
f"{config_label:<20} {cuda_results[i][0]:8.5f} "
f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
f"{triton_v2_speedup:8.2f}x"
)
@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results):
num_strategies = len(all_results)
num_configs = len(configs)
# Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
fig, axs = plt.subplots(
num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
)
# Add main title to the entire figure
fig.suptitle(
"Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
fontsize=16,
"Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
fontsize=18,
fontweight="bold",
y=0.98,
)
@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results):
(
strategy_name,
all_ratios,
all_cuda_results,
all_baseline_results,
all_silu_v2_results,
all_triton_results,
config_labels,
config_x_axis,
) = result
@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results):
ratios = all_ratios[config_idx]
total_tokens_values = config_x_axis[config_idx]
# Extract CUDA and Triton bandwidth percentages
cuda_bandwidth_percentages = [
result[3] for result in all_cuda_results[config_idx]
# Extract speedup ratios
triton_v2_ratios = [ratio for ratio in ratios]
# Extract bandwidth percentages for all implementations
v2_bandwidth_percentages = [
result[3] for result in all_silu_v2_results[config_idx]
]
triton_bandwidth_percentages = [
result[3] for result in all_baseline_results[config_idx]
result[3] for result in all_triton_results[config_idx]
]
# Plot speedup ratios vs total tokens (left plot)
ax_speedup.plot(
total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
total_tokens_values,
triton_v2_ratios,
"go-",
linewidth=3,
markersize=8,
label="Triton/V2 Speedup",
)
ax_speedup.set_title(
f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
fontsize=12,
fontweight="bold",
)
ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
ax_speedup.legend(prop={"weight": "bold"})
ax_speedup.grid(True, alpha=0.3)
# Plot bandwidth utilization (right plot)
ax_bandwidth.plot(
total_tokens_values,
cuda_bandwidth_percentages,
"ro-",
v2_bandwidth_percentages,
"o-",
linewidth=3,
markersize=8,
label="CUDA",
label="SiLU V2",
color="blue",
)
ax_bandwidth.plot(
total_tokens_values,
triton_bandwidth_percentages,
"go-",
"o-",
linewidth=3,
markersize=8,
label="Triton",
color="green",
)
ax_bandwidth.set_title(
f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results):
for label in ax.get_xticklabels() + ax.get_yticklabels():
label.set_fontweight("bold")
# Add value labels on speedup points
for x, y in zip(total_tokens_values, ratios):
# Add value labels on Triton/V2 speedup points
for x, y in zip(total_tokens_values, triton_v2_ratios):
ax_speedup.annotate(
f"{y:.2f}x",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
)
# Add value labels on CUDA bandwidth points
for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, 12),
ha="center",
fontsize=9,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
)
# Add value labels on Triton bandwidth points
for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
ax_bandwidth.annotate(
f"{y:.1f}%",
(x, y),
textcoords="offset points",
xytext=(0, -15),
ha="center",
fontsize=9,
@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results):
plt.tight_layout()
plt.subplots_adjust(top=0.93) # Make room for main title
filename = "silu_benchmark_total_tokens.png"
filename = "silu_benchmark_total_tokens_3way.png"
plt.savefig(filename, dpi=300, bbox_inches="tight")
plt.show()
return filename
# Create combined plot with all strategies
combined_plot_filename = create_total_tokens_plot(all_results)
# Create comprehensive 3-way comparison plots
combined_plot_filename = create_combined_plot(all_results)
total_tokens_plot_filename = create_total_tokens_plot(all_results)
print(f"\n{'=' * 60}")
print("Benchmark Complete!")
print(f"Generated combined plot: {combined_plot_filename}")
print(f"{'=' * 60}")
print(f"\n{'=' * 80}")
print("3-Way Benchmark Suite Complete!")
print(f"Generated combined comparison plot: {combined_plot_filename}")
print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
print("Compared: SiLU V2 (CUDA), and Triton implementations")
print(f"{'=' * 80}")

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_decode(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_prefill(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -14,7 +14,7 @@ import torch
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
_w8a8_triton_block_scaled_mm,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
@ -83,7 +83,7 @@ def w8a8_block_matmul(
)
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_block_fp8_matmul
kernel = _w8a8_triton_block_scaled_mm
else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable
from typing import Any, Callable, Optional
from collections.abc import Callable, Iterable
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__(
self,
cuda_graph_params: Optional[CudaGraphBenchParams],
cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple, Optional, Union
from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution):
def __init__(
self,
min_val: Union[int, float],
max_val: Union[int, float],
min_val: int | float,
max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution):
def __init__(self, value: Union[int, float]) -> None:
def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
self,
mean: Optional[float] = None,
sigma: Optional[float] = None,
average: Optional[int] = None,
median_ratio: Optional[float] = None,
max_val: Optional[int] = None,
mean: float | None = None,
sigma: float | None = None,
average: int | None = None,
median_ratio: float | None = None,
max_val: int | None = None,
) -> None:
self.average = average
self.median_ratio = median_ratio

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple, Optional, Union
from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
class ClientArgs(NamedTuple):
seed: int
max_num_requests: Optional[int]
max_num_requests: int | None
skip_first_turn: bool
max_turns: Optional[int]
max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
class MetricStats:
def __init__(self) -> None:
self.min: Optional[float] = None
self.max: Optional[float] = None
self.avg: Optional[float] = None
self.min: float | None = None
self.max: float | None = None
self.avg: float | None = None
self.sum = 0.0
self.count = 0
@ -143,7 +143,7 @@ class MovingAverage:
self.index = 0
self.sum = 0.0
self.count = 0
self.avg: Optional[float] = None
self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
@ -198,14 +198,6 @@ class DebugStats:
self.logger.info("-" * 50)
# Must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix) :]
return text
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
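The compatibility helper removed in this hunk existed only because the benchmark previously had to support Python 3.8; the call site later in the file now uses the built-in `str.removeprefix` (Python 3.9+), which behaves identically:

```python
assert "data: [DONE]".removeprefix("data: ") == "[DONE]"
assert "no prefix here".removeprefix("data: ") == "no prefix here"  # unchanged when absent
```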
@ -220,8 +212,8 @@ async def send_request(
chat_url: str,
model: str,
stream: bool = True,
min_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
min_tokens: int | None = None,
max_tokens: int | None = None,
) -> ServerResponse:
payload = {
"model": model,
@ -250,9 +242,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
ttft: Optional[float] = None
ttft: float | None = None
chunk_delay: list[int] = []
latency: Optional[float] = None
latency: float | None = None
first_chunk = ""
generated_text = ""
@ -269,7 +261,7 @@ async def send_request(
if not chunk_bytes:
continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
@ -364,7 +356,7 @@ async def send_turn(
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
) -> Optional[RequestStats]:
) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
@ -644,7 +636,7 @@ async def client_main(
if args.verbose:
curr_time_sec: float = time.perf_counter()
time_since_last_turn: Union[str, float] = "N/A"
time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
@ -769,7 +761,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients"
)
max_req_per_client: Optional[int] = None
max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
@ -936,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
runtime_left_sec: Union[str, float] = round(
runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:
@ -1032,7 +1024,7 @@ def process_statistics(
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
gen_conv_args: Optional[GenConvArgs] = None,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
) -> None:
if len(client_metrics) == 0:
@ -1259,7 +1251,7 @@ async def main() -> None:
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
"same as the `--model` argument. ",
)
parser.add_argument(

View File

@ -13,7 +13,7 @@ import argparse
import json
import random
from statistics import mean
from typing import Any, Optional
from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid(
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: Optional[int],
min_content_len: Optional[int] = None,
max_content_len: Optional[int] = None,
min_turns: Optional[int] = None,
max_turns: Optional[int] = None,
model: Optional[str] = None,
max_items: int | None,
min_content_len: int | None = None,
max_content_len: int | None = None,
min_turns: int | None = None,
max_turns: int | None = None,
model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns

View File

@ -198,13 +198,24 @@ else()
endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
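With this change, a local oneDNN checkout can be substituted for the GitHub download by setting the `FETCHCONTENT_SOURCE_DIR_ONEDNN` environment variable (surfaced as a CMake cache entry above) before configuring the build; if it is unset, the build falls back to fetching oneDNN v3.9 from GitHub as before.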
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(ONEDNN_VERBOSE "ON")
set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN)
@ -309,4 +320,4 @@ define_gpu_extension_target(
WITH_SOABI
)
message(STATUS "Enabling C extension.")
message(STATUS "Enabling C extension.")

View File

@ -0,0 +1,97 @@
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.0a")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

codecov.yml
View File

@ -0,0 +1,12 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -28,10 +28,10 @@
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
#include "../quantization/fp8/amd/quant_utils.cuh"
#include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))

View File

@ -125,32 +125,37 @@ public:
}
static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
float seq_length_k = static_cast<float>(K) / 1024.0f;
int max_splits = 1;
// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
if (B <= 4 && seq_length_k >= 16) {
max_splits = 16;
}
// printf(" max_splits = %d\n", max_splits);
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8;
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
}
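As a rough trace of the new heuristic (the SM count below is an assumption, roughly a Hopper-class GPU; the Python mirror is for illustration only):

```python
def split_kv(sm_count: int, B: int, K: int) -> int:
    # Illustrative mirror of the updated set_split_kv heuristic above.
    seq_length_k = K / 1024.0
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = -(-max_splits // split_heur)   # ceil_div
    return -(-max_splits // k_waves)         # wave-aware split

print(split_kv(sm_count=132, B=8, K=8192))  # -> 8 (B <= 8 and an 8K sequence picks max_splits = 8)
```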
/// Determines whether the GEMM can execute the given problem.

View File

@ -64,3 +64,11 @@ void indexer_k_quant_and_cache(
torch::Tensor& slot_mapping, // [num_tokens]
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens); // [batch_size + 1]

View File

@ -9,9 +9,9 @@
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
#include "quantization/fp8/amd/quant_utils.cuh"
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
#else
#include "quantization/fp8/nvidia/quant_utils.cuh"
#include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#include <algorithm>
@ -572,6 +572,70 @@ __global__ void indexer_k_quant_and_cache_kernel(
}
}
template <int BLOCK_Y_SIZE>
__global__ void cp_gather_indexer_k_quant_cache_kernel(
const char* __restrict__ kv_cache, // [num_blocks, block_size,
// cache_stride]
char* __restrict__ dst_k, // [num_tokens, head_dim]
char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size *
// 4]
const int* __restrict__ block_table, // [batch_size, num_blocks]
const int* __restrict__ cu_seq_lens, // [batch_size + 1]
const int batch_size, // batch size
const int64_t token_stride, // stride for each token in dst_k
const int64_t head_dim, // dimension of each head
const int64_t block_stride, // stride for each block in kv_cache
const int64_t cache_token_stride, // stride for each token in kv_cache
const int64_t cache_block_size, // num_tokens for each block in kv_cache
const int num_blocks, // number of blocks
const int num_tokens, // number of tokens
const int quant_block_size // quantization block size
) {
constexpr int VEC_SIZE = sizeof(float4) / sizeof(char);
const int token_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE;
// Find batch index within a block
__shared__ int batch_idx[BLOCK_Y_SIZE];
for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x));
iter++) {
int tid = iter * blockDim.x + threadIdx.x;
if (tid < batch_size) {
const int seq_start = cu_seq_lens[tid];
const int seq_end = cu_seq_lens[tid + 1];
if (token_idx >= seq_start && token_idx < seq_end) {
batch_idx[threadIdx.y] = tid;
}
}
}
#ifndef USE_ROCM
__syncwarp();
#endif
if (head_idx >= head_dim || token_idx >= num_tokens) {
return;
}
const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]];
const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks +
inbatch_seq_idx / cache_block_size];
const int64_t src_block_offset = block_idx * block_stride;
const int64_t cache_inblock_offset =
(inbatch_seq_idx % cache_block_size) * head_dim + head_idx;
const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset;
const int64_t dst_inblock_offset = token_idx * token_stride + head_idx;
reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] =
reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE];
if (threadIdx.x == 0) {
const int64_t src_scale_offset =
src_block_offset + cache_block_size * head_dim +
cache_inblock_offset * 4 / quant_block_size;
reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] =
reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4];
}
}
} // namespace vllm
// KV_T is the data type of key and value tensors.
@ -1173,3 +1237,59 @@ void indexer_k_quant_and_cache(
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}
// Macro to dispatch the kernel based on the data amount.
#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \
vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE> \
<<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE, \
(head_dim + 8 * vec_size - 1) / (8 * vec_size)), \
dim3(8, BLOCK_Y_SIZE), 0, stream>>>( \
reinterpret_cast<char*>(kv_cache.data_ptr()), \
reinterpret_cast<char*>(dst_k.data_ptr()), \
reinterpret_cast<char*>(dst_scale.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \
kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \
num_tokens, quant_block_size);
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
torch::Tensor& dst_k, // [num_tokens, head_dim]
torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
const torch::Tensor& block_table, // [batch_size, num_blocks]
const torch::Tensor& cu_seq_lens // [batch_size + 1]
) {
int batch_size = block_table.size(0);
int num_tokens = dst_k.size(0);
int head_dim = dst_k.size(1);
int quant_block_size = head_dim * 4 / dst_scale.size(1);
TORCH_CHECK(kv_cache.device() == dst_k.device(),
"kv_cache and dst_k must be on the same device");
TORCH_CHECK(kv_cache.device() == dst_scale.device(),
"kv_cache and dst_scale must be on the same device");
TORCH_CHECK(kv_cache.device() == block_table.device(),
"kv_cache and block_table must be on the same device");
TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(),
"kv_cache and cu_seq_lens must be on the same device");
TORCH_CHECK(head_dim % quant_block_size == 0,
"head_dim must be divisible by quant_block_size");
constexpr int vec_size = 16;
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (num_tokens < 32) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1);
} else if (num_tokens < 64) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2);
} else if (num_tokens < 128) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4);
} else if (num_tokens < 256) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8);
} else if (num_tokens < 512) {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16);
} else {
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}
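A quick sanity check of the `quant_block_size` recovery above: `dst_scale` holds one float (4 bytes, addressed as `char`) per quantization block, so its second dimension is `head_dim / quant_block_size * 4`, and the block size follows from the tensor shapes alone (numbers below are illustrative):

```python
head_dim = 128              # illustrative values, not taken from this diff
quant_block_size = 128
dst_scale_dim1 = head_dim // quant_block_size * 4   # 4 bytes of scale per block
assert head_dim * 4 // dst_scale_dim1 == quant_block_size
```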

View File

@ -5,12 +5,15 @@
namespace vllm {
// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
static bool cached = []() {
std::string env_key = "VLLM_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}();
return cached;
}
} // namespace vllm
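Besides the rename, the helper now memoizes the environment lookup in a function-local static, so `getenv` runs only once per process. A Python analogue of the same pattern (illustrative only, not vLLM's actual helper):

```python
import os
from functools import lru_cache


@lru_cache(maxsize=1)
def is_batch_invariant() -> bool:
    # Read VLLM_BATCH_INVARIANT once and cache the result (simplified truthiness check).
    return os.getenv("VLLM_BATCH_INVARIANT", "0") not in ("", "0")
```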

View File

@ -12,6 +12,7 @@ using CubMaxOp = cub::Max;
#endif // CUB_VERSION
#else
#include <hipcub/hipcub.hpp>
using CubAddOp = cub::Sum;
using CubMaxOp = cub::Max;
namespace cub = hipcub;
using CubAddOp = hipcub::Sum;
using CubMaxOp = hipcub::Max;
#endif // USE_ROCM

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
from typing import Union
from cutlass_library import *
@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
**DataTypeNames, # type: ignore
**{
VLLMDataType.u4b8: "u4b8",
@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
},
}
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
**DataTypeTag, # type: ignore
**{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
},
}
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
**DataTypeSize, # type: ignore
**{
VLLMDataType.u4b8: 4,
@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
},
}
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4",
@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.bf16: "vllm::kBfloat16",
}
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float",
}
VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -2,6 +2,7 @@
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance;
float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x;
}
};
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
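For reference, the loop replaced here accumulates the per-token sum of squares that feeds the RMSNorm scale; the vectorized read via `vectorize_read_with_alignment` changes only how elements are loaded, not the quantity being computed:

\[
\mathrm{RMSNorm}(x)_i = \frac{x_i}{\sqrt{\tfrac{1}{H}\sum_{j=1}^{H} x_j^{2} + \epsilon}}\; w_i
\]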
@ -136,211 +148,6 @@ fused_add_rms_norm_kernel(
}
}
/* Function specialization in the case of FP16/BF16 tensors.
Additional optimizations we can make in this case are
packed and vectorized operations, which help with the
memory latency bottleneck.
_f16VecPN struct extends _f16Vec to add operations specifically required for
polynomial normalization (poly norm).
The original _f16Vec does not include the sum-of-powers computation or
in-place polynomial normalization logic. */
template <typename scalar_t, int width>
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
using Base = _f16Vec<scalar_t, width>;
using Converter = typename Base::Converter;
using T1 = typename Base::T1;
using T2 = typename Base::T2;
using Base::data;
__device__ auto sum_pows() const {
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x4 = x2 * x2;
float x6 = x4 * x2;
float y2 = z.y * z.y;
float y4 = y2 * y2;
float y6 = y4 * y2;
s2 += x2 + y2;
s4 += x4 + y4;
s6 += x6 + y6;
}
return std::make_tuple(s2, s4, s6);
}
__device__ void poly_norm_inplace(const float w2_inv_std,
const float w1_inv_std2,
const float w0_inv_std3, const float bias) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x3 = x2 * z.x;
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
float y2 = z.y * z.y;
float y3 = y2 * z.y;
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
auto out = Converter::convert(z);
data[i] = out.x;
data[i + 1] = out.y;
}
}
};
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
// Sanity checks on our vector struct and type-punned pointer arithmetic
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
/* These and the argument pointers are all declared `restrict` as they are
not aliased in practice. Argument pointers should not be dereferenced
in this kernel as that would be undefined behavior */
auto* __restrict__ input_v =
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
const int vec_hidden_size = hidden_size / width;
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
auto [x2, x4, x6] = temp.sum_pows();
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
out_v[id] = temp;
}
}
/* Generic poly_norm_kernel
The width field is not used here but necessary for other specializations.
*/
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x4 = x2 * x2;
float x6 = x4 * x2;
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x3 = x2 * x;
out[blockIdx.x * hidden_size + idx] =
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
s_bias);
}
}
} // namespace vllm
void rms_norm(torch::Tensor& out, // [..., hidden_size]
@ -352,18 +159,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
int64_t input_stride = input.stride(-2);
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
// Instead, we use a 2d view to get the second-innermost stride.
// That way the dimensions (except the last one) can be arbitrarily permuted.
torch::Tensor input_view = input.view({-1, hidden_size});
int num_tokens = input_view.numel() / hidden_size;
int64_t input_stride = input_view.stride(-2);
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
});
VLLM_DISPATCH_FLOATING_TYPES(
input_view.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
hidden_size);
});
}
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
@ -380,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
torch::Tensor& residual, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size]
double epsilon) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
@ -414,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -422,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
LAUNCH_FUSED_ADD_RMS_NORM(0);
}
}
#define LAUNCH_FUSED_POLY_NORM(width) \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
hidden_size); \
});
void poly_norm(torch::Tensor& out, // [..., hidden_size]
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& weight, // [3]
torch::Tensor& bias, // [1]
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.data_ptr() != input.data_ptr());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
/* This kernel is memory-latency bound in many scenarios.
When num_tokens is large, a smaller block size allows
for increased block occupancy on CUs and better latency
hiding on global mem ops. */
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
dim3 block(std::min(hidden_size, max_block_size));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
/*If the tensor types are FP16/BF16, try to use the optimized kernel
with packed + vectorized ops.
Max optimization is achieved with a width-8 vector of FP16/BF16s
since we can load at most 128 bits at once in a global memory op.
However, this requires each tensor's data to be aligned to 16
bytes.
*/
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8);
} else {
LAUNCH_FUSED_POLY_NORM(0);
}
}
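The removed poly_norm op combines three RMS-normalized powers of the input, as seen in the shared-memory setup of the kernel above (variance, variance2, variance3 being the sums of x^2, x^4, x^6). A scalar sketch of the per-row math, for illustration only (function name hypothetical):
#include <cmath>
// out[i] = w2 * x / rms(x) + w1 * x^2 / rms(x^2) + w0 * x^3 / rms(x^3) + bias,
// where rms(v) = sqrt(mean(v^2) + eps).
static void poly_norm_row_ref(float* out, const float* in,
                              float w0, float w1, float w2, float bias,
                              int hidden_size, float eps) {
  double s2 = 0.0, s4 = 0.0, s6 = 0.0;
  for (int i = 0; i < hidden_size; ++i) {
    const double x = in[i];
    s2 += x * x;
    s4 += x * x * x * x;
    s6 += x * x * x * x * x * x;
  }
  const float inv_std1 = 1.f / std::sqrt(s2 / hidden_size + eps);
  const float inv_std2 = 1.f / std::sqrt(s4 / hidden_size + eps);
  const float inv_std3 = 1.f / std::sqrt(s6 / hidden_size + eps);
  for (int i = 0; i < hidden_size; ++i) {
    const float x = in[i], x2 = x * x, x3 = x2 * x;
    out[i] = w2 * x * inv_std1 + w1 * x2 * inv_std2 + w0 * x3 * inv_std3 + bias;
  }
}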

View File

@ -6,10 +6,11 @@
*/
#include "type_convert.cuh"
#include "quantization/fp8/common.cuh"
#include "quantization/w8a8/fp8/common.cuh"
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance;
float variance = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x;
}
};
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
int hidden_size = input.size(-1);
int input_stride = input.stride(-2);
int num_tokens = input.numel() / hidden_size;
@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);
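The variance accumulation above hands vllm::vectorize_read_with_alignment a vector callback and a scalar fallback. A minimal CPU analogue of that callback split, under the assumption that the helper feeds full VEC-wide chunks to vec_op and remaining elements to scalar_op (names here are illustrative, not the vLLM helper):
#include <cstddef>
// Full VEC-wide chunks go through vec_op; leftover elements go through scalar_op.
template <int VEC, typename T, typename VecOp, typename ScalarOp>
static void vectorized_visit_ref(const T* row, std::size_t n,
                                 VecOp vec_op, ScalarOp scalar_op) {
  std::size_t i = 0;
  for (; i + VEC <= n; i += VEC) vec_op(row + i);  // vector-width chunk
  for (; i < n; ++i) scalar_op(row[i]);            // scalar tail
}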

View File

@ -8,12 +8,77 @@
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm {
namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize block_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute exclusive prefix sum over the padded token counts per batch
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
});
}
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
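To make the layout concrete: with batch_num_tokens = {3, 5}, max_tokens_per_batch = 8, block_size = 4, the padded counts are 4 and 8, so num_tokens_post_pad = 12, block_ids = {0, 1, 1, -1}, and unused sorted_ids slots keep the sentinel value. A CPU sketch of the same alignment (illustrative only; function name hypothetical):
#include <algorithm>
#include <cstdint>
#include <vector>
// Round each batch's token count up to block_size, lay batches out back to back,
// and record which batch owns each block.
static void batched_align_ref(const std::vector<int32_t>& batch_num_tokens,
                              int32_t max_tokens_per_batch, int32_t block_size,
                              std::vector<int32_t>& sorted_ids,
                              std::vector<int32_t>& block_ids,
                              int32_t& num_tokens_post_pad) {
  const int32_t num_batches = static_cast<int32_t>(batch_num_tokens.size());
  const int32_t sentinel = num_batches * max_tokens_per_batch;
  std::fill(sorted_ids.begin(), sorted_ids.end(), sentinel);
  std::fill(block_ids.begin(), block_ids.end(), -1);
  int32_t cumsum = 0;
  for (int32_t b = 0; b < num_batches; ++b) {
    const int32_t n = batch_num_tokens[b];
    for (int32_t i = 0; i < n; ++i)
      sorted_ids[cumsum + i] = b * max_tokens_per_batch + i;
    const int32_t padded = (n + block_size - 1) / block_size * block_size;
    for (int32_t blk = 0; blk < padded / block_size; ++blk)
      block_ids[cumsum / block_size + blk] = b;
    cumsum += padded;
  }
  num_tokens_post_pad = cumsum;
}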
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size]
{

View File

@ -4,7 +4,7 @@
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& gating_output);
torch::Tensor& gating_output, bool renormalize);
void moe_sum(torch::Tensor& input, torch::Tensor& output);
@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -16,12 +16,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <type_traits>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat162 __nv_bfloat162;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -37,16 +47,27 @@ template <
/// Alignment requirement in bytes
int Alignment = sizeof(T) * N
>
class alignas(Alignment) AlignedArray {
float data[N];
struct alignas(Alignment) AlignedArray {
T data[N];
};
template <typename T>
__device__ __forceinline__ float toFloat(T value) {
if constexpr (std::is_same_v<T, float>) {
return value;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
return __bfloat162float(value);
} else if constexpr (std::is_same_v<T, __half>) {
return __half2float(value);
}
}
// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB>
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
{
using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
@ -67,7 +88,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData = max(static_cast<float>(input[idx]), threadData);
const float val = toFloat(input[idx]);
threadData = max(val, threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@ -82,7 +104,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData += exp((static_cast<float>(input[idx]) - float_max));
const float val = toFloat(input[idx]);
threadData += expf(val - float_max);
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@ -96,8 +119,9 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
output[idx] = val;
const float val = toFloat(input[idx]);
const float softmax_val = expf(val - float_max) * normalizing_factor;
output[idx] = softmax_val;
}
}
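The gating softmax above is the standard numerically stable form: subtract the row max, exponentiate, then scale by the inverse of the sum. A scalar sketch of the per-row computation (reference only; assumes num_cols >= 1):
#include <algorithm>
#include <cmath>
// Numerically stable softmax over one gating row.
static void softmax_row_ref(const float* in, float* out, int num_cols) {
  float row_max = in[0];
  for (int i = 1; i < num_cols; ++i) row_max = std::max(row_max, in[i]);
  float z = 0.f;
  for (int i = 0; i < num_cols; ++i) z += std::exp(in[i] - row_max);
  const float inv_z = 1.f / z;
  for (int i = 0; i < num_cols; ++i) out[i] = std::exp(in[i] - row_max) * inv_z;
}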
@ -111,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int num_experts,
const int k,
const int start_expert,
const int end_expert)
const int end_expert,
const bool renormalize)
{
using cub_kvp = cub::KeyValuePair<int, float>;
@ -126,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool row_is_active = finished ? !finished[block_row] : true;
const int thread_read_offset = blockIdx.x * num_experts;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
thread_kvp.key = 0;
@ -164,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) {
selected_sum += result_kvp.value;
}
}
__syncthreads();
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (threadIdx.x == 0) {
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
const int idx = k * block_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
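The new renormalize flag rescales the k selected weights of each row so they sum to 1, falling back to a denominator of 1 when the selected sum is zero. A scalar sketch of that post-processing step (illustrative only):
// Renormalize the k selected gating weights of one row in place; a zero
// selected sum falls back to a denominator of 1, matching the guard above.
static void renormalize_topk_ref(float* weights, int k) {
  float sum = 0.f;
  for (int i = 0; i < k; ++i) sum += weights[i];
  const float denom = sum > 0.f ? sum : 1.f;
  for (int i = 0; i < k; ++i) weights[i] /= denom;
}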
// ====================== TopK softmax things ===============================
@ -185,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k.
*/
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert)
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
{
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>,
"InputType must be float, __nv_bfloat16, or __half");
// We begin by enforcing compile time assertions and setting up compile time constants.
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
// Number of bytes each thread pulls in per load
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
}
// Restrictions based on previous section.
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@ -237,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
// row it will read.
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
// this can support all powers of 2 up to 16.
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Finally, we pull in the data from global mem
float row_chunk[VPT];
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
// NOTE(zhuhaoran): dispatch the load path on the input type; BF16/FP16 values are converted to float.
if constexpr (std::is_same_v<InputType, float>) {
using VecType = AlignedArray<float, ELTS_PER_LDG>;
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
{
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
}
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __bfloat162float(*scalar_ptr);
}
}
} else if constexpr (std::is_same_v<InputType, __half>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __half2float(*scalar_ptr);
}
}
}
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@ -311,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
// First, each thread does the local argmax
@ -364,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
output[idx] = max_val;
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
source_rows[idx] = k_idx * num_rows + thread_row;
if (renormalize) {
selected_sum += max_val;
}
}
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
@ -381,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
}
}
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (thread_group_idx == 0)
{
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
const int idx = k * thread_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
namespace detail
{
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstants
{
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@ -398,21 +508,21 @@ struct TopkConstants
};
} // namespace detail
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
cudaStream_t stream)
{
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
}
#ifndef USE_ROCM
@ -420,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else if (WARP_SIZE == 32) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else { \
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
}
#endif
template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
const InputType* gating_output,
float* topk_weights,
IndType* topk_indices,
int* token_expert_indices,
@ -447,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
const int num_tokens,
const int num_experts,
const int topk,
const bool renormalize,
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
// For 16-bit dtypes (bf16/fp16), use 4-byte loads so that a warp can still
// cover num_experts elements.
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
switch (num_experts) {
case 1:
@ -508,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256;
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
num_experts, topk, 0, num_experts, renormalize);
}
}
}
@ -520,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe
} // namespace vllm
template<typename ComputeType>
void dispatch_topk_softmax_launch(
torch::Tensor& gating_output,
torch::Tensor& topk_weights,
torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
}
}
void topk_softmax(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output) // [num_tokens, num_experts]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
@ -536,45 +689,19 @@ void topk_softmax(
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if(topk_indices.scalar_type() == at::ScalarType::Int)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}
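A hypothetical host-side call of the updated entry point, with shapes matching the comments above (a sketch only; it assumes a CUDA build that links against the op defined in this file, and the tensor sizes are made up):
#include <torch/all.h>
void topk_softmax_usage_sketch() {
  const int64_t num_tokens = 4, num_experts = 8, topk = 2;
  auto cuda_f32 = torch::dtype(torch::kFloat).device(torch::kCUDA);
  auto cuda_i32 = torch::dtype(torch::kInt).device(torch::kCUDA);
  auto gating = torch::randn({num_tokens, num_experts},
                             torch::dtype(torch::kBFloat16).device(torch::kCUDA));
  auto weights = torch::empty({num_tokens, topk}, cuda_f32);
  auto indices = torch::empty({num_tokens, topk}, cuda_i32);
  auto token_expert_indices = torch::empty({num_tokens, topk}, cuda_i32);
  // Renormalize the selected top-k weights so each row sums to 1.
  topk_softmax(weights, indices, token_expert_indices, gating, /*renormalize=*/true);
}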

View File

@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs.
m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output) -> ()");
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
// Calculate the result of moe by summing up the partial results
@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -92,14 +92,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
torch::Tensor& bias, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
double epsilon);
@ -133,12 +135,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& input_global_scale);
#endif
void silu_mul_fp8_quant_deep_gemm_cuda(
void persistent_masked_m_silu_mul_quant(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);
bool use_ue8m0);
void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

View File

@ -7,7 +7,7 @@
#include "../cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/fp8/common.cuh"
#include "quantization/w8a8/fp8/common.cuh"
#include <c10/util/Float8_e4m3fn.h>
@ -114,13 +114,22 @@ __global__ void act_and_mul_quant_kernel(
}
__device__ __forceinline__ float silu(float x) {
return (__fdividef(x, (1.f + expf(-x))));
return __fdividef(x, (1.f + expf(-x)));
}
__device__ __forceinline__ float2 silu2(float2 x) {
return make_float2(silu(x.x), silu(x.y));
}
__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) {
#ifndef USE_ROCM
return make_bfloat162(__float2bfloat16_rn(silu(x.x)),
__float2bfloat16_rn(silu(x.y)));
#else
return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y)));
#endif
}
#ifndef USE_ROCM
__device__ __forceinline__ float warp_max(float v) {
static constexpr unsigned FULL_MASK = 0xffffffffu;
@ -223,224 +232,308 @@ constexpr __nv_bfloat16 get_fp8_min() {
return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
}
}
#ifndef USE_ROCM
template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
template <typename Idx_t>
__device__ __forceinline__ int warp_expert_search(
int idx, int n, const Idx_t* __restrict__ input, Idx_t val) {
const Idx_t* input_ptr = input + idx;
int base_offset = 0;
for (;;) {
bool move_on = (idx < n && *input_ptr <= val);
unsigned mask = __ballot_sync(0xffffffff, move_on);
if (mask != 0xffffffffu) {
int last_lane = 31 - __clz(mask);
return base_offset + last_lane;
}
input_ptr += 32;
base_offset += 32;
idx += 32;
}
}
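warp_expert_search scans the ascending offset table one warp width at a time and returns the index of the last entry that is <= val, i.e. the expert whose token range contains the token. A sequential sketch of the same lookup (illustrative only):
// offsets is ascending (exclusive prefix sums); return the largest e < n
// with offsets[e] <= val.
static int expert_search_ref(const int* offsets, int n, int val) {
  int e = 0;
  while (e + 1 < n && offsets[e + 1] <= val) ++e;
  return e;
}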
template <int num_parallel_tokens>
__device__ __forceinline__ void token_bounds(int32_t n_tokens,
int32_t worker_id,
int32_t& n_tokens_lower,
int32_t& n_tokens_upper) {
if (n_tokens < num_parallel_tokens && worker_id < n_tokens) {
if (worker_id >= num_parallel_tokens) return;
n_tokens_lower = worker_id;
n_tokens_upper = worker_id + 1;
} else {
int32_t chunk_size = n_tokens / num_parallel_tokens;
int32_t residual = n_tokens - chunk_size * num_parallel_tokens;
auto calc_id = [&](int32_t id) {
if (id < residual)
return min(n_tokens, id * (chunk_size + 1));
else
return min(n_tokens, id * chunk_size + residual);
};
n_tokens_lower = calc_id(worker_id);
n_tokens_upper = calc_id(worker_id + 1);
}
}
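token_bounds splits n_tokens across the workers into ranges that differ in size by at most one token, giving the first (n_tokens % workers) workers the larger chunk. A host-side sketch of the same partitioning (illustrative only):
// Compute the [lower, upper) token range handled by one worker.
static void token_bounds_ref(int n_tokens, int workers, int worker_id,
                             int& lower, int& upper) {
  const int chunk = n_tokens / workers;
  const int residual = n_tokens % workers;
  auto start = [&](int id) {
    return id < residual ? id * (chunk + 1) : id * chunk + residual;
  };
  lower = start(worker_id);
  upper = start(worker_id + 1);
}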
template <int BLOCK_COUNT, int SMEM_SIZE_BYTES_Y, typename fp8_type,
int THREADS, typename Idx_t, bool USE_UE8M0, int GROUP_SIZE = 128,
int NUM_STAGES = 3>
__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
float* __restrict__ _y_s, const int32_t* __restrict__ counts,
float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert,
// sizes
int H, int G,
Idx_t E, Idx_t T, Idx_t H,
// strides (in elements)
Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
Idx_t stride_ys_g, Idx_t stride_counts_e) {
#ifndef USE_ROCM
static constexpr int NUM_WARPS = THREADS / WARP_SIZE;
static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8;
static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE;
static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4;
static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES;
extern __shared__ __align__(16) __int128_t smem_128[];
int* s_expert_offsets =
reinterpret_cast<int*>(smem_128 + (SMEM_SIZE_BYTES_Y / 16));
static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
// We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
// We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
int tid = threadIdx.x;
int warp_id = tid >> 5;
int lane_id = tid & 0x1f;
// We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
static constexpr int32_t BFLOAT16_PER_GROUP = 8;
int running_sum{};
if (!warp_id) {
for (int i = 0; i < E; i += WARP_SIZE) {
bool valid = (i + threadIdx.x) < E;
int value =
(valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) +
(!lane_id ? running_sum : 0);
// We split the shared memory in half, corresponding to gate and up matrices:
// [...gate_i, ...up_i] where 0 <= i < stages.
static constexpr int32_t S_NUM_128 =
2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
__shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
for (int offset = 1; offset < 32; offset *= 2) {
int n = __shfl_up_sync(0xFFFFFFFFu, value, offset);
if (lane_id >= offset) value += n;
}
const int32_t tid = threadIdx.x;
const int32_t warp_id = tid / WARP_SIZE;
const int32_t lane_id = tid % WARP_SIZE;
if (valid) {
s_expert_offsets[i + threadIdx.x + 1] = value;
}
auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1);
}
// block handles one (expert e, group g)
int32_t pid = blockIdx.x;
int32_t e = pid / G;
int32_t g = pid % G;
const int32_t n_tokens = counts[e * stride_counts_e];
if (!n_tokens) {
return; // Exit ASAP.
if (!lane_id) {
s_expert_offsets[0] = 0;
}
}
const Idx_t stride_i_t_128 = stride_i_t / 8u;
__syncthreads();
int32_t n_tokens_lower, n_tokens_upper;
int32_t total_tokens = s_expert_offsets[E];
const int warp_position_yq = warp_id * (H / NUM_WARPS);
const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS));
// A single block will handle tokens_per_block tokens.
// Each block i iterates over tokens of a slice of n_tokens =
// expert_counts[i], with the size of chunk being
// (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
// updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
// Specialize this, but can be likely fused.
if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
return;
}
n_tokens_lower = blockIdx.y;
n_tokens_upper = blockIdx.y + 1;
} else {
auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
auto calc_id = [&](int32_t id) {
if (id < residual) {
return min(n_tokens, id * (chunk_size + 1));
} else {
return min(n_tokens, id * chunk_size + residual);
}
};
n_tokens_lower = calc_id(blockIdx.y);
n_tokens_upper = calc_id(blockIdx.y + 1);
}
if (n_tokens_lower >= n_tokens_upper) {
// Each warp will get space to store its hidden dim for gate and up.
__int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES);
__int128_t* smem_load_ptr = s_hidden_load + lane_id;
const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max);
int32_t compute_pipeline_offset_64 = 0;
int32_t load_stage_offset{};
const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f);
__int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) +
warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) +
lane_id;
__int64_t* s_gate64_ptr = smem_compute_ptr;
__int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4;
int tokens_lower, tokens_upper;
token_bounds<BLOCK_COUNT>(total_tokens, blockIdx.x, tokens_lower,
tokens_upper);
Idx_t expert_id{}, expert_offset{}, next_expert_offset{};
int token_id = tokens_lower;
int32_t t_load{};
if (token_id < tokens_upper) {
expert_id = warp_expert_search<int>(lane_id, E, s_expert_offsets, token_id);
expert_offset = s_expert_offsets[expert_id];
next_expert_offset = s_expert_offsets[expert_id + 1];
} else {
// This thread block has no work to do.
return;
}
// We do calculations here, using constexpr wherever possible.
const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
const Idx_t base_yq =
e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
stride_i_t_128 * n_tokens_lower;
auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
auto y_s_ptr =
_y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
stride_yq_t * n_tokens_lower + 4 * lane_id;
int32_t t_load = n_tokens_lower, load_stage_id = 0;
auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
int32_t stage_offset{};
int t_load_bound = H / (GROUP_SIZE * NUM_WARPS);
static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
static constexpr int32_t LOAD_STAGE_MOD =
NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
Idx_t base_i = ((expert_id * stride_i_e) / 8) +
(token_id - expert_offset) * stride_i_t / 8;
const Idx_t gate_warp_offset =
warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111);
const __int128_t* input_128_ptr =
reinterpret_cast<const __int128_t*>(_input) + gate_warp_offset +
((lane_id < 16) ? 0 : ((H * stride_i_h) / 8));
__int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto token_offset = token_id - expert_offset;
// Two halves of all threads in a block conduct global loads for gate and up,
// respectively.
auto load_and_advance_y_pred = [&] {
if (t_load < n_tokens_upper) {
auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
if (t_load < t_load_bound) {
// Here we are simply continuing to load data
// from the current token.
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
stage_offset += LOAD_STAGE_SIZE;
stage_offset %= LOAD_STAGE_MOD;
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
if (tid < HALF_THREAD_COUNT) {
cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
gate_128_ptr += stride_i_t_128;
} else {
cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
up_128_ptr += stride_i_t_128;
}
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load;
} else if (token_id + 1 < tokens_upper) {
// We loaded everything from the current token, let's move on
// to the next one, and we checked that we have more tokens to load.
++token_id;
t_load = 0;
if (token_id >= next_expert_offset) {
// We need to find the next expert.
do {
// This is a loop because it's possible
// that some experts are assigned 0 tokens.
// NOTE: We are guaranteed that there's at least
// one more token left so we don't have to check for
// expert_id bounds.
++expert_id;
// This skips 1 memory read.
expert_offset = next_expert_offset;
next_expert_offset = s_expert_offsets[expert_id + 1];
} while (next_expert_offset == expert_offset);
base_i = expert_id * (stride_i_e / 8);
token_offset = 0;
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
} else {
// We remain within the same expert, so just
// move by H/4 __int128_t (2 * H/8).
base_i += stride_yq_t / 4;
token_offset++;
}
load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i);
auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset;
// It is very important that LOAD_STAGE_SIZE is constexpr to avoid
// unnecessary ALU ops.
load_stage_offset += LOAD_STAGE_SIZE;
load_stage_offset %= LOAD_STAGE_MOD;
cp_async4(smem_load_ptr_staged, load_ptr);
load_ptr += GROUP_SIZE / 8;
++t_load;
++load_stage_id;
}
// We fence even if there is nothing to load to simplify pipelining.
cp_async_fence();
};
// We need to warm-up the pipeline.
#pragma unroll
for (int i = 0; i < NUM_STAGES - 1; i++) {
load_and_advance_y_pred();
}
__int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
lane_id;
__int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
__nv_fp8x4_e4m3* y_q_base_ptr =
reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id;
auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g;
static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
for (auto j = tokens_lower; j < tokens_upper; j++) {
const Idx_t base_ys = expert_id * stride_ys_e;
auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t;
__nv_fp8x4_e4m3* y_q_ptr =
y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t +
warp_position_yq * stride_yq_h) /
4;
const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS);
int32_t compute_pipeline_offset_64 = 0;
for (int i = 0; i < COMPUTE_LIMIT; i++) {
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
load_and_advance_y_pred();
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__nv_bfloat162 results_bf162[2];
__int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64;
__int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64;
cp_async_wait<NUM_STAGES - 2>();
__syncthreads();
// COMPUTE_STAGE_SIZE/MOD must also be constexpr!
compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE;
compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD;
// We double-buffer pipelined loads so that the next load will
// concurrently run with compute without overwrites.
load_and_advance_y_pred();
__int64_t gate64 = *gate64_ptr;
__int64_t up64 = *up64_ptr;
auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
// STAGE_SIZE must also be constexpr!
compute_pipeline_offset_64 += STAGE_SIZE;
compute_pipeline_offset_64 %= STAGE_MOD;
// Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
__int64_t gate64 = *s_gate_compute_64;
__nv_bfloat162* s_gate_compute_32 =
reinterpret_cast<__nv_bfloat162*>(&gate64);
__int64_t up64 = *s_up_compute_64;
__nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
// Compute
__nv_bfloat162 res[2];
__nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64);
__nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64);
#pragma unroll
for (int i = 0; i < 2; i++) {
// For silu, we make sure that div is emitted.
float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
results_bf162[i] = __float22bfloat162_rn(gate);
}
for (int32_t k = 0; k < 2; ++k) {
__nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k]));
res[k] = __hmul2(gate, s_up_comp[k]);
}
auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1]));
_y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS);
__nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv);
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
__nv_bfloat16 inv_y = __hdiv(one_bf16, y_s);
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll
for (int i = 0; i < 2; i++) {
results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
}
for (int32_t k = 0; k < 2; ++k) {
res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
auto _y_max2 =
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
*y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]);
y_q_ptr += WARP_SIZE * stride_yq_h;
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
// An entire group is assigned to a single warp, so a simple warp reduce
// is used.
__nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
if constexpr (USE_UE8M0) {
y_s = hexp2(hceil(hlog2(y_s)));
}
auto inv_y = __float2bfloat16_rn(1.f) / y_s;
auto y_s2 = make_bfloat162(inv_y, inv_y);
#pragma unroll
for (int32_t i = 0; i < 2; ++i) {
results_bf162[i] =
clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
__bfloat162bfloat162(fp8_max));
}
auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
*reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
y_q_ptr += stride_yq_t;
if (lane_id == 0) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_t;
if (!lane_id) {
*y_s_ptr = y_s;
y_s_ptr += stride_ys_g;
}
}
}
}
#endif
}
} // namespace vllm
@ -475,14 +568,14 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d]
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
void silu_mul_fp8_quant_deep_gemm_cuda(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& counts, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
void persistent_masked_m_silu_mul_quant(
const at::Tensor& input, // (E, T, 2*H)
const at::Tensor& tokens_per_expert, // (E)
at::Tensor& y_q, // (E, T, H) [OUT]
at::Tensor& y_s, // (E, T, H//group_size) [OUT]
bool use_ue8m0) {
#ifndef USE_ROCM
// This kernel relies heavily on cp.async and fp8 support.
// This kernel currently only supports H % 128 == 0 and assumes a
// fixed GROUP_SIZE of 128.
TORCH_CHECK(input.dtype() == torch::kBFloat16);
@ -491,10 +584,6 @@ void silu_mul_fp8_quant_deep_gemm_cuda(
TORCH_CHECK(y_s.dtype() == torch::kFloat32);
TORCH_CHECK(input.size(-1) % 256 == 0);
// Check that num_parallel_tokens is of power of 2 and between 1 and 64.
TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
using Idx_t = int64_t;
Idx_t E = input.size(0);
@ -510,81 +599,54 @@ void silu_mul_fp8_quant_deep_gemm_cuda(
Idx_t stride_ys_t = y_s.stride(1);
Idx_t stride_ys_g = y_s.stride(2);
Idx_t stride_counts_e = counts.stride(0);
Idx_t stride_counts_e = tokens_per_expert.stride(0);
static constexpr int GROUP_SIZE = 128;
#define KERNEL_FN \
if (use_ue8m0) { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, true> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
} else { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t, \
NUM_PARALLEL_TOKENS, false> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G, \
stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g, \
stride_counts_e); \
}
#define KERNEL_CALL_H \
if (H % (4 * GROUP_SIZE) == 0) { \
static constexpr int NUM_WARPS = 4; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
} else { \
static constexpr int NUM_WARPS = 1; \
populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
KERNEL_FN \
}
#define KERNEL_CALL_TOP_LEVEL \
if (num_parallel_tokens == 1) { \
static constexpr int NUM_PARALLEL_TOKENS = 1; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 2) { \
static constexpr int NUM_PARALLEL_TOKENS = 2; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 4) { \
static constexpr int NUM_PARALLEL_TOKENS = 4; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 8) { \
static constexpr int NUM_PARALLEL_TOKENS = 8; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 16) { \
static constexpr int NUM_PARALLEL_TOKENS = 16; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 32) { \
static constexpr int NUM_PARALLEL_TOKENS = 32; \
KERNEL_CALL_H \
} else if (num_parallel_tokens == 64) { \
static constexpr int NUM_PARALLEL_TOKENS = 64; \
KERNEL_CALL_H \
}
Idx_t G;
dim3 block, grid;
auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
G = H / Idx_t(group_size * num_warps);
grid = dim3(E * G, _num_parallel_tokens);
block = dim3(num_warps * WARP_SIZE);
};
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
"silu_mul_fp8_quant_deep_gemm_kernel",
[&] { KERNEL_CALL_TOP_LEVEL });
#define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \
static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \
int sms = SILU_V2_BLOCK_COUNT; \
static constexpr int max_shared_mem_bytes = \
GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2; \
dim3 grid(sms), block(THREAD_COUNT); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
VLLM_DISPATCH_FP8_TYPES( \
y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \
vllm::silu_mul_fp8_quant_deep_gemm_kernel< \
BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \
USE_UE8M0, GROUP_SIZE, STAGES> \
<<<grid, block, max_shared_mem_bytes + (E + 1) * 16, stream>>>( \
reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \
(fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(), \
reinterpret_cast<int32_t*>(tokens_per_expert.data_ptr()), E, \
T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \
stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \
stride_ys_g, stride_counts_e); \
});
static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32;
if (!use_ue8m0) {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2);
}
} else {
if (H >= 4096) {
static constexpr int NUM_STAGES = 4;
static constexpr int THREAD_COUNT = 256;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES);
} else {
static constexpr int THREAD_COUNT = 32;
KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2);
}
}
#endif
}
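For reference, the per-group math inside the kernel is: y = silu(gate) * up, scale = max(|y|, EPS) / fp8_max (rounded up to a power of two when use_ue8m0 is set), and y_q = clip(y / scale, fp8_min, fp8_max). A scalar sketch of one GROUP_SIZE group (illustrative only; the 448 limit assumes e4m3, whereas the kernel picks its bounds via get_fp8_min/get_fp8_max, and kEps stands in for the kernel's bf16 EPS constant):
#include <algorithm>
#include <cmath>
// Quantize one group: y = silu(gate) * up, scale by max(|y|)/fp8_max
// (optionally rounded up to a power of 2), then clip to the fp8 range.
static void silu_mul_quant_group_ref(const float* gate, const float* up,
                                     float* y_q, float& y_s, int group_size,
                                     bool use_ue8m0) {
  constexpr float kFp8Max = 448.f;  // e4m3 finite max (assumption)
  constexpr float kEps = 1e-10f;    // guard for all-zero groups (assumption)
  float amax = 0.f;
  for (int i = 0; i < group_size; ++i) {
    const float g = gate[i];
    const float y = (g / (1.f + std::exp(-g))) * up[i];  // silu(gate) * up
    y_q[i] = y;
    amax = std::max(amax, std::fabs(y));
  }
  float scale = std::max(amax, kEps) / kFp8Max;
  if (use_ue8m0) scale = std::exp2(std::ceil(std::log2(scale)));  // power-of-2 scale
  for (int i = 0; i < group_size; ++i)
    y_q[i] = std::min(std::max(y_q[i] / scale, -kFp8Max), kFp8Max);  // cast to fp8 in the kernel
  y_s = scale;
}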

View File

@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
}
TORCH_CHECK(weight.dtype() == input.dtype());
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
}
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

View File

@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/fp8/common.cuh"
#include "quantization/w8a8/fp8/common.cuh"
namespace vllm {

View File

@ -9,7 +9,6 @@ from collections.abc import Iterable
from copy import deepcopy
from dataclasses import dataclass, fields
from functools import reduce
from typing import Optional, Union
import jinja2
from vllm_cutlass_library_extension import (
@ -259,7 +258,7 @@ class ScheduleConfig:
@dataclass(frozen=True)
class TypeConfig:
a: DataType
b: Union[DataType, VLLMDataType]
b: DataType | VLLMDataType
b_group_scale: DataType
b_group_zeropoint: DataType
b_channel_scale: DataType
@ -280,7 +279,7 @@ class PrepackTypeConfig:
class ImplConfig:
types: TypeConfig
schedules: list[ScheduleConfig]
heuristic: list[tuple[Optional[str], ScheduleConfig]]
heuristic: list[tuple[str | None, ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
