Compare commits

...

274 Commits

Author SHA1 Message Date
b6553be1bc [Misc] Slight improvement of the BNB (#19418)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-10 13:51:49 +00:00
64a9af5afa Simplify ep kernels installation (#19412)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-10 20:06:08 +08:00
e4248849ec [BugFix][CPU] Fix CPU CI by ignore collecting test_pixtral (#19411)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-10 12:02:40 +00:00
467bef18a3 [BugFix][FlashInfer] Fix attention backend interface mismatch with unexpected keyword use_irope (#19134)
Signed-off-by: Yunqiu Guo <guorachel@meta.com>
2025-06-10 16:48:51 +08:00
5f1ac1e1d1 Revert "[v1] Add fp32 support to v1 engine through flex attn" (#19404) 2025-06-10 01:30:20 -07:00
9368cc90b2 Automatically bind CPU OMP Threads of a rank to CPU ids of a NUMA node. (#17930)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-06-10 06:22:05 +00:00
32b3946bb4 Add clear documentation around the impact of debugging flag (#19369)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-10 06:16:09 +00:00
6b1391ca7e [Misc] refactor neuron_multimodal and profiling (#19397)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 06:12:42 +00:00
a3f66e75d1 Add security warning to bug report template (#19365)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-06-10 06:06:36 +00:00
319cb1e351 [Core] Batch multi modal input using pinned memory (#19169)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-10 13:44:59 +08:00
1efef71645 [Bugfix] Fix modelscope token passed in (#19389)
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-10 13:39:37 +08:00
646d62f636 [Core] Use tuple for kv cache group block ids (#19175)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-10 07:01:17 +02:00
6cd4ae8acd [Frontend] Add tqdm_leave_pbar to control progress bar visibility (#19357)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 04:55:09 +00:00
c016047ed7 Fix docs/mkdocs/hooks/remove_announcement.py (#19382) 2025-06-09 21:36:54 -07:00
9af6d22e4c Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-06-10 01:28:45 +00:00
4589b94032 [Bugfix] Fix benchmark_moe.py (#19016)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
2025-06-09 18:04:36 -07:00
cc867be19c [V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-10 08:40:01 +08:00
3a7cd627a8 [Misc] Fix a config typo in disable_hybrid_kv_cache_manager configuration (#19383)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-09 16:41:51 -07:00
8058c91108 [HOT-FIX] Add kv_sharing_target_layer_name argument to cutlass_mla backend (#19374)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-06-09 19:00:07 -04:00
7d44c469fe [TPU]Fix KV cache sharing tests (#19371) 2025-06-09 18:38:15 -04:00
31f58be96a [Frontend] Make TIMEOUT_KEEP_ALIVE configurable through env var (#18472)
Signed-off-by: liusiqian <liusiqian@tal.com>
2025-06-09 21:41:21 +00:00
ebb2f383b8 [Quantization] Bump compressed-tensors version (#19295)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-09 14:33:15 -07:00
c1c7dbbeeb [Bugfix][Core] Prevent token lengths exceeding max_model_len in V0 (#19348)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-09 23:01:29 +08:00
5cf2daea9a [Misc] Fixes and Optimizations for DeepEP + DeepGEMM combination. (#19298)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-09 10:50:39 -04:00
b8089195b4 [v1] Add fp32 support to v1 engine through flex attn (#19319)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-09 22:10:44 +08:00
770e5dcdb8 [full_graph] Fix query_start_loc padding (#19321)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-06-09 21:32:56 +08:00
c57c9415b1 [Docs] Fix a bullet list in usage/security.md (#19358)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-09 13:28:51 +00:00
01810f9236 [CI] Introduce rules for llama auto-label (#19323)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-09 20:05:42 +08:00
59abbd84f9 [Fix] Allow kernel compilation for CUDA capability 8.7 (#19328)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
2025-06-09 02:57:23 -07:00
95a6568b5c [CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-09 09:52:10 +00:00
0eca5eacd0 [Doc] Fix description in the Automatic Prefix Caching design doc (#19333)
Signed-off-by: cr7258 <chengzw258@163.com>
2025-06-09 17:30:02 +08:00
12e5829221 [doc] improve ci doc (#19307)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-09 07:26:12 +00:00
3a4d417707 [Misc] Cleanup compilation tests (#19343)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-09 15:05:44 +08:00
8335667c22 [Frontend] Remove unreachable code from llm.py (#19288)
Signed-off-by: KsuParkhamchuk <k.parkhamchuk@gmail.com>
2025-06-09 10:22:10 +08:00
e1c4380d4c [Misc] Add documentation update reminder to PR template (#19289)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-09 10:20:53 +08:00
e31ae3de36 [Deprecation] Remove inputs arg fallback in Engine classes (#18799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-09 10:19:56 +08:00
2ffb9b6e07 [Bugfix] model_max_length should consider max_model_len in tokenizer_config (#19201) 2025-06-08 07:17:53 -07:00
cda10fa3e2 [Multi Modal] Add an env var for message queue max chunk bytes (#19242)
Signed-off-by: yZhen <yZhen@fb.com>
Co-authored-by: yZhen <yZhen@fb.com>
2025-06-08 21:39:12 +08:00
c123bc33f9 [Quantization] Add compressed-tensors NVFP4 support (#18312) 2025-06-08 09:05:55 -04:00
b9a1791e2c [Hardware][POWER] Add IBM POWER11 Support to CPU Extension Detection (#19082)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-06-08 09:17:14 +00:00
989dcee981 Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B (#19315)
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-06-08 16:07:02 +08:00
3d64d366e0 [Misc] Change tests/compile to use VLLM_V1 by default (#19302)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-08 16:06:48 +08:00
eaa2e51088 [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-08 08:56:12 +08:00
d77f7fb871 [Bugfix]: Fix TypeError: 'float' object cannot be interpreted as an integer (#19283)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-08 08:16:31 +08:00
2d8476e465 [BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-07 10:34:51 -07:00
88be823d57 [AMD] Update compatible packaging version (#19309)
Signed-off-by: pramkuma <Pramendra.Kumar@amd.com>
2025-06-07 20:55:09 +08:00
4e4f63ad45 [Nit][Benchmark]Fix example in benchmark_serving_structured_output.py (#19311)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-06-07 18:25:38 +08:00
d2f0e7e615 [CI/Build] Improve Llama GGUF test robustness (#19287)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-07 17:23:28 +08:00
122cdca5f6 [Misc] refactor context extension (#19246)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-07 05:13:21 +00:00
cf02f9b283 Add FlexAttention to V1 (#16078)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-06 21:58:55 -07:00
c4296b1a27 [CI][PowerPC] Use a more appropriate way to select testcase in tests/models/language/pooling/test_embedding.py (#19253)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-06-07 11:52:52 +08:00
66c508b137 [TPU][Test] Add script to run benchmark on TPU for buildkite (#19039)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-06 20:10:24 -07:00
84166fee97 [Kernel] Integrate CUTLASS MoE kernel with PPLX (#18762)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-06 18:26:11 -07:00
6e0cd10f72 [Easy][Test] Simplify test_function_tool_use with multiple parametrizes (#19269)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-07 09:19:09 +08:00
e010688f50 [Build][ROCm] Update Dockerfile.rocm (#19296)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-06 19:35:16 -04:00
441b65d8c7 [Misc][Tools][Benchmark] Fix and improve auto tune script (#19163)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-06 23:31:19 +00:00
46ecc57973 [BugFix] Fix tpu_model_runner block_id concatenation (#19228)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:28:17 -07:00
b6a3a9f76d [Core] Fix abrupt request abort (#18485)
Signed-off-by: nicklucche <nlucches@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>

Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:27:59 -07:00
ca27f0f9c1 [Bugfix][Core] Update cancellation logic in generate() to handle Generator exits (#19225)
Co-authored-by: Adolfo Victoria <adovi@meta.com>
2025-06-06 20:17:54 +00:00
aad30bd306 [BugFix] Fix MultiConnector test after HMA changes (#19291)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 20:16:24 +00:00
94ecee6282 Fixed ppc build when it runs on non-RHEL based linux distros (#18422)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-06-06 11:54:26 -07:00
8267f9916f improve logits bias (#19041) 2025-06-06 19:59:25 +08:00
7353492a47 [Core] Raise when non-multi-instance DP clients target a DP rank (#19227)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-06 19:03:01 +08:00
7661e92ef8 [Model] Optimize nemotron_h implementation (#19249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-06 10:05:14 +00:00
f168b85725 Unit Test for run_dp_sharded_vision_model (#19103)
Signed-off-by: Siqi Yan <siqi@meta.com>
Co-authored-by: Siqi Yan <siqi@meta.com>
2025-06-06 16:24:02 +08:00
da511d54d8 Fix CompilationConfig repr (#19091)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-06 16:23:35 +08:00
65c69444b1 [Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:22:45 +08:00
94870359cd [Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-06-06 01:21:54 -07:00
0d49483ea9 [TPU] fix kv cache dtype in model runner (#19244)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 16:20:16 +08:00
90b78ec5f9 [v1][P/D] Fix a edge case in kv cache schedule (#19182)
Co-authored-by: jinghui <jinghui@fb.com>
2025-06-05 23:32:55 -07:00
91a2ef98ea [Chore] update CODEOWNERS (#19247)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-06 06:09:43 +00:00
3da2313d78 Support allowed_token_ids in ChatCompletionRequest (#19143)
Signed-off-by: Xu Song <xusong.vip@gmail.com>
2025-06-06 05:06:48 +00:00
b61dc5f972 [TPU] update torch_xla pin (#19231)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 04:27:38 +00:00
f8a1a2d108 [v1] Hybrid Memory Allocator (#17996)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-05 20:47:09 -07:00
3465b87ef8 [Bugfix] Fix EAGLE vocab embedding construction for Llama 70B (#19033)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-06-05 19:10:08 -07:00
c8134bea15 Fix AOPerModuleConfig name changes (#18869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-06-05 18:51:32 -07:00
cb6d572e85 [Model] NemotronH support (#18863)
Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
2025-06-05 21:29:28 +00:00
87360308b7 [V1] Use FlashInfer by default on Blackwell GPUs (#19118) 2025-06-05 15:40:39 -04:00
aa49f14832 [Quantization] Skip Fp4 Test for compressed-tensors (#19217) 2025-06-05 18:21:53 +00:00
9ef9173cfa [P/D][NixlConnector] Enable FlashInfer backend (#19090) 2025-06-05 17:10:15 +00:00
85e2b7bb13 [MISC][Bugfix] Use less CPU when message queue has been empty for some time (#16226)
Signed-off-by: Povilas Kanapickas <povilas@radix.lt>
2025-06-05 16:53:08 +00:00
61059bee40 [Hardware][NVIDIA] FP4 MoE kernel optimization (#19110)
Signed-off-by: Chiyue Wei <chiyuew@nvidia.com>
Co-authored-by: Chiyue Wei <chiyuew@nvidia.com>
2025-06-05 09:48:26 -07:00
ec89524f50 Add H20-3e fused MoE kernel tuning configs for DeepSeek-R1/V3 (#19205) 2025-06-05 16:38:54 +00:00
f20f9f063b [mistral_common] Add v11 tokenizer (#19193)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-06-05 08:27:41 -07:00
9bc8bb07cf [Bugfix] properly catch PIL-related errors for vision models when incorrect data urls are provided (#19202)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-06-05 12:59:28 +00:00
1aeb925f34 [Frontend] improve vllm run-batch --help display (#19187)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 11:16:25 +00:00
188a4590d8 [Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-05 11:14:32 +00:00
18093084be [Misc] Remove unnecessary fallback to prefill-decode attention (#19138)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-06-05 16:08:26 +08:00
da40380214 [Build] Annotate wheel and container path for release workflow (#19162)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-04 23:24:56 -07:00
8fc57501d3 [Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-05 06:24:24 +00:00
af7fc84fd2 [BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-05 13:41:25 +08:00
0678b52251 Handle non-serializable objects when dumping benchmark results (#19114) 2025-06-04 22:40:04 -07:00
25b918eee6 [Torch Nightly]add missing dependency (#18770)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-06-04 21:56:12 -07:00
a408820f2f [Bugfix] Fix port handling in make_zmq_path (#19117) 2025-06-04 21:00:59 -06:00
c56ed8bb0e [Bugfix][Nixl] Fix full prefix cache hit bug (#18632)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-05 02:07:32 +00:00
78dcf56cb3 [doc] small fix (#19167)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 09:13:50 +08:00
b2fac67130 [P/D] Heterogeneous TP (#18833)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-04 23:25:34 +00:00
23027e2daf [Misc] refactor: simplify EngineCoreClient.make_async_mp_client in AsyncLLM (#18817)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-04 15:37:25 -07:00
c3fd4d669a [Kernel] Integrate batched/masked deepgemm kernel (#19111)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-04 21:59:18 +00:00
ef3f98b59f [Bugfix] fix v1 cpu worker fails on macOS (#19121) 2025-06-04 20:17:38 +00:00
7ee2590478 [TPU] Update dynamo dump file name in compilation test (#19108)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 16:13:43 -04:00
53a5a0ce30 [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-04 10:46:28 -07:00
d459fae0a2 [Bugfix][EP+DP] Fix internode check (#19112)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-04 23:39:23 +08:00
c8dcc15921 Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-04 08:26:47 -07:00
8f4ffbd373 [Doc] Update V1 Guide for embedding models (#19141)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 22:57:55 +08:00
5f2cd251d2 Sm100 blockwise fp8 swap ab (#18564) 2025-06-04 07:48:45 -07:00
02658c2dfe Add DeepSeek-R1-0528 function call chat template (#18874)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-04 13:24:18 +00:00
01dc9a76db [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 04:49:20 -07:00
35cf32df30 Improve the output precision of embedding models (#19092) 2025-06-04 11:48:57 +00:00
8711bc5e68 [Misc] Add packages for benchmark as extra dependency (#19089)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-04 04:18:48 -07:00
2669a0d7b5 Fix ValueError: Missing value for tag key(s): model_name,engine. (#19113)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-06-04 17:10:45 +08:00
8e972d9c44 [TPU] Skip hanging tests (#19115)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 01:43:00 -07:00
3336c8cfbe Fix #19130 (#19132)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-04 01:42:06 -07:00
b124e1085b [Bugfix] Fix FA3 full cuda graph correctness (#19106)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-03 23:10:15 -07:00
41aa578428 [NVIDIA] Add Cutlass MLA backend (#17625) 2025-06-03 21:40:26 -07:00
8d646c2e53 [Cleanup][v1]:remote guided-decoding-backend for example (#19059)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-04 04:23:26 +00:00
5d6d1adf15 [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) 2025-06-03 21:13:01 -07:00
1409ef9134 [Core] Cast multimodal input in hf processor (#18862)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-03 20:24:56 -07:00
4555143ea7 [CPU] V1 support for the CPU backend (#16441) 2025-06-03 18:43:01 -07:00
52dceb172d [Docs] Add developer doc about CI failures (#18782)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-04 01:09:13 +00:00
abd7df2fca [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) 2025-06-03 17:15:18 -07:00
b712be98c7 feat: add data parallel rank to KVEventBatch (#18925) 2025-06-03 17:14:20 -07:00
a8da78eac9 [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-04 00:14:06 +00:00
5d96533e22 [Bugfix][P/D] Fix Prefix Cache Bug (#18411)
Signed-off-by: nicklucche <nlucches@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-06-03 23:53:16 +00:00
4de790fcad [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-03 23:27:24 +00:00
b5fd9506c1 [Bugfix] get_num_blocks_to_allocate with null_block (#19031)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 15:30:55 -07:00
135cf55cd1 [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) 2025-06-03 15:26:33 -07:00
6cac54f4d1 [v1] Re-init input batch for multiple kv cache groups (#18654)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 21:41:36 +00:00
6865fe0074 Fix interaction between Optional and Annotated in CLI typing (#19093)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikun@apache.org>
2025-06-03 21:07:19 +00:00
e31446b6c8 [Perf] Tune scaled_fp8_quant by increasing vectorization (#18844)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 13:48:25 -07:00
bdf13965ab [V1] Support cross-layer KV sharing (#18212)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-06-03 20:33:07 +00:00
fa98d77773 [Kernel] DeepEP dispatch-combine kernel integration (#18434)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-03 12:30:02 -07:00
01eee40536 [doc] update docker version (#19074)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 19:08:21 +00:00
19bdaf32b1 [Doc] Readme standardization (#18695)
Co-authored-by: Soren Dreano <soren@numind.ai>
2025-06-03 11:50:55 -07:00
02f0c7b220 [Misc] Add SPDX-FileCopyrightText (#19100)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-06-03 11:20:17 -07:00
d054da1992 [Misc] fix: add miss best_of param validation (#18555)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-03 11:02:07 -07:00
4b7817c119 [Misc] Add missing _Backend enums (#19081)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-03 16:15:16 +00:00
d00dd65cd4 [Doc] Improve the Pull Request template with key components (#19086)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 23:44:34 +08:00
d81edded69 [Bugfix] disable processor cache (#19068)
Signed-off-by: raushan <raushan@huggingface.co>
2025-06-03 15:06:04 +00:00
476844d44c Fix underscores in dict keys passed via CLI (#19030)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-06-03 14:39:24 +00:00
4e68ae5e59 [CI/Build] Remove V0 LoRA test (#19066)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 14:30:18 +00:00
4e88723f32 [doc] clarify windows support (#19088)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-03 21:42:17 +08:00
118ff92111 [Doc] Update V1 user guide for embedding and enc-dec models (#19060)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-03 02:29:41 -07:00
ec2dcd80bc [Misc] Update WeightsMapper for qwen2-vl/qwen2.5-vl (#19054)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-03 09:08:20 +00:00
42243fbda0 [Doc] Add InternVL LoRA support (#19055)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 09:08:03 +00:00
6d18ed2a2e Update docker docs with ARM CUDA cross-compile (#19037)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-06-03 08:21:53 +00:00
f32fcd9444 [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 08:01:48 +00:00
d32aa2e670 [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 00:16:17 -07:00
cc977286e7 Reduce logs in CLI scripts and plugin loader (#18970)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 06:00:45 +00:00
17430e3653 [bugfix] small fix logic issue (#18999)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 05:35:12 +00:00
1282bd812e Add tarsier model support (#18985)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-03 13:13:13 +08:00
bdce64f236 [V1] Support DP with Ray (#18779) 2025-06-02 21:15:13 -07:00
9e6f61e8c3 [ROCm][Build] Clean up the ROCm build (#19040)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 20:47:47 -07:00
8655f47f37 [CPU][CI] Re-enable the CPU CI tests (#19046)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-02 20:46:47 -07:00
4ce42f9204 Adding "LoRA Test %N" to AMD production tests (#18929)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-06-02 20:46:44 -07:00
8a57872b2a [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-03 11:36:51 +08:00
5bc1ad6cee [Doc] Remove duplicate TOCs during MkDocs migration (#19021)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-02 19:49:48 -07:00
9112b443a0 [Hardware][TPU] Initial support of model parallelism with single worker using SPMD (#18011)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Hossein Sarshar <hossein.sarshar@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-06-03 00:06:20 +00:00
c57d577e8d add an absolute path for run.sh (#18258)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-02 19:38:23 +00:00
ca2f6b9c30 [Bugfix][Model] Attempt to fix eagle in V0. (#18978)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 08:15:53 -07:00
20133cfee2 [Frontend] enable custom logging for the uvicorn server (OpenAI API server) (#18403)
Signed-off-by: François Paupier <francois.paupier@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-02 15:04:23 +00:00
ebb1ec9318 [Model] enable data parallel for Llama4 vision encoder (#18368)
Signed-off-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
Co-authored-by: yZhen <yZhen@fb.com>
Co-authored-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
2025-06-02 19:22:54 +08:00
5b168b6d7a [doc] add pytest tips (#19010)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-02 11:07:26 +00:00
9760fd8f6a [Core] Support inplace model weights loading (#18745)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-02 17:38:50 +08:00
b9f61e1387 [Bugfix][Nixl] Fix DP Metadata Handshake (#19008)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-02 03:30:41 +00:00
d6fd3a33b8 [Misc] reuse num_tokens_across_dp of get_dp_padding to avoid unnecessary dp all reduce in set_forward_context (#18935)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-01 19:41:18 +00:00
432ec9926e [doc] wrong output (#19000)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-01 11:26:14 +00:00
2b102d51ad [BugFix] Fix incorrect metrics shutdown error log message (#18992)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-01 11:42:23 +08:00
aa54a7bf7b [BugFix] fix data parallel construct ipv6 url addres (#18991)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-06-01 11:42:10 +08:00
2ad6194a02 Let max_num_batched_tokens use human_readable_int for large numbers (#18968)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-01 11:41:29 +08:00
c594cbf565 [doc] small fix - mkdocs (#18996)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 20:23:43 -07:00
a35ca765a5 [LoRA] Support dynamically initialize packed_modules_mapping for VLM with arbitrary components (#18987)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-01 11:06:57 +08:00
6aa8f9a4e7 [Core] Rework dtype resolution (#18751)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-01 11:04:23 +08:00
1bc86a3da1 [Bugfix] Fix EAGLE3 broken logits (#18909)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-05-31 19:58:07 -07:00
bbfa0c61d1 [Misc][Benchmark] Add support for CustomDataset (#18511) 2025-05-31 19:07:38 +00:00
20079c6e36 [Misc] add return token strs for tokenize (#18941)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 18:00:11 +00:00
9a1b9b99d7 [BugFix] Fix multi-node offline data-parallel (#18981)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-31 08:34:52 -07:00
8bf507d766 [P/D] NixlConnector use cache device index for memory registration (#18969)
Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
2025-05-31 11:19:18 -04:00
306d60401d [ROCm][Kernel] Add gfx950 support for skinny gemms (#18010)
Signed-off-by: charlifu <charlifu@amd.com>
2025-05-31 07:40:05 -07:00
f2c3f66d59 [Bugfix] Fix for issue 17396 (#18773)
Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
2025-05-31 11:58:17 +00:00
0f5e0d567e [FEAT][ROCm] Add AITER grouped topk for DeepSeekV2 (#18825)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-31 03:39:31 -07:00
c55d804672 [BugFix] Pydantic part 2 (#18911)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-31 03:39:28 -07:00
749f5bdd38 [doc] fix the list rendering issue - security.md (#18982)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 10:39:21 +00:00
2a50ef5760 [Neuron] Add Multi-Modal model support for Neuron (#18921)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com>
Co-authored-by: Rohith Nallamaddi <nalrohit@amazon.com>
Co-authored-by: FeliciaLuo <luof@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-31 10:39:11 +00:00
b8b904795d fix security issue of logging llm output (#18980)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-05-31 10:38:56 +00:00
ba5111f237 [Bugfix]: Fix the incompatibility issue with Structured Outputs when Thinking is disabled (#18879)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-31 09:20:54 +00:00
1e123529d7 [Misc] Fix estimated max model len msg (#18966)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-31 16:43:44 +08:00
dff80b0e42 [Frontend] Add rerank support to run_batch endpoint (#16278)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-05-31 07:40:01 +00:00
7782464a17 create util function for batched arange (#18937) 2025-05-31 13:50:38 +08:00
0f71e24034 [Docs] Correct multiprocessing design doc (#18964)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-31 01:30:15 +00:00
1dab4d5718 Tool parser regex timeout handling (#18960)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-30 21:02:54 +00:00
7f21e8052b [Misc] add group_size is -1 in awq quantization (#18910)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-30 17:34:22 +00:00
5a8641638a [VLM] Add PP support and fix GPTQ inference for Ovis models (#18958)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-30 17:11:44 +00:00
f49239cb45 Benchmark script for fp8 vs bf16 gemm (#17126)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:56:11 -06:00
2dbe8c0774 [Perf] API-server scaleout with many-to-many server-engine comms (#17546) 2025-05-30 08:17:00 -07:00
84ec470fca Improve "failed to get the hash of the compiled graph" error (#18956)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 15:00:54 +00:00
b29ca5c4d5 [Docs] Update SECURITY.md with link to our security guide (#18961)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-30 07:37:27 -07:00
ec6833c5e9 [doc] show the count for fork and watch (#18950)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 06:45:59 -07:00
e1fadf1197 [Feature] minicpm eagle support (#18943)
Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com>
Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com>
2025-05-30 06:45:56 -07:00
43ff405b90 [CI/Build] remove regex from build dependencies (#18945)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-30 04:02:50 -07:00
fba02e3bd1 [Bugfix][TPU] Fix tpu model runner testcase failure (#18810)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 18:04:03 +08:00
4577fc9abb [Misc]Fix typo (#18947) 2025-05-30 02:21:35 -07:00
5f1d0c8118 [Bugfix][Failing Test] Fix test_vllm_port.py (#18618)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 17:13:47 +08:00
c3bb9f2331 [Model] Use in-place adds in SigLIP (#18922)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-30 17:12:59 +08:00
8f8900cee9 [doc] add mkdocs doc (#18930)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 07:58:44 +00:00
6acb7a6285 [Misc]Fix benchmarks/README.md for speculative decoding (#18897)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 07:58:04 +00:00
4f4a6b844a [Deprecation] Remove mean pooling default for Qwen2EmbeddingModel (#18913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 06:53:37 +00:00
4d0a1541be [Bugfix] Remove NVFP4 scales assertions to fix load_format=dummy (#18861)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 13:37:36 +08:00
77b6e74fe2 [ROCm] Remove unnecessary assertion of max_model_len in ROCM_AITER_MLA attention backend. (#18938)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-29 22:33:17 -07:00
H
5acf828d99 [docs] fix: fix markdown syntax (#18927) 2025-05-30 05:20:48 +00:00
3987e2ae96 [Model] Use AutoWeightsLoader for mamba2 (#18918)
Signed-off-by: iLeGend <824040212@qq.com>
2025-05-30 04:50:10 +00:00
77164dad5e [Bugfix] Consistent ascii handling in tool parsers (#18883)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-30 04:44:43 +00:00
3de3eadf5b improve the robustness of parsing vlms config in AutoRound (#18894)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
2025-05-29 19:24:47 -07:00
3132290a14 [TPU][CI/CD] Clean up docker for TPU tests. (#18926)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 10:24:19 +08:00
1aa2f81b43 [Misc] Update type annotation for rotary embedding base (#18914)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 10:17:01 +08:00
d54af615d5 [Bugfix] Fix PP default fallback behavior for V1 (#18915)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:13:17 +08:00
a1cc9f33a3 [TPU] remove transpose ops in moe kernel (#18923)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-29 23:00:11 +00:00
a521ef06e5 Use standalone_compile by default in torch >= 2.8.0 (#18846)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 06:41:58 +08:00
64eaf5fe05 [P/D] NixlConnector DP fixes (#18903)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:08:40 +00:00
d1d61f3351 [BugFix] Make DP work with connector-delayed new requests (#18559)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:04:18 +00:00
32ce3cf7c9 [V1] Allocate kv_cache with stride order for V1 (#18775)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-05-29 17:54:16 +00:00
d58f9c7f7a [Misc] Remove duplicate init for self.vllm_config (#18896)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-29 17:26:07 +00:00
c29034037d [Deprecation] Disallow pos-args other than model when initializing LLM (#18802)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-29 09:36:58 -07:00
1b7cfd5a36 [ROCm][V0][Attention] Revert to the previous FA triton kernel (#18226)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 12:13:18 -04:00
da4b69d0b4 [Attention][V1] Toggle for v1 attention backend (#18275)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 10:48:24 -04:00
c9479b2920 [Bugfix] Fix the failing gte embedding test (#18720)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-29 07:39:25 -07:00
6f2909405e [Doc] Fix codeblocks formatting in LoRA adapters documentation (#18907)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-29 07:38:55 -07:00
b169d5f7b6 [Misc][Tools][Benchmark] Add benchmark_serving supports for llama.cpp. (#18692)
Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
2025-05-29 20:02:08 +08:00
f8977c233f Fix an error in dummy weight loading for quantization models (#18855)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-29 03:07:20 -07:00
f274581f44 [BugFix] Update pydantic to fix error on python 3.10 (#18852)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-29 03:05:46 -07:00
0b1447f890 [Bugfix] Ensure tensors are contiguous during serialisation (#18860)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-29 03:05:20 -07:00
24d0ef8970 [Misc] Replace TODO in serving transcription (#18895)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-05-29 02:58:14 -07:00
7fcfd954ff [Bugfix] Fix misleading information in the documentation (#18845)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 02:54:14 -07:00
e740d07f07 [doc] add CLI doc (#18871)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-29 09:51:36 +00:00
a652e71dd0 [Doc] Remove redundant spaces from compatibility_matrix.md (#18891)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-05-29 02:51:20 -07:00
34d6c447c4 [LoRA] Add LoRA support for InternVL (#18842)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 08:46:24 +00:00
972eddf7c9 [Neuron] Add multi-LoRA support for Neuron. (#18284)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-29 16:41:22 +08:00
fd7bb88d72 Fixes a dead link in nightly benchmark readme (#18856)
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
2025-05-29 04:41:39 +00:00
3c49dbdd03 Skip device and quant Pydantic validation to make plugin device work (#18843)
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-28 20:12:30 -07:00
1661a9c28f [Doc][Neuron] Update documentation for Neuron (#18868)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-28 19:44:01 -07:00
8e882ffdc0 [Bugfix][TPU] fix moe custom kernel import (#18853)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-28 19:34:19 -07:00
26b4fa45be Add ability to use CUDAGraphs with use_inductor=False (#17345)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-29 10:16:52 +08:00
515b413ebf Prevent the cross-encoder logic from being applied to classification tasks (#18838)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-28 19:16:17 -07:00
269d901734 [Bugfix][ROCm] fix the power of 2 exception from triton_unified_attention.py when running llama4 models and unit test fix (#18100)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-29 07:21:46 +08:00
7951d78738 [Core] Enable CUDA graphs for DP + All2All kernels (#18724)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-05-28 22:55:30 +00:00
6dbe5b5c93 Remove checks for None for fields which should never be None (#17985)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 21:32:19 +00:00
643622ba46 [Hardware][TPU][V1] Multi-LoRA Optimisations for the V1 TPU backend (#15655)
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Signed-off-by: xihajun <junfan@krai.ai>
Signed-off-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Signed-off-by: Jorge de Freitas <jorge@krai.ai>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: xihajun <junfan@krai.ai>
Co-authored-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Co-authored-by: Jorge de Freitas <jorge@krai.ai>
2025-05-28 19:59:09 +00:00
a09c7ca9f2 [Chore][Spec Decode] Update check NoneType instead of assigning variables (#18836)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 18:57:19 +00:00
0e98964e94 [V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-28 18:54:12 +00:00
c68b5c63eb [Misc] fix olmoe model layer can't laod in tp gt 1 (#18828)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-28 17:36:21 +00:00
fced756923 [Chore] update ty configuration (#18839)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 08:59:11 -07:00
321331b8ae [Core] Add Lora Support to Beam Search (#18346)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-05-28 08:58:24 -07:00
6e4cea1cc5 decrement server_load on listen for disconnect (#18784)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
2025-05-28 22:15:12 +08:00
435fa95444 [Frontend] add run batch to CLI (#18804)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-28 07:08:57 -07:00
4c2b38ce9e Enable Pydantic mypy checks and convert configs to Pydantic dataclasses (#17599)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 12:46:04 +00:00
d781930f90 [Platform][Dist] Make torch distributed process group extendable (#18763)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-28 10:52:34 +00:00
ce75efeecb [BugFix] FA2 MLA Accuracy Issue (#18807)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-05-28 08:59:39 +00:00
aa42561e40 Fix PiecewiseCompileInterpreter (#17338)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-28 08:40:53 +00:00
de65fc8e1e [CI] improve embed testing (#18747) 2025-05-28 00:16:35 -07:00
0c492b7824 [Deprecation] Remove fallbacks for Embeddings API (#18795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:09:04 +08:00
0f0926b43f [Deprecation] Remove unused sync methods in async_timeout (#18792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:48 +08:00
7f2c1a87e9 [Deprecation] Require overriding get_dummy_text and get_dummy_mm_data (#18796)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:35 +08:00
b78f844a67 [Bugfix][FailingTest]Fix test_model_load_with_params.py (#18758)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-28 05:42:54 +00:00
5e13c07d00 [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (2) (#18781)
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
2025-05-28 05:09:14 +00:00
774c5fde30 [V1] fix torch profiling for V1 offline scenarios (#18445)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-05-28 04:16:30 +00:00
9a21e331ff [Bugfix]: correctly propagate errors message caught at the chat_templating step to the client (#18769)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-05-28 03:35:43 +00:00
3e9ce609bd [Bugfix] Fix nomic max_model_len (#18755) 2025-05-27 20:29:53 -07:00
794ae1f551 [rocm] Fix wrong attention log (#18764)
Signed-off-by: Felix Marty <felmarty@amd.com>
2025-05-27 19:45:41 -07:00
d73a9457a5 [Core] Improve Tensor serialisation (#18774)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-28 09:46:21 +08:00
a3896c7f02 [Build] Fixes for CMake install (#18570) 2025-05-27 20:49:24 -04:00
51e98e4ffd [Bugfix] Disable prefix caching by default for benchmark (#18771)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-28 08:18:09 +08:00
e56f44d9ec Support datasets in vllm bench serve and sync with benchmark_[serving,datasets].py (#18566) 2025-05-27 19:59:48 -04:00
e0cbad4e30 [Neuron] Support quantization on neuron (#18283)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-27 22:10:33 +00:00
b48d5cca16 [CI/Build] [TPU] Fix TPU CI exit code (#18282)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-27 14:54:59 -07:00
1568 changed files with 25033 additions and 7403 deletions

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

View File

@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json

View File

@ -1,5 +1,6 @@
steps:
- label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
@ -11,6 +12,7 @@ steps:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
@ -28,6 +30,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents:
queue: cpu_queue_postmerge
commands:
@ -44,6 +47,7 @@ steps:
- label: "Build release image"
depends_on: block-release-image-build
id: build-release-image
agents:
queue: cpu_queue_postmerge
commands:
@ -51,6 +55,18 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
@ -70,9 +86,10 @@ steps:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: "release-version"
key: release-version
- block: "Build CPU release image"
key: block-cpu-release-image-build

View File

@ -0,0 +1,31 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

View File

@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \

View File

@ -7,6 +7,7 @@ set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true
fi
podman system prune -f
@ -37,7 +38,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@ -6,72 +6,70 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
@ -83,7 +81,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
@ -91,4 +89,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@ -2,102 +2,184 @@
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_1: Running test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_5: Running examples/offline_inference/tpu.py; \
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
echo TEST_5_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_7: Running test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_9: Running test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_10: Running test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_12: Running test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
"
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -0,0 +1,102 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg USE_SCCACHE=1 \
--build-arg GIT_REPO_CHECK=0 \
--tag vllm/vllm-tpu-bm \
--progress plain -f docker/Dockerfile.tpu .
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -0,0 +1,94 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@ -145,6 +145,7 @@ steps:
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
- tests/v1/engine/test_engine_core_client.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -154,6 +155,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -199,8 +201,9 @@ steps:
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
@ -274,17 +277,6 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
@ -297,7 +289,7 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/lora
- tests/lora
@ -328,6 +320,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
@ -397,6 +390,17 @@ steps:
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/.buildkite"
@ -420,6 +424,9 @@ steps:
- vllm/model_executor/layers/quantization
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
@ -617,9 +624,11 @@ steps:
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py

16
.github/CODEOWNERS vendored
View File

@ -10,15 +10,17 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb
/vllm/v1/structured_output @mgoin @russellb @aarnphm
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor
mkdocs.yaml @hmellor

View File

@ -8,6 +8,16 @@ body:
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
attributes:
label: Your current environment

View File

@ -1,6 +1,18 @@
FILL IN THE PR DESCRIPTION HERE
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
FIX #xxxx (*link existing issues this PR will resolve*)
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
## Purpose
## Test Plan
## Test Result
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

14
.github/mergify.yml vendored
View File

@ -36,6 +36,20 @@ pull_request_rules:
add:
- frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
actions:
label:
add:
- llama
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:

View File

@ -11,6 +11,8 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
@ -58,7 +60,7 @@ repos:
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9

View File

@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@ -179,9 +182,6 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
#
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -243,6 +242,7 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -543,8 +543,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
@ -785,5 +785,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()

View File

@ -58,8 +58,8 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
- Speculative decoding
- Chunked prefill
@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
- Prefix caching support
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g. E5-Mistral)
- Embedding Models (e.g., E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)

View File

@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@ -64,6 +64,12 @@ become available.
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>
@ -124,6 +130,38 @@ P99 ITL (ms): 8.39
==================================================
```
### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models
```bash
@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed 42
```
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```

View File

@ -10,11 +10,15 @@
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file.
@ -30,31 +34,27 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT_PCT=0
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
echo "result file$ $RESULT"
echo "result file: $RESULT"
echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER
cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install datasets
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
@ -64,53 +64,69 @@ best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -f vllm
# start the server
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization 0.98 \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
echo
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -118,29 +134,29 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start from request-rate as int(through_put) + 1
request_rate=$((${through_put%.*} + 1))
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@ -149,19 +165,18 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -173,10 +188,10 @@ run_benchmark() {
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$through_put
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
@ -188,22 +203,39 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20)
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
for num_batched_tokens in $num_batched_tokens_list; do
run_benchmark $num_seqs $num_batched_tokens
exit 0
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
@ -324,7 +325,7 @@ async def async_request_openai_completions(
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
if usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
@ -611,6 +612,7 @@ ASYNC_REQUEST_FUNCS = {
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -9,9 +10,6 @@ generation. Supported dataset types include:
- BurstGPT
- HuggingFace
- VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
"""
import base64
@ -442,6 +440,97 @@ class ShareGPTDataset(BenchmarkDataset):
return samples
# -----------------------------------------------------------------------------
# Custom Dataset Implementation
# -----------------------------------------------------------------------------
class CustomDataset(BenchmarkDataset):
"""
Implements the Custom dataset. Loads data from a JSONL file and generates
sample requests based on conversation turns. E.g.,
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
# self.data will be a list of dictionaries
# e.g., [{"prompt": "What is the capital of India?"}, ...]
# This will be the standardized format which load_data()
# has to convert into depending on the filetype of dataset_path.
# sample() will assume this standardized format of self.data
self.data = []
# Load the JSONL file
if self.dataset_path.endswith(".jsonl"):
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
# check if the JSONL file has a 'prompt' column
if "prompt" not in jsonl_data.columns:
raise ValueError("JSONL file must contain a 'prompt' column.")
# Convert each row to a dictionary and append to self.data
# This will convert the DataFrame to a list of dictionaries
# where each dictionary corresponds to a row in the DataFrame.
# This is the standardized format we want for self.data
for _, row in jsonl_data.iterrows():
self.data.append(row.to_dict())
else:
raise NotImplementedError(
"Only JSONL format is supported for CustomDataset."
)
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
**kwargs,
) -> list:
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["prompt"]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
@ -776,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""
import argparse
@ -6,13 +7,12 @@ import dataclasses
import json
import os
import time
from pathlib import Path
from typing import Any, Optional
import numpy as np
import torch
from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
@ -80,17 +80,9 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)
),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
llm.start_profile()
llm_generate()
llm.stop_profile()
else:
start_time = time.perf_counter()
llm_generate()
@ -103,11 +95,7 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None)
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = (
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
)
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
@ -164,15 +152,6 @@ if __name__ == "__main__":
action="store_true",
help="profile the generation process of a single batch",
)
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser.add_argument(
"--output-json",
type=str,
@ -193,4 +172,9 @@ if __name__ == "__main__":
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Offline benchmark to test the long document QA throughput.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the efficiency of prefix caching.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization."""
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
@ -60,6 +61,7 @@ from benchmark_dataset import (
ASRDataset,
BurstGPTDataset,
ConversationDataset,
CustomDataset,
HuggingFaceDataset,
InstructCoderDataset,
MTBenchDataset,
@ -627,7 +629,16 @@ def main(args: argparse.Namespace):
"'--dataset-path' if required."
)
if args.dataset_name == "sonnet":
if args.dataset_name == "custom":
dataset = CustomDataset(dataset_path=args.dataset_path)
input_requests = dataset.sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
)
elif args.dataset_name == "sonnet":
dataset = SonnetDataset(dataset_path=args.dataset_path)
# For the "sonnet" dataset, formatting depends on the backend.
if args.backend == "openai-chat":
@ -762,6 +773,10 @@ def main(args: argparse.Namespace):
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
if args.backend == "llama.cpp":
# Disable prompt caching in llama.cpp backend
sampling_params["cache_prompt"] = False
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
@ -834,6 +849,8 @@ def main(args: argparse.Namespace):
]:
if field in result_json:
del result_json[field]
if field in benchmark_result:
del benchmark_result[field]
# Save to file
base_model_id = model_id.split("/")[-1]
@ -846,6 +863,7 @@ def main(args: argparse.Namespace):
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name)
with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@ -886,7 +904,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument(
@ -1056,6 +1074,19 @@ if __name__ == "__main__":
)
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
custom_group.add_argument(
"--custom-output-len",
type=int,
default=256,
help="Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help="Skip applying chat template to prompt, used only for custom dataset.",
)
sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument(
"--sonnet-input-len",

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
@ -11,7 +12,6 @@ On the client side, run:
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \
--num-prompts 1000

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import itertools

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl
import time

View File

@ -0,0 +1,223 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
line_names=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
# Create input tensors
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if "torch-bf16" in provider:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
elif "fp8" in provider:
# Weights are always quantized ahead of time
if "noquant" in provider:
# For no quantization, we just measure the GEMM
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
# In these cases, we quantize the activations during the GEMM call
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
b_fp8 = b_fp8.t()
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
# Calculate TFLOP/s, two flops per multiply-add
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
return tflops(ms), tflops(max_ms), tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
assert model in WEIGHT_SHAPES
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=[*WEIGHT_SHAPES.keys()],
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
KN_model_names = prepare_shapes(args)
for K, N, model_name in KN_model_names:
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_fp8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -90,7 +91,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16
w1_blockscale = torch.empty(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark
@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts,
fused_topk,
)
@ -69,18 +70,9 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@ -121,10 +113,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int,
):
for _ in range(num_repeats):
@ -132,14 +120,10 @@ def bench_run(
a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -152,10 +136,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -164,14 +144,10 @@ def bench_run(
a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -217,10 +193,6 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
)
torch.cuda.synchronize()
@ -229,8 +201,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -249,18 +221,12 @@ def bench_run(
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@ -278,8 +244,8 @@ def bench_run(
# Warmup
run_triton_moe(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -290,7 +256,7 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -321,16 +287,12 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup,
)
results.append(
benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -6,7 +7,6 @@ import time
from contextlib import nullcontext
from datetime import datetime
from itertools import product
from types import SimpleNamespace
from typing import Any, TypedDict
import ray
@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int],
block_quant_shape: list[int],
) -> None:
dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix:
config = getattr(config, args.model_prefix)
config = SimpleNamespace(**config)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from typing import Any, TypedDict

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional
@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
seed: int,
device: str,
max_position: int = 8192,
base: int = 10000,
base: float = 10000,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import pickle

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
([16384, 106496], 1),
([53248, 16384], 0),
],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
}

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile
import pstats

View File

@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native")
if (POWER9_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")

View File

@ -46,22 +46,38 @@ else()
endif()
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
# Make sure vllm-flash-attn install rules are nested under vllm/
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements don't hurt when installing neither component.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION vllm/vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION vllm/vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
add_custom_target(
hipify${NAME}
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
BYPRODUCTS ${HIP_SRCS}
COMMENT "Running hipify on ${NAME} extension source files.")

View File

@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
from typing import Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import itertools
import os

View File

@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
int64_t BLOCK_SIZE_K, int64_t bit);
#endif
bool moe_permute_unpermute_supported();
bool moe_permute_unpermute_supported();
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);

View File

@ -130,6 +130,62 @@ void moe_unpermute(
});
}
template <typename T>
__global__ void shuffleInputRowsKernel(const T* input,
const int32_t* dst2src_map, T* output,
int64_t num_src_rows,
int64_t num_dst_rows, int64_t num_cols) {
int64_t dest_row_idx = blockIdx.x;
int64_t const source_row_idx = dst2src_map[dest_row_idx];
if (blockIdx.x < num_dst_rows) {
// Load 128-bits per thread
constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
auto* dest_row_ptr =
reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
}
}
}
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor) {
TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
"Input and output tensors must have the same data type");
auto stream = at::cuda::getCurrentCUDAStream().stream();
int64_t const blocks = output_tensor.size(0);
int64_t const threads = 256;
int64_t const num_dest_rows = output_tensor.size(0);
int64_t const num_src_rows = input_tensor.size(0);
int64_t const num_cols = input_tensor.size(1);
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
"num_cols must be divisible by 128 / "
"sizeof(input_tensor.scalar_type()) / 8");
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
dst2src_map.data_ptr<int32_t>(),
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
num_dest_rows, num_cols);
});
}
#else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,

View File

@ -14,12 +14,13 @@
__VA_ARGS__(); \
break; \
}
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
#define MOE_DISPATCH(TYPE, ...) \
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -39,6 +40,11 @@ template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
using type = __nv_bfloat16;
};
// uint8 for packed fp4
template <>
struct ScalarType2CudaType<at::ScalarType::Byte> {
using type = uint8_t;
};
// #if __CUDA_ARCH__ >= 890
// fp8

View File

@ -516,9 +516,8 @@ void topk_softmax(
topk,
stream);
}
else
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
@ -530,4 +529,17 @@ void topk_softmax(
topk,
stream);
}
else {
assert(topk_indices.scalar_type() == at::ScalarType::Int64);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
}

View File

@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
// Row shuffle for MoE
m.def(
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
#endif
}

View File

@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
double epsilon);
@ -231,7 +236,8 @@ void cutlass_moe_mm(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k);
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,

View File

@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
TORCH_CHECK(
a.size(0) % 4 == 0,
"Input tensor must have a number of rows that is a multiple of 4. ",
"but got: ", a.size(0), " rows.");
if (out.dtype() == torch::kBFloat16) {
cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
out, a, b, a_scales, b_scales);

View File

@ -1,5 +1,6 @@
#pragma once
#include "cuda_utils.h"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
@ -22,49 +23,49 @@ namespace vllm {
using namespace cute;
template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
class ClusterShape, typename EpilogueScheduler,
typename MainloopScheduler>
// clang-format off
template <class OutType, int ScaleGranularityM,
int ScaleGranularityN, int ScaleGranularityK,
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler,
bool swap_ab_ = false>
struct cutlass_3x_gemm_fp8_blockwise {
static constexpr bool swap_ab = swap_ab_;
using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB;
using LayoutA = cutlass::layout::RowMajor;
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
using ElementB = ElementAB;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
using ElementC = void;
using ElementD = OutType;
using LayoutD = cutlass::layout::RowMajor;
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
using ElementC = void; // TODO: support bias
using LayoutC = LayoutD;
using LayoutC_Transpose = LayoutD_Transpose;
static constexpr int AlignmentC = AlignmentD;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBlockScale = float;
// MMA and Cluster Tile Shapes
// Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
// Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
static constexpr int ScaleGranularityM =
size<0>(MmaTileShape{}) / ScaleMsPerTile;
static constexpr int ScaleGranularityN =
size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
static constexpr int ScaleGranularityK =
size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
using ScaleConfig = conditional_t<swap_ab,
cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::K, cute::UMMA::Major::MN>,
cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = ClusterShape;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
using ElementScalar = float;
// clang-format off
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag,
@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
ElementAccumulator,
ElementCompute,
ElementC,
LayoutC,
conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
AlignmentC,
ElementD,
LayoutD,
conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
AlignmentD,
EpilogueScheduler,
DefaultOperation
>::CollectiveOp;
using StageCountType = cutlass::gemm::collective::StageCountAuto;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
using CollectiveMainloop = conditional_t<swap_ab,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementB,
cute::tuple<LayoutB_Transpose, LayoutSFA>,
AlignmentB,
ElementA,
cute::tuple<LayoutA_Transpose, LayoutSFB>,
AlignmentA,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp;
// clang-format on
MainloopScheduler
>::CollectiveOp,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp>;
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
static constexpr bool swap_ab = Gemm::swap_ab;
using GemmKernel = typename Gemm::GemmKernel;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
using ElementD = typename Gemm::ElementD;
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
auto prob_shape = cute::make_shape(m, n, k, 1);
StrideA a_stride;
StrideB b_stride;
@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
b_stride =
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
c_stride =
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
LayoutSFA layout_SFA =
LayoutSFA layout_SFA = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
LayoutSFB layout_SFB =
LayoutSFB layout_SFB = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
typename GemmKernel::MainloopArguments mainloop_args{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
auto mainloop_args = [&](){
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
if (swap_ab) {
return typename GemmKernel::MainloopArguments{
b_ptr, b_stride, a_ptr, a_stride,
b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
};
}
else {
return typename GemmKernel::MainloopArguments{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
};
}
}();
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
auto m = a.size(0);
auto k = a.size(1);
auto n = b.size(1);
int sms;
int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
return std::ceil(static_cast<float>(m) / tile1SM) *
std::ceil(static_cast<float>(n) / tile1SM) >=
sms;
};
bool use_2sm = should_use_2sm(m, n);
if (use_2sm) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
constexpr int TILE_K = 128;
// TODO: better heuristics
bool swap_ab = (m < 16) || (m % 4 != 0);
bool use_tma_epilogue = (m * n) % 4 == 0;
if (!swap_ab) {
constexpr int TILE_N = 128;
int tile_m = 256;
if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
tile_m = 64;
}
else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
tile_m = 128;
}
if (tile_m == 64) {
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}
} else if (tile_m == 128) {
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}
} else { // tile_m == 256
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
}
}
} else {
// TODO: Test more tile N configs
constexpr int TILE_M = 128;
constexpr int TILE_N = 16;
// TMA epilogue isn't compatible with Swap A/B
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
out, a, b, a_scales, b_scales);
}
}

View File

@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_default {
// M in (128, inf)
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M128 {
// M in (64, 128]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _64>;
using ClusterShape = Shape<_2, _2, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M64 {
// M in [1, 64]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_64, _64, _256>;
using ClusterShape = Shape<_1, _8, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue,
typename... EpilogueArgs>
@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
using Cutlass3xGemmDefault =
typename sm100_fp8_config_default<InType, OutType,
Epilogue>::Cutlass3xGemm;
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
out, a, b, std::forward<EpilogueArgs>(args)...);
using Cutlass3xGemmM64 =
typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
uint32_t const m = a.size(0);
uint32_t const mp2 =
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
if (mp2 <= 64) {
// m in [1, 64]
return cutlass_gemm_caller<Cutlass3xGemmM64>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 128) {
// m in (64, 128]
return cutlass_gemm_caller<Cutlass3xGemmM128>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else {
// m in (128, inf)
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
out, a, b, std::forward<EpilogueArgs>(args)...);
}
}
template <template <typename, typename, typename> typename Epilogue,

View File

@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
if (n >= 8192) {
cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else if (k >= 8192) {
cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else if (m <= 16) {
cutlass_group_gemm_caller<Cutlass3xGemmM16>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else {
cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
}
}
@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
if (out_tensors.dtype() == torch::kBFloat16) {
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else {
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
}
}
@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides);
c_strides, per_act_token, per_out_ch);
}

View File

@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
int k_size = a_tensors.size(1);
int n_size = out_tensors.size(1);
bool per_act_token = a_scales.numel() != 1;
bool per_out_ch = b_scales.numel() != num_experts;
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
auto options_int =

View File

@ -7,7 +7,7 @@
constexpr uint64_t THREADS_PER_EXPERT = 512;
__global__ void compute_problem_sizes(const int* __restrict__ topk_ids,
__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
int32_t* problem_sizes1,
int32_t* problem_sizes2,
int32_t* atomic_buffer,
@ -45,7 +45,24 @@ __global__ void compute_expert_offsets(
}
}
__global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
__global__ void compute_expert_blockscale_offsets(
const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
int32_t* blockscale_offsets, int32_t* atomic_buffer,
const int num_experts) {
int32_t tot_offset = 0;
int32_t tot_offset_round = 0;
expert_offsets[0] = 0;
blockscale_offsets[0] = 0;
for (int i = 0; i < num_experts; ++i) {
atomic_buffer[i] = tot_offset;
tot_offset += problem_sizes1[i * 3];
expert_offsets[i + 1] = tot_offset;
tot_offset_round += (problem_sizes1[i * 3] + (128 - 1)) / 128 * 128;
blockscale_offsets[i + 1] = tot_offset_round;
}
}
__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
const int32_t* __restrict__ expert_offsets,
int32_t* input_permutation,
int32_t* output_permutation,
@ -77,7 +94,8 @@ void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k) {
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets) {
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
auto options_int32 =
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
@ -85,19 +103,61 @@ void get_cutlass_moe_mm_data_caller(
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
static_cast<const int32_t*>(topk_ids.data_ptr()),
static_cast<const uint32_t*>(topk_ids.data_ptr()),
static_cast<int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(problem_sizes2.data_ptr()),
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);
compute_expert_offsets<<<1, 1, 0, stream>>>(
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
if (blockscale_offsets.has_value()) {
compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
} else {
compute_expert_offsets<<<1, 1, 0, stream>>>(
static_cast<const int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
}
compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
static_cast<const int32_t*>(topk_ids.data_ptr()),
static_cast<const uint32_t*>(topk_ids.data_ptr()),
static_cast<const int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(input_permutation.data_ptr()),
static_cast<int32_t*>(output_permutation.data_ptr()),
static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
topk_ids.size(1));
}
__global__ void compute_pplx_data(int32_t* expert_offsets,
int32_t* problem_sizes1,
int32_t* problem_sizes2,
const int32_t* __restrict__ expert_num_tokens,
const int padded_m, const int n,
const int k) {
int expert_idx = threadIdx.x;
expert_offsets[expert_idx] = expert_idx * padded_m;
problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx];
problem_sizes1[expert_idx * 3 + 1] = 2 * n;
problem_sizes1[expert_idx * 3 + 2] = k;
problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx];
problem_sizes2[expert_idx * 3 + 1] = k;
problem_sizes2[expert_idx * 3 + 2] = n;
}
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m,
const int64_t n, const int64_t k) {
auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
compute_pplx_data<<<1, num_local_experts, 0, stream>>>(
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(problem_sizes2.data_ptr()),
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
k);
}

View File

@ -36,7 +36,8 @@ void cutlass_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
#endif
@ -54,7 +55,16 @@ void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k);
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m,
const int64_t n, const int64_t k);
#endif
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@ -206,12 +216,13 @@ void cutlass_moe_mm(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides);
c_strides, per_act_token, per_out_ch);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
@ -224,7 +235,8 @@ void get_cutlass_moe_mm_data(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k) {
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
@ -232,7 +244,8 @@ void get_cutlass_moe_mm_data(
(defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation,
output_permutation, num_experts, n, k);
output_permutation, num_experts, n, k,
blockscale_offsets);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
@ -242,6 +255,29 @@ void get_cutlass_moe_mm_data(
version_num, ". Required capability: 90");
}
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,

View File

@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
fp8_type* __restrict__ token_output = &out[offset];
// For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively.
bool const can_vectorize = hidden_size % 4 == 0;
// aligned at 32-byte and 16-byte addresses respectively.
bool const can_vectorize = hidden_size % 16 == 0;
float absmax_val = 0.0f;
if (can_vectorize) {
@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
} else {
for (int i = tid; i < hidden_size; i += blockDim.x) {
float const x = static_cast<float>(token_input[i]);
absmax_val = max(absmax_val, fabs(x));
absmax_val = fmaxf(absmax_val, fabsf(x));
}
}
using BlockReduce = cub::BlockReduce<float, 1024>;
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
float const block_absmax_val_maybe =
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
__shared__ float token_scale;
if (tid == 0) {
if (scale_ub) {
token_scale = min(block_absmax_val_maybe, *scale_ub);
token_scale = fminf(block_absmax_val_maybe, *scale_ub);
} else {
token_scale = block_absmax_val_maybe;
}
// token scale computation
token_scale = max(token_scale / quant_type_max_v<fp8_type>,
min_scaling_factor<fp8_type>::val());
token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
min_scaling_factor<fp8_type>::val());
scale[token_idx] = token_scale;
}
__syncthreads();
@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
torch::Tensor const& input, // [..., d]
torch::Tensor const& scale) // [1]
{
int64_t num_tokens = input.numel() / input.size(-1);
int64_t num_elems = input.numel();
dim3 grid(num_tokens);
dim3 block(1024);
int const block_size = 256;
int const num_tokens = input.numel() / input.size(-1);
int const num_elems = input.numel();
dim3 const grid(num_tokens);
dim3 const block(block_size);
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
torch::Tensor const& input, // [..., d]
torch::Tensor& scale) // [1]
{
int64_t num_tokens = input.numel() / input.size(-1);
int64_t num_elems = input.numel();
dim3 grid(num_tokens);
dim3 block(1024);
int const block_size = 256;
int const num_tokens = input.numel() / input.size(-1);
int const num_elems = input.numel();
dim3 const grid(num_tokens);
dim3 const block(block_size);
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant(
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
int const block_size = 256;
dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 1024));
dim3 const block(std::min(hidden_size, block_size));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

View File

@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
}
float r =
fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
#ifndef USE_ROCM
return static_cast<fp8_type>(r);
#else
@ -65,7 +65,7 @@ template <typename scalar_t, typename fp8_type>
__global__ void segmented_max_reduction(float* __restrict__ scale,
const scalar_t* __restrict__ input,
int64_t num_elems) {
__shared__ float cache[1024];
__shared__ float cache[256];
int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
// First store maximum for all values processes by
@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
scalar_t tmp = 0.0;
while (i < num_elems) {
float x = static_cast<float>(input[i]);
tmp = max(tmp, fabs(x));
tmp = fmaxf(tmp, fabsf(x));
i += blockDim.x * gridDim.x;
}
cache[threadIdx.x] = tmp;
@ -100,25 +100,27 @@ template <typename scalar_t>
__device__ float thread_max_vec(scalar_t const* __restrict__ input,
int64_t const num_elems, int const tid,
int const step) {
constexpr size_t VEC_SIZE = 16;
using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
// Vectorized input/output to better utilize memory bandwidth.
vec4_t<scalar_t> const* vectorized_in =
reinterpret_cast<vec4_t<scalar_t> const*>(input);
auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
int64_t const num_vec_elems = num_elems >> 2;
// num_elems / VEC_SIZE (which is 16)
int64_t const num_vec_elems = num_elems >> 4;
float absmax_val = 0.0f;
#pragma unroll 4
#pragma unroll
for (int64_t i = tid; i < num_vec_elems; i += step) {
vec4_t<scalar_t> in_vec = vectorized_in[i];
absmax_val = max(absmax_val, fabs(in_vec.x));
absmax_val = max(absmax_val, fabs(in_vec.y));
absmax_val = max(absmax_val, fabs(in_vec.z));
absmax_val = max(absmax_val, fabs(in_vec.w));
scalarxN_t in_vec = vectorized_in[i];
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
}
}
// Handle the remaining elements if num_elems is not divisible by 4
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
absmax_val = max(absmax_val, fabs(input[i]));
// Handle the remaining elements if num_elems is not divisible by VEC_SIZE
for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
absmax_val = fmaxf(absmax_val, fabsf(input[i]));
}
return absmax_val;
@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
float const scale,
int64_t const num_elems,
int const tid, int const step) {
using float8x4_t = q8x4_t<fp8_type>;
constexpr size_t VEC_SIZE = 16;
using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
// Vectorized input/output to better utilize memory bandwidth.
auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
int64_t const num_vec_elems = num_elems >> 2;
// num_elems / VEC_SIZE (which is 16)
int64_t const num_vec_elems = num_elems >> 4;
#pragma unroll 4
#pragma unroll
for (int64_t i = tid; i < num_vec_elems; i += step) {
vec4_t<scalar_t> in_vec = vectorized_in[i];
float8x4_t out_vec;
scalarxN_t in_vec = vectorized_in[i];
float8xN_t out_vec;
out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(in_vec.x), scale);
out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(in_vec.y), scale);
out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(in_vec.z), scale);
out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(in_vec.w), scale);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(in_vec.val[j]), scale);
}
vectorized_out[i] = out_vec;
}
// Handle the remaining elements if num_elems is not divisible by 4
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
// Handle the remaining elements if num_elems is not divisible by VEC_SIZE
for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
static_cast<float>(input[i]), scale);
}

View File

@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
// sum of squares
float ss = 0.0f;
const int VEC_SIZE = 4;
int32_t const num_vec_elems = hidden_size >> 2;
#pragma unroll 4
@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
vec4_t<scalar_t> in = vec_input[i];
vec4_t<float> x;
x.x = static_cast<float>(in.x);
x.y = static_cast<float>(in.y);
x.z = static_cast<float>(in.z);
x.w = static_cast<float>(in.w);
if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i];
x.x += static_cast<float>(r.x);
x.y += static_cast<float>(r.y);
x.z += static_cast<float>(r.z);
x.w += static_cast<float>(r.w);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] = static_cast<float>(in.val[j]);
}
ss += x.x * x.x;
ss += x.y * x.y;
ss += x.z * x.z;
ss += x.w * x.w;
if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i];
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] += static_cast<float>(r.val[j]);
}
}
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
ss += x.val[j] * x.val[j];
}
}
using BlockReduce = cub::BlockReduce<float, 1024>;
@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales(
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
const int VEC_SIZE = 4;
int32_t const num_vec_elems = hidden_size >> 2;
float block_absmax_val_maybe = 0.0f;
@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales(
vec4_t<scalar_t> const w = vec_weight[i];
vec4_t<float> x;
x.x = static_cast<float>(in.x);
x.y = static_cast<float>(in.y);
x.z = static_cast<float>(in.z);
x.w = static_cast<float>(in.w);
if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i];
x.x += static_cast<float>(r.x);
x.y += static_cast<float>(r.y);
x.z += static_cast<float>(r.z);
x.w += static_cast<float>(r.w);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] = static_cast<float>(in.val[j]);
}
block_absmax_val_maybe = fmaxf(
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.x * rms) * w.x));
block_absmax_val_maybe = fmaxf(
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.y * rms) * w.y));
block_absmax_val_maybe = fmaxf(
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.z * rms) * w.z));
block_absmax_val_maybe = fmaxf(
block_absmax_val_maybe, fabs(static_cast<scalar_t>(x.w * rms) * w.w));
if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i];
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] += static_cast<float>(r.val[j]);
}
}
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
block_absmax_val_maybe =
fmaxf(block_absmax_val_maybe,
fabs(static_cast<scalar_t>(x.val[j] * rms) * w.val[j]));
}
}
using BlockReduce = cub::BlockReduce<float, 1024>;
@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
vec_residual = reinterpret_cast<vec4_t<scalar_t>*>(&residual[token_offset]);
}
const int VEC_SIZE = 4;
int32_t const num_vec_elems = hidden_size >> 2;
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
vec4_t<scalar_t> const w = vec_weight[i];
vec4_t<float> x;
x.x = static_cast<float>(in.x);
x.y = static_cast<float>(in.y);
x.z = static_cast<float>(in.z);
x.w = static_cast<float>(in.w);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] = static_cast<float>(in.val[j]);
}
if constexpr (has_residual) {
vec4_t<scalar_t> r = vec_residual[i];
x.x += static_cast<float>(r.x);
x.y += static_cast<float>(r.y);
x.z += static_cast<float>(r.z);
x.w += static_cast<float>(r.w);
// Update residual
r.x = static_cast<scalar_t>(x.x);
r.y = static_cast<scalar_t>(x.y);
r.z = static_cast<scalar_t>(x.z);
r.w = static_cast<scalar_t>(x.w);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
x.val[j] += static_cast<float>(r.val[j]);
}
// Update residual
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
r.val[j] = static_cast<scalar_t>(x.val[j]);
}
vec_residual[i] = r;
}
q8x4_t<scalar_out_t> out;
out.x = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.x * rms) * w.x, scale);
out.y = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.y * rms) * w.y, scale);
out.z = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.z * rms) * w.z, scale);
out.w = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.w * rms) * w.w, scale);
#pragma unroll
for (int j = 0; j < VEC_SIZE; ++j) {
out.val[j] = ScaledQuant<scalar_out_t, is_scale_inverted>::quant_fn(
static_cast<scalar_t>(x.val[j] * rms) * w.val[j], scale);
}
vec_output[i] = out;
}
}

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import itertools
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import math

View File

@ -10,23 +10,22 @@
namespace vllm {
// Vectorization containers
template <typename scalar_t>
struct __align__(8) vec4_t {
scalar_t x;
scalar_t y;
scalar_t z;
scalar_t w;
template <typename scalar_t, size_t vec_size>
struct __align__(vec_size * sizeof(scalar_t)) vec_n_t {
scalar_t val[vec_size];
};
template <typename quant_type_t>
struct __align__(4) q8x4_t {
template <typename quant_type_t, size_t vec_size>
struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t {
static_assert(std::is_same_v<quant_type_t, int8_t> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
quant_type_t x;
quant_type_t y;
quant_type_t z;
quant_type_t w;
quant_type_t val[vec_size];
};
template <typename scalar_t>
using vec4_t = vec_n_t<scalar_t, 4>;
template <typename quant_type_t>
using q8x4_t = q8_n_t<quant_type_t, 4>;
} // namespace vllm

Some files were not shown because too many files have changed in this diff Show More