Compare commits

...

925 Commits

Author SHA1 Message Date
0405645a6c initial
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-31 00:55:49 +00:00
41bf5612f5 [Misc] fix typo: add missing space in lora adapter error message (#12564)
Signed-off-by: Beim <beim2015@outlook.com>
2025-01-30 15:39:22 +00:00
a2769032ca Set ?device={device} when changing tab in installation guides (#12560)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-30 00:05:42 -08:00
f17f1d4608 [V1][Metrics] Add GPU cache usage % gauge (#12561)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-29 18:31:01 -08:00
1c1bb0bbf2 [Misc][MoE] add Deepseek-V3 moe tuning support (#12558)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-30 00:47:30 +00:00
e0cc5f259a [V1][BugFix] Free encoder cache for aborted requests (#12545)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-29 13:47:33 -08:00
73aa6cfdf7 Revert "[Build/CI] Fix libcuda.so linkage" (#12552) 2025-01-29 21:12:24 +00:00
27b78c73ca [Kernel] add triton fused moe kernel for gptq/awq (#12185) 2025-01-29 09:07:09 -05:00
b02fd288b2 [Hardware][NV] Fix Modelopt model loading for k-v-scales for Llama models. (#11787)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-29 01:46:12 -08:00
ff7424f491 [Frontend] Support override generation config in args (#12409)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
2025-01-29 01:41:01 -08:00
d93bf4da85 [Model] Refactoring of MiniCPM-V and add MiniCPM-o-2.6 support for vLLM (#12069)
Signed-off-by: hzh <hezhihui_thu@163.com>
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Signed-off-by: Chenguang Li <757486878@qq.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Shanshan Shen <467638484@qq.com>
Signed-off-by: elijah <f1renze.142857@gmail.com>
Signed-off-by: Yikun <yikunkero@gmail.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: shaochangxu <85155497+shaochangxu@users.noreply.github.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: sixgod <evethwillbeok@outlook.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Akshat Tripathi <Akshat.tripathi6568@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Avshalom Manevich <12231371+avshalomman@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Yangcheng Li <liyangcheng.lyc@alibaba-inc.com>
Co-authored-by: Siyuan Li <94890248+liaoyanqing666@users.noreply.github.com>
Co-authored-by: Concurrensee <yida.wu@amd.com>
Co-authored-by: Chenguang Li <757486878@qq.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Alex Brooks <alex.brooks@ibm.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: elijah <30852919+e1ijah1@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Steve Luo <36296769+SunflowerAries@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Konrad Zawora <kzawora@habana.ai>
Co-authored-by: TJian <tunjian1996@gmail.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: maang-h <55082429+maang-h@users.noreply.github.com>
Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-01-29 09:24:59 +00:00
036ca94c25 [Bugfix] handle alignment of arguments in convert_sparse_cross_attention_mask_to_dense (#12347)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Wallas Santos <wallashss@ibm.com>
2025-01-29 08:54:35 +00:00
ef001d98ef Fix the pydantic logging validator (#12420)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-01-29 07:53:13 +00:00
5f671cb4c3 [V1] Improve Error Message for Unsupported Config (#12535)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-01-29 04:56:56 +00:00
bd02164cf9 Bugfix for whisper quantization due to fake k_proj bias (#12524)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-29 04:49:03 +00:00
46fb056749 [V1][Metrics] Add TTFT and TPOT histograms (#12530)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-29 04:11:16 +00:00
dd6a3a02cb [Doc] Convert docs to use colon fences (#12471)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-29 11:38:29 +08:00
a7e3eba66f [Frontend] Support reasoning content for deepseek r1 (#12473)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
Co-authored-by: Rafael Vasquez <rafvasq21@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
2025-01-29 11:38:08 +08:00
fbb5bd4cef [TPU] Add example for profiling TPU inference (#12531)
Signed-off-by: mgoin <mgoin@redhat.com>
2025-01-29 03:16:47 +00:00
80fcc3ed1c [Kernel] Pipe attn_logits_soft_cap through paged attention TPU kernels (#12482)
Signed-off-by: Fenghui Zhang <fhzhang@google.com>
2025-01-28 22:36:44 +00:00
c386c43ca3 [V1][Metrics] Add per-request prompt/generation_tokens histograms (#12516)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-28 22:07:22 +00:00
f26d790718 Do not run suggestion pre-commit hook multiple times (#12521)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-28 20:05:27 +00:00
0f657bdc52 Replace missed warning_once for rerank API (#12472)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-28 19:06:32 +00:00
3fd1fb63ef [V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-28 16:38:38 +00:00
925d2f1908 [Doc] Fix typo for x86 CPU installation (#12514)
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
2025-01-28 16:37:10 +00:00
8f58a51358 [VLM] Merged multi-modal processor and V1 support for Qwen-VL (#12504)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-28 16:25:05 +00:00
2079e43bee [Core] Make raw_request optional in ServingCompletion (#12503)
Signed-off-by: Sebastian Schönnenbeck <sebastian.schoennenbeck@comma-soft.com>
2025-01-28 10:56:45 +00:00
e29d4358ef [V1] Include Engine Version in Logs (#12496)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-01-28 08:27:41 +00:00
8cbc424975 Update README.md with V1 alpha release (#12495) 2025-01-28 08:22:41 +00:00
dd66fd2b01 [CI] fix pre-commit error (#12494)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-01-28 06:11:05 +00:00
0f465ab533 [FEATURE] Enables offline /score for embedding models (#12021)
Signed-off-by: Gabriel Marinho <gmarinho@ibm.com>
2025-01-28 11:30:13 +08:00
23a7cbc88b [CI/Build] Fixed the xla nightly issue reported in #12451 (#12453) 2025-01-28 11:18:07 +08:00
426a5c3625 Fix bad path in prometheus example (#12481)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-27 18:56:31 -07:00
ddee88d0ff [Neuron][Kernel] NKI-based flash-attention kernel with paged KV cache (#11277)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: Jiangfei Duan <jfduan@outlook.com>
2025-01-27 17:31:16 -08:00
823ab79633 Update pre-commit hooks (#12475)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-27 17:23:08 -07:00
6116ca8cd7 [Feature] [Spec decode]: Enable MLPSpeculator/Medusa and prompt_logprobs with ChunkedPrefill (#10132)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: wallashss <wallashss@ibm.com>
Co-authored-by: wallashss <wallashss@ibm.com>
2025-01-27 13:38:35 -08:00
2bc3fbba0c [FlashInfer] Upgrade to 0.2.0 (#11194)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-01-27 18:19:24 +00:00
3f1fc7425a [V1][CI/Test] Do basic test for top-p & top-k sampling (#12469)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-27 09:40:04 -08:00
01ba927040 [V1][Metrics] Add initial Prometheus logger (#12416)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-01-27 12:26:28 -05:00
103bd17ac5 [Build] Only build 9.0a for scaled_mm and sparse kernels (#12339)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-27 10:40:00 -05:00
ce69f7f754 [Bugfix] Fix gpt2 GGUF inference (#12467)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-27 18:31:49 +08:00
624a1e4711 [V1][Minor] Minor optimizations for update_from_output (#12454)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-27 01:09:27 -08:00
372bf0890b [Bugfix] Fix missing seq_start_loc in xformers prefill metadata (#12464)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-27 07:25:30 +00:00
5204ff5c3f [Bugfix] Fix Granite 3.0 MoE model loading (#12446)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-26 21:26:44 -08:00
0cc6b383d7 [Frontend] Support scores endpoint in run_batch (#12430)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-01-27 04:30:17 +00:00
28e0750847 [V1] Avoid list creation in input preparation (#12457)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-26 19:57:56 -08:00
582cf78798 [DOC] Add link to vLLM blog (#12460)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-27 03:46:19 +00:00
0034b09ceb [Frontend] Rerank API (Jina- and Cohere-compatible API) (#12376)
Signed-off-by: Kyle Mistele <kyle@mistele.com>
2025-01-26 19:58:45 -07:00
72bac73067 [Build/CI] Fix libcuda.so linkage (#12424)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-01-26 21:18:19 +00:00
68f11149d8 [Bugfix][Kernel] Fix perf regression caused by PR #12405 (#12434)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-26 11:09:34 -08:00
72f4880425 [Bugfix/CI] Fix broken kernels/test_mha.py (#12450) 2025-01-26 10:39:03 -08:00
aa2cd2c43d [Bugfix] Disable w16a16 2of4 sparse CompressedTensors24 (#12417)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-26 19:59:58 +08:00
9ddc35220b [Frontend] generation_config.json for maximum tokens (#12242)
Signed-off-by: Matthew Hendrey <matthew.hendrey@gmail.com>
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: shangmingc <caishangming@linux.alibaba.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-26 19:59:25 +08:00
a5255270c3 [Misc] Revert FA on ViT #12355 and #12435 (#12445) 2025-01-26 03:56:34 -08:00
0ee349b553 [V1][Bugfix] Fix assertion when mm hashing is turned off (#12439)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-26 00:47:42 -08:00
fa63e710c7 [V1][Perf] Reduce scheduling overhead in model runner after cuda sync (#12094)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
2025-01-26 00:42:37 -08:00
2a0309a646 [Misc][Bugfix] FA3 support to ViT MHA layer (#12435)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-26 05:00:31 +00:00
324960a95c [TPU][CI] Update torchxla version in requirement-tpu.txt (#12422)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-01-25 07:23:03 +00:00
f1fc0510df [Misc] Add FA2 support to ViT MHA layer (#12355)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-25 15:07:35 +08:00
bf21481dde [ROCm][MoE] MI300 tuned configs Mixtral-8x(7B,22B) | fp16, fp8 (#12408)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-25 12:17:19 +08:00
fb30ee92ee [Bugfix] Fix BLIP-2 processing (#12412)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-25 11:42:42 +08:00
221d388cc5 [Bugfix][Kernel] Fix moe align block issue for mixtral (#12413) 2025-01-25 01:49:28 +00:00
3132a933b6 [Bugfix][Kernel] FA3 Fix - RuntimeError: This flash attention build only supports pack_gqa (for build size reasons). (#12405)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-24 20:20:59 +00:00
df5dafaa5b [Misc] Remove deprecated code (#12383)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-24 14:45:20 -05:00
ab5bbf5ae3 [Bugfix][Kernel] Fix CUDA 11.8 being broken by FA3 build (#12375)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-24 15:27:59 +00:00
3bb8e2c9a2 [Misc] Enable proxy support in benchmark script (#12356)
Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp>
2025-01-24 14:58:26 +00:00
e784c6b998 [ci/build] sync default value for wheel size (#12398)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 17:54:29 +08:00
9a0f3bdbe5 [Hardware][Gaudi][Doc] Add missing step in setup instructions (#12382) 2025-01-24 09:43:49 +00:00
c7c9851036 [ci/build] fix wheel size check (#12396)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 17:31:25 +08:00
3c818bdb42 [Misc] Use VisionArena Dataset for VLM Benchmarking (#12389)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-24 00:22:04 -08:00
6dd94dbe94 [perf] fix perf regression from #12253 (#12380)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 11:34:27 +08:00
0e74d797ce [V1] Increase default batch size for H100/H200 (#12369)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-24 03:19:55 +00:00
55ef66edf4 Update compressed-tensors version (#12367) 2025-01-24 11:19:42 +08:00
5e5630a478 [Bugfix] Path join when building local path for S3 clone (#12353)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
2025-01-24 11:06:07 +08:00
d3d6bb13fb Set weights_only=True when using torch.load() (#12366)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-24 02:17:30 +00:00
24b0205f58 [V1][Frontend] Coalesce bunched RequestOutputs (#12298)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2025-01-23 17:17:41 -08:00
c5cffcd0cd [Docs] Update spec decode + structured output in compat matrix (#12373)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-24 01:15:52 +00:00
682b55bc07 [Docs] Add meetup slides (#12345)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-23 14:10:03 -08:00
9726ad676d [Misc] Fix OpenAI API Compatibility Issues in Benchmark Script (#12357)
Signed-off-by: Junichi Sato <junichi.sato@sbintuitions.co.jp>
2025-01-23 17:02:13 -05:00
eb5cb5e528 [BugFix] Fix parameter names and process_after_weight_loading for W4A16 MoE Group Act Order (#11528)
Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-01-23 21:40:33 +00:00
2cbeedad09 [Docs] Document Phi-4 support (#12362)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-23 19:18:51 +00:00
2c85529bfc [TPU] Update TPU CI to use torchxla nightly on 20250122 (#12334)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-01-23 18:50:16 +00:00
e97f802b2d [FP8][Kernel] Dynamic kv cache scaling factors computation (#11906)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
2025-01-23 18:04:03 +00:00
6e650f56a1 [torch.compile] decouple compile sizes and cudagraph sizes (#12243)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 02:01:30 +08:00
3f50c148fd [core] add wake_up doc and some sanity check (#12361)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-24 02:00:50 +08:00
8c01b8022c [Bugfix] Fix broken internvl2 inference with v1 (#12360)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-23 17:20:33 +00:00
99d01a5e3d [V1] Simplify M-RoPE (#12352)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: imkero <kerorek@outlook.com>
2025-01-23 23:13:23 +08:00
d07efb31c5 [Doc] Troubleshooting errors during model inspection (#12351)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-23 22:46:58 +08:00
978b45f399 [Kernel] Flash Attention 3 Support (#12093)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-01-23 06:45:48 -08:00
c5b4b11d7f [Bugfix] Fix k_proj's bias for whisper self attention (#12342)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-23 10:15:33 +00:00
8ae5ff2009 [Hardware][Gaudi][BugFix] Fix dataclass error due to triton package update (#12338)
Signed-off-by: zhenwei <zhenweiliu@habana.ai>
2025-01-23 08:35:46 +00:00
511627445e [doc] explain common errors around torch.compile (#12340)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-23 14:56:02 +08:00
f0ef37233e [V1] Add uncache_blocks (#12333) 2025-01-23 04:19:21 +00:00
7551a34032 [Docs] Document vulnerability disclosure process (#12326)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-23 03:44:09 +00:00
01a55941f5 [Docs] Update FP8 KV Cache documentation (#12238)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-01-23 11:18:09 +08:00
8d7aa9de71 [Bugfix] Fixing AMD LoRA CI test. (#12329)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-01-23 10:53:02 +08:00
68c4421b6d [AMD][Quantization] Add TritonScaledMMLinearKernel since int8 is broken for AMD (#12282)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-01-23 00:10:37 +00:00
aea94362c9 [Frontend][V1] Online serving performance improvements (#12287) 2025-01-22 22:22:12 +00:00
7206ce4ce1 [Core] Support reset_prefix_cache (#12284) 2025-01-22 18:52:27 +00:00
96f6a7596f [Bugfix] Fix HPU multiprocessing executor (#12167)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2025-01-23 02:07:07 +08:00
84bee4bd5c [Misc] Improve the readability of BNB error messages (#12320)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-22 16:56:54 +00:00
fc66dee76d [Misc] Fix the error in the tip for the --lora-modules parameter (#12319)
Signed-off-by: wangerxiao <863579016@qq.com>
2025-01-22 16:48:41 +00:00
6609cdf019 [Doc] Add docs for prompt replacement (#12318)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-22 14:56:29 +00:00
16366ee8bb [Bugfix][VLM] Fix mixed-modality inference backward compatibility for V0 (#12313)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-22 21:06:36 +08:00
528dbcac7d [Model][Bugfix]: correct Aria model output (#12309)
Signed-off-by: xffxff <1247714429@qq.com>
2025-01-22 11:39:19 +00:00
cd7b6f0857 [VLM] Avoid unnecessary tokenization (#12310)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-22 11:08:31 +00:00
68ad4e3a8d [Core] Support fully transparent sleep mode (#11743)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-22 14:39:32 +08:00
4004f144f3 [Build] update requirements of no-device (#12299)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-01-22 14:29:31 +08:00
66818e5b63 [core] separate builder init and builder prepare for each batch (#12253)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-22 14:13:52 +08:00
222a9dc350 [Benchmark] More accurate TPOT calc in benchmark_serving.py (#12288)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-01-22 13:46:14 +08:00
cbdc4ad5a5 [Ci/Build] Fix mypy errors on main (#12296)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-22 12:06:54 +08:00
016e3676e7 [CI] add docker volume prune to neuron CI (#12291)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-01-22 10:47:49 +08:00
64ea24d0b3 [ci/lint] Add back default arg for pre-commit (#12279)
Signed-off-by: kevin <kevin@anyscale.com>
2025-01-22 01:15:27 +00:00
df76e5af26 [VLM] Simplify post-processing of replacement info (#12269)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-21 16:48:13 -08:00
09ccc9c8f7 [Documentation][AMD] Add information about prebuilt ROCm vLLM docker for perf validation purpose (#12281)
Signed-off-by: Hongxia Yang <hongxyan@amd.com>
2025-01-22 07:49:22 +08:00
69196a9bc7 [BUGFIX] When skip_tokenize_init and multistep are set, execution crashes (#12277)
Signed-off-by: maleksan85 <maleksan@amd.com>
Co-authored-by: maleksan85 <maleksan@amd.com>
2025-01-21 23:30:46 +00:00
2acba47d9b [bugfix] moe tuning. rm is_navi() (#12273)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-21 22:47:32 +00:00
9c485d9e25 [Core] Free CPU pinned memory on environment cleanup (#10477) 2025-01-21 11:56:41 -08:00
fa9ee08121 [Misc] Set default backend to SDPA for get_vit_attn_backend (#12235)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-21 11:52:11 -08:00
347eeebe3b [Misc] Remove experimental dep from tracing.py (#12007)
Signed-off-by: Adrian Cole <adrian.cole@elastic.co>
2025-01-21 11:51:55 -08:00
18fd4a8331 [Bugfix] Multi-sequence broken (#11898)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-01-21 11:51:35 -08:00
132a132100 [v1][stats][1/n] Add RequestStatsUpdate and RequestStats types (#10907)
Signed-off-by: rickyx <rickyx@anyscale.com>
2025-01-21 11:51:13 -08:00
1e60f87bb3 [Kernel] fix moe_align_block_size error condition (#12239)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-01-21 10:30:28 -08:00
9705b90bcf [Bugfix] fix race condition that leads to wrong order of tokens returned (#10802)
Signed-off-by: Jannis Schönleber <joennlae@gmail.com>
2025-01-21 09:47:04 -08:00
3aec49e56f [ci/build] update nightly torch for gh200 test (#12270)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-21 23:03:17 +08:00
c64612802b [Platform] improve platforms getattr (#12264)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-01-21 14:42:41 +00:00
9a7c3a0042 Remove pytorch comments for outlines + compressed-tensors (#12260)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-01-21 21:49:08 +08:00
b197a5ccfd [V1][Bugfix] Fix data item ordering in mixed-modality inference (#12259)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-21 13:18:43 +00:00
c81081fece [torch.compile] transparent compilation with more logging (#12246)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-21 19:32:55 +08:00
a94eee4456 [Bugfix] Fix mm_limits access for merged multi-modal processor (#12252)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-21 10:09:39 +00:00
f2e9f2a3be [Misc] Remove redundant TypeVar from base model (#12248)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-21 08:40:39 +00:00
1f1542afa9 [Misc] Add BNB quantization for PaliGemmaForConditionalGeneration (#12237)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-21 07:49:08 +00:00
96912550c8 [Misc] Rename MultiModalInputsV2 -> MultiModalInputs (#12244)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-21 07:31:19 +00:00
2fc6944c5e [ci/build] disable failed and flaky tests (#12240)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-21 13:25:03 +08:00
5fe6bf29d6 [BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 (#12230)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-01-21 12:23:14 +08:00
d4b62d4641 [AMD][Build] Porting dockerfiles from the ROCm/vllm fork (#11777)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-01-21 12:22:23 +08:00
ecf67814f1 Add quantization and guided decoding CODEOWNERS (#12228)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-20 18:23:40 -07:00
750f4cabfa [Kernel] optimize moe_align_block_size for cuda graph and large num_experts (e.g. DeepSeek-V3) (#12222)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-01-20 16:42:16 -08:00
06a760d6e8 [bugfix] catch xgrammar unsupported array constraints (#12210)
Signed-off-by: Jason Cheng <jasoncky96@gmail.com>
2025-01-20 16:42:02 -08:00
da7512215f [misc] add cuda runtime version to usage data (#12190)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-01-21 00:31:01 +00:00
af69a6aded fix: update platform detection for M-series arm based MacBook processors (#12227)
Signed-off-by: isikhi <huseyin.isik000@gmail.com>
2025-01-20 22:23:28 +00:00
7bd3630067 [Misc] Update CODEOWNERS (#12229) 2025-01-20 22:19:09 +00:00
96663699b2 [CI] Pass local python version explicitly to pre-commit mypy.sh (#12224)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-20 23:49:18 +08:00
18572e3384 [Bugfix] Fix HfExampleModels.find_hf_info (#12223)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-20 15:35:36 +00:00
86bfb6dba7 [Misc] Pass attention to impl backend (#12218)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-20 23:25:28 +08:00
5f0ec3935a [V1] Remove _get_cache_block_size (#12214)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-20 21:54:16 +08:00
c222f47992 [core][bugfix] configure env var during import vllm (#12209)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-20 19:35:59 +08:00
170eb35079 [misc] print a message to suggest how to bypass commit hooks (#12217)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-20 18:06:24 +08:00
b37d82791e [Model] Upgrade Aria to transformers 4.48 (#12203)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-20 17:58:48 +08:00
3127e975fb [CI/Build] Make pre-commit faster (#12212)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-20 17:36:24 +08:00
4001ea1266 [CI/Build] Remove dummy CI steps (#12208)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-20 16:41:57 +08:00
5c89a29c22 [misc] add placeholder format.sh (#12206)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-20 16:04:49 +08:00
59a0192fb9 [Core] Interface for accessing model from VllmRunner (#10353)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-20 15:00:59 +08:00
83609791d2 [Model] Add Qwen2 PRM model support (#12202)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-20 14:59:46 +08:00
0974c9bc5c [Bugfix] Fix incorrect types in LayerwiseProfileResults (#12196)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-20 14:59:20 +08:00
d2643128f7 [DOC] Add missing docstring in LLMEngine.add_request() (#12195)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-20 14:59:00 +08:00
c5c06209ec [DOC] Fix typo in docstring and assert message (#12194)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-20 14:58:29 +08:00
3ea7b94523 Move linting to pre-commit (#11975)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-20 14:58:01 +08:00
51ef828f10 [torch.compile] fix sym_tensor_indices (#12191)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-20 11:37:50 +08:00
df450aa567 [Bugfix] Fix num_heads value for simple connector when tp enabled (#12074)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-01-20 02:56:43 +00:00
bbe5f9de7d [Model] Support for fairseq2 Llama (#11442)
Signed-off-by: Martin Gleize <mgleize@meta.com>
Co-authored-by: mgleize user <mgleize@a100-st-p4de24xlarge-4.fair-a100.hpcaas>
2025-01-19 10:40:40 -08:00
81763c58a0 [V1] Add V1 support of Qwen2-VL (#12128)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: imkero <kerorek@outlook.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-19 19:52:13 +08:00
edaae198e7 [Misc] Add BNB support to GLM4-V model (#12184)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-19 19:49:22 +08:00
936db119ed benchmark_serving: support --served-model-name param (#12109)
Signed-off-by: zibai <zibai.gj@alibaba-inc.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-01-19 09:59:56 +00:00
e66faf4809 [torch.compile] store inductor compiled Python file (#12182)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-19 16:27:26 +08:00
630eb5b5ce [Bugfix] Fix multi-modal processors for transformers 4.48 (#12187) 2025-01-18 19:16:34 -08:00
4e94951bb1 [BUGFIX] Move scores to float32 in case of running xgrammar on cpu (#12152)
Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>
2025-01-19 11:12:05 +08:00
7a8a48d51e [V1] Collect env var for usage stats (#12115) 2025-01-19 03:07:15 +00:00
32eb0da808 [Misc] Support register quantization method out-of-tree (#11969) 2025-01-18 16:13:16 -08:00
6d0e3d3724 [core] clean up executor class hierarchy between v1 and v0 (#12171)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-18 14:35:15 +08:00
02798ecabe [Model] Port deepseek-vl2 processor, remove dependency (#12169)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-18 13:59:39 +08:00
813f249f02 [Docs] Fix broken link in SECURITY.md (#12175)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-18 04:35:21 +00:00
da02cb4b27 [core] further polish memory profiling (#12126)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-18 12:25:08 +08:00
c09503ddd6 [AMD][CI/Build][Bugfix] use pytorch stale wheel (#12172)
Signed-off-by: hongxyan <hongxyan@amd.com>
2025-01-18 11:15:53 +08:00
2b83503227 [misc] fix cross-node TP (#12166)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-18 10:53:27 +08:00
7b98a65ae6 [torch.compile] disable logging when cache is disabled (#12043)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-17 20:29:31 +00:00
b5b57e301e [AMD][FP8] Using MI300 FP8 format on ROCm for block_quant (#12134)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-01-17 17:12:26 +00:00
54cacf008f [Bugfix] Mistral tokenizer encode accept list of str (#12149)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-01-17 16:47:53 +00:00
58fd57ff1d [Bugfix] Fix score api for missing max_model_len validation (#12119)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
2025-01-17 16:24:22 +00:00
87a0c076af [core] allow callable in collective_rpc (#12151)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-17 20:47:01 +08:00
d4e6194570 [CI/Build][CPU][Bugfix] Fix CPU CI (#12150)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-01-17 19:39:52 +08:00
07934cc237 [Misc][LoRA] Improve the readability of LoRA error messages (#12102)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-17 19:32:28 +08:00
69d765f5a5 [V1] Move more control of kv cache initialization from model_executor to EngineCore (#11960)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-01-17 07:39:35 +00:00
8027a72461 [ROCm][MoE] moe tuning support for rocm (#12049)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-01-17 14:49:16 +08:00
d75ab55f10 [Misc] Add deepseek_vl2 chat template (#12143)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-17 06:34:48 +00:00
d1adb9b403 [BugFix] add more is not None check in VllmConfig.__post_init__ (#12138)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-17 05:33:22 +00:00
b8bfa46a18 [Bugfix] Fix issues in CPU build Dockerfile (#12135)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-17 12:54:01 +08:00
1475847a14 [Doc] Add instructions on using Podman when SELinux is active (#12136)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-01-17 04:45:36 +00:00
fead53ba78 [CI] add genai-perf benchmark in nightly benchmark (#10704)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-01-17 04:15:09 +00:00
ebc73f2828 [Bugfix] Fix a path bug in disaggregated prefill example script. (#12121)
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-01-17 11:12:41 +08:00
d06e824006 [Bugfix] Set enforce_eager automatically for mllama (#12127)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-16 15:30:08 -05:00
62b06ba23d [Model] Add support for deepseek-vl2-tiny model (#12068)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-16 17:14:48 +00:00
5fd24ec02e [misc] Add LoRA kernel micro benchmarks (#11579) 2025-01-16 15:51:40 +00:00
874f7c292a [Bugfix] Fix max image feature size for Llava-one-vision (#12104)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-16 14:54:06 +00:00
92e793d91a [core] LLM.collective_rpc interface and RLHF example (#12084)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-16 20:19:52 +08:00
bf53e0c70b Support torchrun and SPMD-style offline inference (#12071)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-16 19:58:53 +08:00
dd7c9ad870 [Bugfix] Remove hardcoded head_size=256 for Deepseek v2 and v3 (#12067)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-16 10:11:54 +00:00
9aa1519f08 Various cosmetic/comment fixes (#12089)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-16 09:59:06 +00:00
f8ef146f03 [Doc] Add documentation for specifying model architecture (#12105) 2025-01-16 15:53:43 +08:00
fa0050db08 [Core] Default to using per_token quantization for fp8 when cutlass is supported. (#8651)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-16 04:31:27 +00:00
cd9d06fb8d Allow hip sources to be directly included when compiling for rocm. (#12087) 2025-01-15 16:46:03 -05:00
ebd8c669ef [Bugfix] Fix _get_lora_device for HQQ marlin (#12090)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-01-15 19:59:42 +00:00
70755e819e [V1][Core] Autotune encoder cache budget (#11895)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-15 11:29:00 -08:00
edce722eaa [Bugfix] use right truncation for non-generative tasks (#12050)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-01-16 00:31:01 +08:00
57e729e874 [Doc]: Update OpenAI-Compatible Server documents (#12082) 2025-01-15 16:07:45 +00:00
de0526f668 [Misc][Quark] Upstream Quark format to VLLM (#10765)
Signed-off-by: kewang-xlnx <kewang@xilinx.com>
Signed-off-by: kewang2 <kewang2@amd.com>
Co-authored-by: kewang2 <kewang2@amd.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-01-15 11:05:15 -05:00
5ecf3e0aaf Misc: allow use of proxy in HTTPConnection (#12042)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
2025-01-15 13:16:40 +00:00
97eb97b5a4 [Model]: Support internlm3 (#12037) 2025-01-15 11:35:17 +00:00
3adf0ffda8 [Platform] Do not raise error if _Backend is not found (#12023)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-01-15 10:14:15 +00:00
ad388d25a8 Type-fix: make execute_model output type optional (#12020) 2025-01-15 09:44:56 +00:00
cbe94391eb Fix: cases with empty sparsity config (#12057)
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
2025-01-15 17:41:24 +08:00
994fc655b7 [V1][Prefix Cache] Move the logic of num_computed_tokens into KVCacheManager (#12003) 2025-01-15 07:55:30 +00:00
3f9b7ab9f5 [Doc] Update examples to remove SparseAutoModelForCausalLM (#12062)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-01-15 06:36:01 +00:00
ad34c0df0f [core] platform agnostic executor via collective_rpc (#11256)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-15 13:45:21 +08:00
f218f9c24d [core] Turn off GPU communication overlap for Ray executor (#12051)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-01-15 05:19:55 +00:00
0794e7446e [Misc] Add multistep chunked-prefill support for FlashInfer (#10467) 2025-01-15 12:47:49 +08:00
b7ee940a82 [V1][BugFix] Fix edge case in VLM scheduling (#12065)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-14 20:21:28 -08:00
9ddac56311 [Platform] move current_memory_usage() into platform (#11369)
Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-01-15 03:38:25 +00:00
1a51b9f872 [HPU][Bugfix] Don't use /dev/accel/accel0 for HPU autodetection in setup.py (#12046)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2025-01-15 02:59:18 +00:00
42f5e7c52a [Kernel] Support MulAndSilu (#11624)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-15 02:29:53 +00:00
a3a3ee4e6f [Misc] Merge bitsandbytes_stacked_params_mapping and packed_modules_mapping (#11924)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-15 07:49:49 +08:00
87054a57ab [Doc]: Update the Json Example of the Engine Arguments document (#12045) 2025-01-14 17:03:04 +00:00
c9d6ff530b Explain where the engine args go when using Docker (#12041)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-14 16:05:50 +00:00
a2d2acb4c8 [Bugfix][Kernel] Give unique name to BlockSparseFlashAttention (#12040)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-14 15:45:05 +00:00
2e0e017610 [Platform] Add output for Attention Backend (#11981)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-14 13:27:04 +00:00
1f18adb245 [Kernel] Revert the API change of Attention.forward (#12038)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-14 20:59:32 +08:00
bb354e6b2d [Bugfix] Fix various bugs in multi-modal processor (#12031)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-14 12:16:11 +00:00
ff39141a49 [HPU][misc] add comments for explanation (#12034)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-14 19:24:06 +08:00
8a1f938e6f [Doc] Update Quantization Hardware Support Documentation (#12025)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-01-14 04:37:52 +00:00
078da31903 [HPU][Bugfix] set_forward_context and CI test execution (#12014)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2025-01-14 11:04:18 +08:00
1a401252b5 [Docs] Add Sky Computing Lab to project intro (#12019)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-13 17:24:36 -08:00
f35ec461fc [Bugfix] Fix deepseekv3 gate bias error (#12002)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-01-13 13:43:51 -07:00
289b5191d5 [Doc] Fix build from source and installation link in README.md (#12013)
Signed-off-by: Yikun <yikunkero@gmail.com>
2025-01-13 17:23:59 +00:00
c6db21313c bugfix: Fix signature mismatch in benchmark's get_tokenizer function (#11982)
Signed-off-by: elijah <f1renze.142857@gmail.com>
2025-01-13 15:22:07 +00:00
a7d59688fb [Platform] Move get_punica_wrapper() function to Platform (#11516)
Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-01-13 13:12:10 +00:00
458e63a2c6 [platform] add device_control env var (#12009)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-13 20:59:09 +08:00
e8c23ff989 [Doc] Organise installation documentation into categories and tabs (#11935)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-13 12:27:36 +00:00
cd8249903f [Doc][V1] Update model implementation guide for V1 support (#11998)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-01-13 11:58:54 +00:00
0f8cafe2d1 [Kernel] unified_attention for Attention.forward (#11967)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-13 19:28:53 +08:00
5340a30d01 Fix Max Token ID for Qwen-VL-Chat (#11980)
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
2025-01-13 08:37:48 +00:00
89ce62a316 [platform] add ray_device_key (#11948)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-13 16:20:52 +08:00
c3f05b09a0 [Misc] Minor Changes about Worker (#11555)
Signed-off-by: Chenguang Li <757486878@qq.com>
2025-01-13 15:47:05 +08:00
cf6bbcb493 [Misc] Fix Deepseek V2 fp8 kv-scale remapping (#11947)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-01-12 23:05:06 -08:00
80ea3af1a0 [CI][Spec Decode] fix: broken test for EAGLE model (#11972)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
2025-01-13 06:50:35 +00:00
9dd02d85ca [Bug] Fix usage of .transpose() and .view() consecutively. (#11979) 2025-01-13 06:24:10 +00:00
f7b3ba82c3 [MISC] fix typo in kv transfer send recv test (#11983) 2025-01-13 05:07:48 +00:00
619ae268c3 [V1] [2/n] Logging and Metrics - OutputProcessor Abstraction (#11973)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-01-13 04:54:10 +00:00
d14e98d924 [Model] Support GGUF models newly added in transformers 4.46.0 (#9685)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-01-13 00:13:44 +00:00
9597a095f2 [V1][Core][1/n] Logging and Metrics (#11962)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-01-12 21:02:02 +00:00
263a870ee1 [Hardware][TPU] workaround fix for MoE on TPU (#11764) 2025-01-12 10:53:51 -05:00
8bddb73512 [Hardware][CPU] Multi-LoRA implementation for the CPU backend (#11100)
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-12 13:01:52 +00:00
f967e51f38 [Model] Initialize support for Deepseek-VL2 models (#11578)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-01-12 00:17:24 -08:00
43f3d9e699 [CI/Build] Add markdown linter (#11857)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2025-01-12 00:17:13 -08:00
b25cfab9a0 [V1] Avoid sending text prompt to core engine (#11963)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-12 06:36:38 +00:00
4b657d3292 [Model] Add cogagent model support to vLLM (#11742)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-11 19:05:56 +00:00
d697dc01b4 [Bugfix] Fix RobertaModel loading (#11940)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-01-11 14:05:09 +00:00
a991f7d508 [Doc] Basic guide for writing unit tests for new models (#11951)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-11 21:27:24 +08:00
7a3a83e3b8 [CI/Build] Move model-specific multi-modal processing tests (#11934)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-11 13:50:05 +08:00
c32a7c7c0c [Bugfix] fused_experts_impl wrong compute type for float32 (#11921)
Signed-off-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
Co-authored-by: shaochangxu.scx <shaochangxu.scx@antgroup.com>
2025-01-11 13:49:39 +08:00
2118d0565c [Bugfix][SpecDecode] Adjust Eagle model architecture to align with intended design (#11672)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
2025-01-10 20:49:38 -08:00
899136b857 [ci] fix broken distributed-tests-4-gpus (#11937)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-11 09:07:24 +08:00
c9f09a4fe8 [mypy] Fix mypy warnings in api_server.py (#11941)
Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
2025-01-11 01:04:58 +00:00
d45cbe70f5 [Bugfix] Check that number of images matches number of <|image|> tokens with mllama (#11939)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-01-10 23:26:00 +00:00
8a579408f3 [Misc] Update benchmark_prefix_caching.py: fix example usage (#11920)
Signed-off-by: Ren MinMin <renmm6@chinaunicom.cn>
Co-authored-by: Ren MinMin <renmm6@chinaunicom.cn>
2025-01-10 20:39:22 +00:00
46fa98ccad [Misc] Clean up debug code in Deepseek-V3 (#11930)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-01-10 19:19:15 +00:00
aa1e77a19c [Hardware][CPU] Support MOE models on x86 CPU (#11831)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-01-10 11:07:58 -05:00
5959564f94 Doc fix in benchmark_long_document_qa_throughput.py (#11933)
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-01-10 23:51:43 +08:00
f33e033e27 [Docs] Fix docstring in get_ip function (#11932)
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-01-10 23:51:02 +08:00
482cdc494e [Doc] Rename offline inference examples (#11927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-10 23:50:29 +08:00
20410b2fda [platform] support custom torch.compile backend key (#11318)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-01-10 23:46:51 +08:00
12664ddda5 [Doc] [1/N] Initial guide for merged multi-modal processor (#11925)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-10 14:30:25 +00:00
241ad7b301 [ci] Fix sampler tests (#11922)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-10 20:45:33 +08:00
d85c47d6ad Replace "online inference" with "online serving" (#11923)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-10 12:05:56 +00:00
ef725feafc [platform] support pytorch custom op pluggable (#11328)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-01-10 10:02:38 +00:00
d907be7dc7 [misc] remove python function call for custom activation op (#11885)
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-01-10 17:18:25 +08:00
d53575a5f0 [ci] fix gh200 tests (#11919)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-10 16:25:17 +08:00
61af633256 [BUGFIX] Fix UnspecifiedPlatform package name (#11916)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-01-10 16:20:46 +08:00
ac2f3f7fee [Bugfix] Validate lora adapters to avoid crashing server (#11727)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-10 15:56:36 +08:00
cf5f000d21 [torch.compile] Hide KV cache behind torch.compile boundary (#11677)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-10 13:14:42 +08:00
3de2b1eafb [Doc] Show default pooling method in a table (#11904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-10 11:25:20 +08:00
b844b99ad3 [VLM] Enable tokenized inputs for merged multi-modal processor (#11900)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-10 03:24:00 +00:00
c3cf54dda4 [Doc][5/N] Move Community and API Reference to the bottom (#11896)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-01-10 03:10:12 +00:00
36f5303578 [Docs] Add Modal to deployment frameworks (#11907) 2025-01-09 23:26:37 +00:00
9a228348d2 [Misc] Provide correct Pixtral-HF chat template (#11891)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-09 10:19:37 -07:00
bd82872211 [ci] try to fix flaky multi-step tests (#11894)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-09 14:47:29 +00:00
405eb8e396 [platform] Allow platform specify attention backend (#11609)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-01-09 21:46:50 +08:00
65097ca0af [Doc] Add model development API Reference (#11884)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-09 09:43:40 +00:00
1d967acb45 [Bugfix] fix beam search input errors and latency benchmark script (#11875)
Signed-off-by: Ye Qi <yeq@meta.com>
Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com>
2025-01-09 17:36:39 +08:00
0bd1ff4346 [Bugfix] Override dunder methods of placeholder modules (#11882)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-09 09:02:53 +00:00
310aca88c9 [perf]fix current stream (#11870)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-09 07:18:21 +00:00
a732900efc [Doc] Intended links to Python multiprocessing library (#11878) 2025-01-09 05:39:39 +00:00
d848800e88 [Misc] Move print_*_once from utils to logger (#11298)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
Co-authored-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
2025-01-09 12:48:12 +08:00
730e9592e9 [Doc] Recommend uv and python 3.12 for quickstart guide (#11849)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-09 11:37:48 +08:00
1fe554bac3 treat do_lower_case in the same way as the sentence-transformers library (#11815)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-01-09 11:05:43 +08:00
615e4a5401 [CI] Turn on basic correctness tests for V1 (#10864) 2025-01-08 21:20:44 -05:00
3db0cafdf1 [Docs] Add Google Cloud Meetup (#11864) 2025-01-08 12:38:28 -08:00
526de822d5 [Kernel][Triton][AMD] Use block size heuristic for avg 2.8x speedup for int8 models (#11698)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-01-08 20:23:15 +00:00
56fe4c297c [TPU][Quantization] TPU W8A8 (#11785)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-08 19:33:29 +00:00
47de8821d3 [Misc] add some explanations for BlockHashType (#11847) 2025-01-08 18:21:30 +00:00
5984499e47 [Doc] Expand Multimodal API Reference (#11852)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 17:14:14 +00:00
ca47e176af [Misc] Move some model utils into vision file (#11848)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 17:04:46 +00:00
78f4590b60 [Bugfix][XPU] fix silu_and_mul (#11823)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-01-09 00:11:50 +08:00
2f7024987e [CI/Build][Bugfix] Fix CPU CI image clean up (#11836)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-01-08 15:18:28 +00:00
6cd40a5bfe [Doc][4/N] Reorganize API Reference (#11843)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 21:34:44 +08:00
aba8d6ee00 [Doc] Move examples into categories (#11840)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-08 13:09:53 +00:00
2a0596bc48 [VLM] Reorganize profiling/processing-related code (#11812)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 18:59:58 +08:00
f12141170a [torch.compile] consider relevant code in compilation cache (#11614)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-08 10:46:43 +00:00
cfd3219f58 [Hardware][Apple] Native support for macOS Apple Silicon (#11696)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-01-08 16:35:49 +08:00
a1b2b8606e [Docs] Update sponsor name: 'Novita' to 'Novita AI' (#11833) 2025-01-07 23:05:46 -08:00
ad9f1aa679 [doc] update wheels url (#11830)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-08 14:36:49 +08:00
889e662eae [misc] improve memory profiling (#11809)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-01-08 06:36:03 +00:00
ef68eb28d8 [Bug] Fix pickling of ModelConfig when RunAI Model Streamer is used (#11825)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 13:40:09 +08:00
259abd8953 [Docs] reorganize sponsorship page (#11639)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-01-07 21:16:08 -08:00
f645eb6954 [Bugfix] Add checks for LoRA and CPU offload (#11810)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-08 13:08:48 +08:00
f4923cb8bc [OpenVINO] Fixed Docker.openvino build (#11732)
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
2025-01-08 13:08:30 +08:00
b640b19cc0 Fixed docker build for ppc64le (#11518)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-01-08 13:05:37 +08:00
dc71af0a71 Remove the duplicate imports of MultiModalKwargs and PlaceholderRange… (#11824) 2025-01-08 04:09:25 +00:00
4d29e91be8 [Misc] sort torch profiler table by kernel timing (#11813) 2025-01-08 10:57:04 +08:00
91445c7bc8 [Bugfix] Fix image input for Pixtral-HF (#11741)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-08 10:17:16 +08:00
5950f555a1 [Doc] Group examples into categories (#11782)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-01-08 09:20:12 +08:00
a4e2b26856 [Bugfix] Significant performance drop on CPUs with --num-scheduler-steps > 1 (#11794) 2025-01-07 16:15:50 -08:00
973f5dc581 [Doc] Add documentation for using EAGLE in vLLM (#11417)
Signed-off-by: Sourashis Roy <sroy@roblox.com>
2025-01-07 19:19:12 +00:00
c994223d56 [Bugfix] update the prefix for qwen2 (#11795)
Co-authored-by: jiadi.jjd <jiadi.jjd@antgroup.com>
2025-01-07 18:36:34 +00:00
869579a702 [optimization] remove python function call for custom op (#11750)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-07 17:04:28 +00:00
c0efe92d8b [Doc] Add note to gte-Qwen2 models (#11808)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-07 21:50:58 +08:00
d9fa1c05ad [doc] update how pip can install nightly wheels (#11806)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-07 21:42:58 +08:00
2de197bdd4 [V1] Support audio language models on V1 (#11733)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-07 19:47:36 +08:00
869e829b85 [doc] add doc to explain how to use uv (#11773)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-01-07 18:41:17 +08:00
8f37be38eb [Bugfix] Comprehensively test and fix LLaVA-NeXT feature size calculation (#11800)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-07 18:25:02 +08:00
8082ad7950 [V1][Doc] Update V1 support for LLaVa-NeXT-Video (#11798)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-07 09:55:39 +00:00
1e4ce295ae [CI][CPU] adding build number to docker image name (#11788)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
2025-01-07 07:28:01 +00:00
ce1917fcf2 [Doc] Create a vulnerability management team (#9925)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-01-06 22:57:32 -08:00
e512f76a89 fix init error for MessageQueue when n_local_reader is zero (#11768) 2025-01-07 06:12:48 +00:00
898cdf033e [CI] Fix neuron CI and run offline tests (#11779)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-01-06 21:36:10 -08:00
0f3f3c86ec [Bugfix] Update attention interface in Whisper (#11784)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-07 04:36:24 +00:00
b278557935 [Kernel][LoRA]Punica prefill kernels fusion (#11234)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Co-authored-by: Zhonghua Deng <abatom@163.com>
2025-01-07 04:01:39 +00:00
8ceffbf315 [Doc][3/N] Reorganize Serving section (#11766)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-07 11:20:01 +08:00
d93d2d74fd [XPU] Make pp group initialized for pipeline-parallelism (#11648)
Signed-off-by: yisheng <yi.sheng@intel.com>
2025-01-07 11:09:58 +08:00
d0169e1b0f [Model] Future-proof Qwen2-Audio multi-modal processor (#11776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-07 11:05:17 +08:00
08fb75c72e [Bugfix] Fix LLaVA-NeXT feature size precision error (for real) (#11772)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-07 01:10:54 +00:00
91b361ae89 [V1] Extend beyond image modality and support mixed-modality inference with Llava-OneVision (#11685)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-06 19:58:16 +00:00
e20c92bb61 [Kernel] Move attn_type to Attention.__init__() (#11690)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-01-07 00:11:28 +08:00
32c9eff2ff [Bugfix][V1] Fix molmo text-only inputs (#11676)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-06 15:22:25 +00:00
4ca5d40adc [doc] explain how to add interleaving sliding window support (#11771)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-06 21:57:44 +08:00
9279b9f83d [Bugfix] Fix max image size for LLaVA-Onevision (#11769)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-01-06 13:48:53 +00:00
ee77fdb5de [Doc][2/N] Reorganize Models and Usage sections (#11755)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-06 21:40:31 +08:00
996357e480 [VLM] Separate out profiling-related logic (#11746)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-06 16:02:21 +08:00
2a622d704a k8s-config: Update the secret to use stringData (#11679)
Signed-off-by: Suraj Deshmukh <surajd.service@gmail.com>
2025-01-06 08:01:22 +00:00
9c749713f6 [mypy] Forward pass function type hints in lora (#11740)
Signed-off-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: lucast2021 <lucast2021@headroyce.org>
2025-01-06 07:59:36 +00:00
022c5c6944 [V1] Refactor get_executor_cls (#11754) 2025-01-06 07:59:16 +00:00
f8fcca100b [Misc] Fix typo for valid_tool_parses (#11753)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-01-06 07:12:38 +00:00
06bfb51963 [V1] Add BlockTable class (#11693)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-06 14:24:42 +09:00
408e560015 [Bugfix] Remove block size constraint (#11723) 2025-01-06 12:49:55 +08:00
402d378360 [Doc] [1/N] Reorganize Getting Started section (#11645)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-06 02:18:33 +00:00
9e764e7b10 [distributed] remove pynccl's redundant change_state (#11749) 2025-01-06 09:05:48 +08:00
33fc1e2e86 [Frontend] Improve StreamingResponse Exception Handling (#11752) 2025-01-05 16:35:01 -05:00
eba17173d3 fix: [doc] fix typo (#11751)
Co-authored-by: Lancer <maruixiang6688@gmail.com>
2025-01-06 00:48:16 +08:00
635b897246 [distributed] remove pynccl's redundant stream (#11744) 2025-01-05 23:09:11 +08:00
4068f4b5b5 [MISC] Replace c10::optional with std::optional (#11730)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-01-05 10:20:34 +09:00
47831430cc [Bugfix][V1] Fix test_kv_cache_utils.py (#11738)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-04 16:07:59 +00:00
65c08928c2 [Model] Remove unnecessary weight initialization logic (#11736)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-04 23:46:21 +08:00
ba214dffbe [Bugfix] Fix precision error in LLaVA-NeXT (#11735)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-04 23:45:57 +08:00
eed11ebee9 [VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-04 11:40:53 +00:00
300acb8347 [Core][Bugfix] Use correct device to initialize GPU data during CUDA-graph-capture (#11233)
Signed-off-by: Yan Burman <yanburman@users.noreply.github.com>
Signed-off-by: Ido Asraff <idoa@atero.ai>
2025-01-04 14:50:16 +08:00
d91457d529 [V1] Add kv cache utils tests. (#11513)
Signed-off-by: xcnick <xcnick0412@gmail.com>
2025-01-04 14:49:46 +08:00
fbf2564554 [V1] Add RayExecutor support for AsyncLLM (api server) (#11712) 2025-01-04 06:41:31 +00:00
d1d49397e7 Update bnb.md with example for OpenAI (#11718) 2025-01-04 06:29:02 +00:00
9c93636d84 Update tool_calling.md (#11701) 2025-01-04 06:16:30 +00:00
e5d7ed0c53 [V1] log GPU blocks num for MultiprocExecutor (#11656) 2025-01-04 00:13:12 +00:00
ad0d567e1c [V1] Chore: cruft removal (#11724) 2025-01-03 23:25:02 +00:00
bf0d97d786 Update requirements-tpu.txt to support python 3.9 and 3.11 (#11695)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-01-03 22:36:46 +00:00
a655eb3025 [Misc] Add BNB quantization for Qwen2VL (#11719)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-03 15:19:02 -07:00
1543914c04 [V1] Improve TP>1 Error Handling + Stack Trace (#11721)
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-01-03 21:29:11 +00:00
61fed92c7e [Bugfix] Fix ColumnParallelLinearWithLoRA slice (#11708)
Signed-off-by: ZincCat <zincchloride@outlook.com>
2025-01-03 21:02:34 +00:00
80c751e7f6 [V1] Simplify Shutdown (#11659) 2025-01-03 17:25:38 +00:00
e1a5c2f0a1 [Model] Whisper model implementation (#11280)
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
2025-01-03 16:39:19 +08:00
fd3a62a122 [perf-benchmark] Fix dependency for steps in benchmark pipeline (#11710) 2025-01-02 22:38:37 -08:00
07064cb1d4 [Bugfix] Check chain_speculative_sampling before calling it (#11673)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-01-02 16:58:56 -08:00
2f1e8e8f54 Update default max_num_batch_tokens for chunked prefill (#11694) 2025-01-03 00:25:53 +00:00
68d37809b9 [Misc] Minimum requirements for SageMaker compatibility (#11576) 2025-01-02 15:59:25 -08:00
5dba257506 Resolve race conditions in Marlin kernel (#11493)
Signed-off-by: wchen61 <wchen61@foxmail.com>
2025-01-02 22:58:56 +00:00
187e32997c [Bugfix] Change kv scaling factor by param json on nvidia gpu (#11688)
Signed-off-by: bjmsong <bjmsong@126.com>
Co-authored-by: bjmsong <bjmsong@126.com>
2025-01-02 21:11:39 +00:00
b55ed6ef8a [V1][Minor] Optimize token_ids_cpu copy (#11692)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-02 12:04:58 -07:00
2f385183f3 [Bugfix] Free cross attention block table for preempted-for-recompute sequence group. (#10013)
Signed-off-by: Kathy Yu <feiyangyu@google.com>
2025-01-02 10:28:09 -08:00
84c35c374a According to vllm.EngineArgs, the name should be distributed_executor_backend (#11689) 2025-01-02 18:14:16 +00:00
8c38ee7007 [VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-02 16:39:27 +00:00
b6087a6bee [mypy] Pass type checking in vllm/inputs (#11680)
Signed-off-by: Tobias Pitters <tobias.pitters@gmail.com>
2025-01-02 16:18:15 +00:00
23c1b10a4c [VLM][Bugfix] Multi-modal processor compatible with V1 multi-input (#11674)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-02 17:00:00 +08:00
a115ac46b5 [VLM] Move supported limits and max tokens to merged multi-modal processor (#11669)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-01 15:44:42 +00:00
73001445fb [V1] Implement Cascade Attention (#11635)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-01 21:56:46 +09:00
6d70198b17 [Doc] Fix typo (#11666)
Signed-off-by: Kazuhiro Serizawa <nserihiro@gmail.com>
2025-01-01 08:10:10 +00:00
f962f426bc [Misc] Replace space with - in the file names (#11667)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-01-01 07:39:30 +00:00
11d8a091c6 [Misc] Optimize Qwen2-VL LoRA test (#11663)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-01-01 14:42:23 +08:00
365801fedd [VLM] Add max-count checking in data parser for single image models (#11661)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-12-31 22:15:21 -08:00
4db72e57f6 [Bugfix][Refactor] Unify model management in frontend (#11660)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-01-01 02:21:51 +00:00
0c6f998554 [Benchmark] Add benchmark script for CPU offloading (#11533)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: KuntaiDu <kuntai@uchicago.edu>
2025-01-01 00:10:55 +00:00
e7c7c5e822 [V1][VLM] V1 support for selected single-image models. (#11632)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
2024-12-31 21:17:22 +00:00
8c3230d8c1 [V1] Simplify vision block hash for prefix caching by removing offset from hash (#11646) 2024-12-31 08:56:01 +00:00
2c5718809b [Bugfix] Move the _touch(computed_blocks) call in the allocate_slots method to after the check for allocating new blocks. (#11565) 2024-12-31 06:29:04 +00:00
82c49d3260 [Misc][LoRA] Support Rank Stabilized LoRA (RSLoRA) (#6909)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-30 22:15:58 -08:00
74fa1d123c [Bugfix] Fix OpenAI parallel sampling when using xgrammar (#11637)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-31 03:43:54 +00:00
a2a40bcd0d [Model][LoRA]LoRA support added for MolmoForCausalLM (#11439)
Signed-off-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Matthias Vogler <matthias.vogler@joesecurity.org>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-30 17:33:06 -08:00
ccb1aabcca [benchmark] Remove dependency for H100 benchmark step (#11572) 2024-12-30 12:27:07 -08:00
36e7670045 [Bugfix] Validate and concatenate image embeddings in MiniCPMVBaseModel (#11631) 2024-12-30 18:51:04 +00:00
5886aa496e [V1] [6/N] API Server: Better Shutdown (#11586) 2024-12-30 15:51:02 +00:00
8d9b6721e7 [VLM] Abstract out multi-modal data parsing in merged processor (#11620)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-30 15:01:35 +00:00
b12e87f942 [platforms] enable platform plugins (#11602)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-30 20:24:45 +08:00
5dbf854553 [CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels (#11618)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2024-12-30 10:17:04 +00:00
970d6d0776 [Build][Kernel] Update CUTLASS to v3.6.0 (#11607)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-30 17:22:13 +08:00
628ec6c17b [Docker] bump up neuron sdk v2.21 (#11593)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2024-12-30 13:46:14 +08:00
3682e33f9f [v1] fix compilation cache (#11598)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-30 04:24:12 +00:00
0aa38d16f5 Remove print statement in DeepseekScalingRotaryEmbedding (#11604) 2024-12-29 20:16:46 +00:00
faef77c0d6 [Misc] KV cache transfer connector registry (#11481)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2024-12-29 16:08:09 +00:00
dba4d9dec6 [v1][bugfix] fix cudagraph with inplace buffer assignment (#11596)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-29 09:03:49 +00:00
32b4c63f02 [Doc] Convert list tables to MyST (#11594)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-29 15:56:22 +08:00
4fb8e329fd [V1] [5/N] API Server: unify Detokenizer and EngineCore input (#11545)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2024-12-28 20:51:57 +00:00
328841d002 [bugfix] interleaving sliding window for cohere2 model (#11583)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-28 16:55:42 +00:00
d427e5cfda [Doc] Minor documentation fixes (#11580)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-28 21:53:59 +08:00
42bb201fd6 [V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-28 13:33:12 +00:00
59d6bb4c86 [Hardware][AMD]: Replace HIPCC version with more precise ROCm version (#11515)
Signed-off-by: hjwei <hjwei_xd@163.com>
2024-12-28 11:17:35 +00:00
b7dcc003dc [Model] Remove hardcoded image tokens ids from Pixtral (#11582)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-28 10:54:23 +00:00
d34be24bb1 [Model] Support InternLM2 Reward models (#11571)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-28 06:14:10 +00:00
b5cbe8eeb3 [Bugfix] Last token measurement fix (#11376)
Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-12-28 11:34:46 +08:00
df04dffade [V1] [4/N] API Server: ZMQ/MP Utilities (#11541) 2024-12-28 01:45:08 +00:00
a60731247f [Doc] Update mllama example based on official doc (#11567)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2024-12-28 00:31:10 +00:00
ac79799403 [Bugfix] Fix for ROCm compressed tensor support (#11561) 2024-12-27 20:12:11 +00:00
dde1fa18c9 [Misc] Improve BNB loader to handle mixture of sharded and merged weights with same suffix (#11566)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-27 19:45:13 +00:00
0240402c46 [Misc] Add BNB quantization for MolmoForCausalLM (#11551)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-27 18:48:24 +00:00
55509c2114 [MODEL] LoRA support for Jamba model (#11209)
Signed-off-by: Erez Schwartz <erezs@ai21.com>
2024-12-27 17:58:21 +00:00
101418096f [VLM] Support caching in merged multi-modal processor (#11396)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-27 17:22:48 +00:00
5ce4627a7e [Doc] Add xgrammar in doc (#11549)
Signed-off-by: ccjincong <chenjincong11@gmail.com>
2024-12-27 13:05:10 +00:00
7af553ea30 [Misc] Abstract the logic for reading and writing media content (#11527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-27 19:21:23 +08:00
2c9b8ea2b0 [Bugfix] Fix TeleChat2ForCausalLM weights mapper (#11546)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-27 10:39:15 +00:00
d003f3ea39 Update deploying_with_k8s.md with AMD ROCm GPU example (#11465)
Signed-off-by: Alex He <alehe@amd.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-27 10:00:04 +00:00
6c6f7fe8a8 [Platform] Move model arch check to platform (#11503)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2024-12-27 08:45:25 +00:00
2339d59f92 [BugFix] Fix quantization for all other methods (#11547) 2024-12-26 22:23:29 -08:00
1b875a0ef3 [V1][3/N] API Server: Reduce Task Switching + Handle Abort Properly (#11534) 2024-12-26 21:19:21 -08:00
eb881ed006 [misc] fix typing (#11540)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-27 11:05:08 +08:00
46d4359450 [CI] Fix broken CI (#11543) 2024-12-26 18:49:16 -08:00
81b979f2a8 [V1] Fix yapf (#11538)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-27 09:47:10 +09:00
371d04d39b [V1] Use FlashInfer Sampling Kernel for Top-P & Top-K Sampling (#11394)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-27 09:32:38 +09:00
0c0c2015c5 Update openai_compatible_server.md (#11536)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-12-26 16:26:18 -08:00
82d24f7aac [Docs] Document Deepseek V3 support (#11535)
Signed-off-by: simon-mo <simon.mo@hey.com>
2024-12-26 16:21:56 -08:00
f49777ba62 Deepseek v3 (#11502)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: robertgshaw2-neuralmagic <rshaw@neuralmagic.com>
2024-12-26 16:09:44 -08:00
55fb97f7bd [2/N] API Server: Avoid ulimit footgun (#11530) 2024-12-26 23:43:05 +00:00
2072924d14 [Model] [Quantization] Support deepseek_v3 w8a8 fp8 block-wise quantization (#11523)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: HandH1998 <1335248067@qq.com>
2024-12-26 15:33:30 -08:00
720b10fdc6 [1/N] API Server (Remove Proxy) (#11529) 2024-12-26 23:03:43 +00:00
b85a977822 [Doc] Add video example to openai client for multimodal (#11521)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-26 17:31:29 +00:00
eec906d811 [Misc] Add placeholder module (#11501)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-26 13:12:51 +00:00
f57ee5650d [Model] Modify MolmoForCausalLM MLP (#11510)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-26 13:12:05 +00:00
dcb1a944d4 [V1] Adding min tokens/repetition/presence/frequence penalties to V1 sampler (#10681)
Signed-off-by: Sourashis Roy <sroy@roblox.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 19:02:58 +09:00
7492a36207 [Doc] Add QVQ and QwQ to the list of supported models (#11509)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-12-26 09:44:32 +00:00
aa25985bd1 [Misc][LoRA] Fix LoRA weight mapper (#11495)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-26 15:52:48 +08:00
dbeac95dbb Mypy checking for vllm/compilation (#11496)
Signed-off-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: lucast2021 <lucast2021@headroyce.org>
2024-12-26 05:04:07 +00:00
51a624bf02 [Misc] Move some multimodal utils to modality-specific modules (#11494)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-26 04:23:20 +00:00
6ad909fdda [Doc] Improve GitHub links (#11491)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-25 14:49:26 -08:00
b689ada91e [Frontend] Enable decord to load video from base64 (#11492)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-25 16:33:55 +00:00
fc601665eb [Misc] Update disaggregation benchmark scripts and test logs (#11456)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
2024-12-25 06:58:48 +00:00
9832e5572a [V1] Unify VLLM_ENABLE_V1_MULTIPROCESSING handling in RayExecutor (#11472) 2024-12-24 19:49:46 -08:00
3f3e92e1f2 [Model] Automatic conversion of classification and reward models (#11469)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-24 18:22:22 +00:00
409475a827 [Bugfix] Fix issues in CPU build Dockerfile. Fixes #9182 (#11435)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2024-12-24 16:53:28 +00:00
196c34b0ac [Misc] Move weights mapper (#11443)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-24 13:05:25 +00:00
5c7963249d [attn][tiny fix] fix attn backend in MultiHeadAttention (#11463)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2024-12-24 12:39:36 +00:00
461cde2080 [OpenVINO] Fixed installation conflicts (#11458)
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
2024-12-24 11:38:21 +00:00
7a5286cc04 [Bugfix][Hardware][CPU] Fix CPU input_positions creation for text-only inputs with mrope (#11434)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-24 17:59:51 +08:00
b1b1038fbd [Bugfix] Fix Qwen2-VL LoRA weight loading (#11430)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-24 09:56:10 +00:00
9edca6bf8f [Frontend] Online Pooling API (#11457)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-24 17:54:30 +08:00
4f074fbf53 [Misc] Suppress irrelevant exception stack trace information when CUDA… (#11438)
Co-authored-by: shiquan <shiquan>
2024-12-24 08:43:39 +00:00
a491d6f535 [V1] TP Ray executor (#11107)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-12-23 23:00:12 +00:00
32aa2059ad [Docs] Convert rST to MyST (Markdown) (#11145)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-12-23 22:35:38 +00:00
94d545a1a1 [Doc] Fix typo in the help message of '--guided-decoding-backend' (#11440) 2024-12-23 20:20:44 +00:00
60fb4f3bcf [Bugfix] Add kv cache scales to gemma2.py (#11269) 2024-12-23 19:30:45 +00:00
63afbe9215 [CI] Expand OpenAI test_chat.py guided decoding tests (#11048)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-23 18:35:38 +00:00
8cef6e02dc [Misc] add w8a8 asym models (#11075) 2024-12-23 13:33:20 -05:00
b866cdbd05 [Misc] Add assertion and helpful message for marlin24 compressed models (#11388) 2024-12-24 02:23:38 +08:00
2e726680b3 [Bugfix] torch nightly version in ROCm installation guide (#11423)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2024-12-23 17:20:22 +00:00
5bfb30a529 [Bugfix] Fix CFGGuide and use outlines for grammars that can't convert to GBNF (#11389)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-23 23:06:20 +08:00
e51719ae72 mypy type checking for vllm/worker (#11418)
Signed-off-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: lucast2021 <lucast2021@headroyce.org>
2024-12-23 13:55:49 +00:00
f30581c518 [misc][perf] remove old code (#11425)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-23 08:01:08 +00:00
048fc57a0f [CI] Unblock H100 Benchmark (#11419)
Signed-off-by: simon-mo <simon.mo@hey.com>
2024-12-22 14:17:43 -08:00
f1d1bf6288 [Bugfix] Fix fully sharded LoRAs with Mixtral (#11390)
Signed-off-by: Jason Greene <jason.greene@redhat.com>
2024-12-22 23:25:10 +08:00
72d9c316d3 [cd][release] fix race conditions (#11407)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-22 00:39:11 -08:00
4a9139780a [cd][release] add pypi index for every commit and nightly build (#11404)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-12-21 23:53:44 -08:00
29c748930e [CI] Fix flaky entrypoint tests (#11403)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-21 21:08:44 -08:00
c2d1b075ba [Bugfix] Fix issues for Pixtral-Large-Instruct-2411 (#11393)
Signed-off-by: ywang96 <ywang@example.com>
Co-authored-by: ywang96 <ywang@example.com>
2024-12-21 10:15:03 +00:00
584f0ae40d [V1] Make AsyncLLMEngine v1-v0 opaque (#11383)
Signed-off-by: Ricky Xu <xuchen727@hotmail.com>
2024-12-21 15:14:08 +08:00
51ff216d85 [Bugfix] update should_ignore_layer (#11354)
Signed-off-by: George Ohashi <george@neuralmagic.com>
2024-12-21 06:36:23 +00:00
dd2b5633dd [V1][Bugfix] Skip hashing empty or None mm_data (#11386)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-21 14:22:21 +09:00
47a0b615b4 Add ray[default] to wget to run distributed inference out of box (#11265)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
2024-12-20 13:54:55 -08:00
5d2248d81a [doc] explain nccl requirements for rlhf (#11381)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-20 13:00:56 -08:00
d573aeadcc [Bugfix] Don't log OpenAI field aliases as ignored (#11378)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-20 19:03:50 +00:00
995f56236b [Core] Loading model from S3 using RunAI Model Streamer as optional loader (#10192)
Signed-off-by: OmerD <omer@run.ai>
2024-12-20 16:46:24 +00:00
7c7aa37c69 [CI/Build] fix pre-compiled wheel install for exact tag (#11373)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2024-12-21 00:14:40 +08:00
04139ade59 [V1] Fix profiling for models with merged input processor (#11370)
Signed-off-by: ywang96 <ywang@roblox.com>
2024-12-20 12:04:21 +00:00
1ecc645b8f [doc] backward compatibility for 0.6.4 (#11359)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-19 21:33:53 -08:00
c954f21ac0 [misc] add early error message for custom ops (#11355)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-19 21:18:25 -08:00
86c2d8fd1c [Bugfix] Fix spec decoding when seed is none in a batch (#10863)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
2024-12-20 05:15:31 +00:00
b880ffb87e [Misc] Add tqdm progress bar during graph capture (#11349)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-20 04:35:18 +00:00
7801f56ed7 [ci][gh200] dockerfile clean up (#11351)
Signed-off-by: drikster80 <ed.sealing@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: drikster80 <ed.sealing@gmail.com>
Co-authored-by: cenzhiyao <2523403608@qq.com>
2024-12-19 18:13:06 -08:00
48edab8041 [Bugfix][Hardware][POWERPC] Fix auto dtype failure in case of POWER10 (#11331)
Signed-off-by: Akash Kaothalkar <0052v2@linux.vnet.ibm.com>
2024-12-20 01:32:07 +00:00
a985f7af9f [CI] Adding CPU docker pipeline (#11261)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
2024-12-19 11:46:55 -08:00
e461c262f0 [Misc] Remove unused vllm/block.py (#11336) 2024-12-19 17:54:24 +00:00
276738ce0f [Bugfix] Fix broken CPU compressed-tensors test (#11338)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-19 17:37:31 +00:00
cdf22afdda [Misc] Clean up and consolidate LRUCache (#11339)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-20 00:59:32 +08:00
e24113a8fe [Model] Refactor Qwen2-VL to use merged multimodal processor (#11258)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 16:28:00 +00:00
7379b3d4b2 [V1] Fix multimodal profiling for Molmo (#11325)
Signed-off-by: ywang96 <ywang@example.com>
Co-authored-by: ywang96 <ywang@example.com>
2024-12-19 16:27:22 +00:00
6c7f881541 [Model] Add JambaForSequenceClassification model (#10860)
Signed-off-by: Yehoshua Cohen <yehoshuaco@ai21.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Yehoshua Cohen <yehoshuaco@ai21.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 22:48:06 +08:00
a0f7d53beb [Bugfix] Cleanup Pixtral HF code (#11333)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 13:22:00 +00:00
5aef49806d [Feature] Add load generation config from model (#11164)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
Signed-off-by: Yanyi Liu <wolfsonliu@163.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-12-19 10:50:38 +00:00
98356735ac [misc] benchmark_throughput: Add LoRA (#11267)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-12-19 15:43:16 +08:00
f26c4aeecb [Misc] Optimize ray worker initialization time (#11275)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-12-18 23:38:02 -08:00
8936316d58 [Kernel] Refactor Cutlass c3x (#10049)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-12-19 07:00:18 +00:00
6142ef0ada [VLM] Merged multimodal processor for Qwen2-Audio (#11303)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-19 06:14:17 +00:00
c6b0a7d3ba [V1] Simplify prefix caching logic by removing num_evictable_computed_blocks (#11310) 2024-12-19 04:17:12 +00:00
a30482f054 [CI] Expand test_guided_generate to test all backends (#11313)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-19 04:00:38 +00:00
17ca964273 [Model] IBM Granite 3.1 (#11307)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-12-19 11:27:24 +08:00
5a9da2e6e9 [Bugfix][Build/CI] Fix sparse CUTLASS compilation on CUDA [12.0, 12.2) (#11311)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-19 02:43:30 +00:00
fdea8ec167 [V1] VLM - enable processor cache by default (#11305)
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com>
2024-12-18 18:54:46 -05:00
ca5f54a9b9 [Bugfix] fix minicpmv test (#11304)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-12-18 10:34:26 -08:00
f954fe0e65 [FIX] update openai version (#11287)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2024-12-18 10:17:05 -08:00
362cff1eb3 [CI][Misc] Remove Github Action Release Workflow (#11274) 2024-12-18 10:16:53 -08:00
996aa70f00 [Bugfix] Fix broken phi3-v mm_processor_kwargs tests (#11263)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-18 10:16:40 -08:00
60508ffda9 [Kernel]: Cutlass 2:4 Sparsity + FP8/Int8 Quant Support (#10995)
Co-authored-by: Faraz Shahsavan <faraz.shahsavan@gmail.com>
Co-authored-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2024-12-18 09:57:16 -05:00
f04e407e6b [MISC][XPU] update ipex link for CI fix (#11278) 2024-12-17 22:34:23 -08:00
8b79f9e107 [Bugfix] Fix guided decoding with tokenizer mode mistral (#11046) 2024-12-17 22:34:08 -08:00
866fa4550d [Bugfix] Restore support for larger block sizes (#11259)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2024-12-17 16:39:07 -08:00
bf8717ebae [V1] Prefix caching for vision language models (#11187)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2024-12-17 16:37:59 -08:00
c77eb8a33c [Bugfix] Set temperature=0.7 in test_guided_choice_chat (#11264) 2024-12-17 16:34:06 -08:00
2d1b9baa8f [Bugfix] Fix request cancellation without polling (#11190) 2024-12-17 12:26:32 -08:00
f9ecbb18bf [Misc] Allow passing logits_soft_cap for xformers backend (#11252)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-17 00:37:04 -08:00
02222a0256 [Misc] Kernel Benchmark for RMSNorm (#11241)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Xiaoyu Zhang <BBuf@users.noreply.github.com>
2024-12-17 06:57:02 +00:00
2bfdbf2a36 [V1][Core] Use weakref.finalize instead of atexit (#11242)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-16 22:11:33 -08:00
e88db68cf5 [Platform] platform agnostic for EngineArgs initialization (#11225)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-12-16 22:11:06 -08:00
59c9b6ebeb [V1][VLM] Proper memory profiling for image language models (#11210)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: ywang96 <ywang@example.com>
2024-12-16 22:10:57 -08:00
66d4b16724 [Frontend] Add OpenAI API support for input_audio (#11027)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-16 22:09:58 -08:00
0064f697d3 [CI] Add test case with JSON schema using references + use xgrammar by default with OpenAI parse (#10935)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-17 11:39:58 +08:00
35bae114a8 fix gh200 tests on main (#11246)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-16 17:22:38 -08:00
88a412ed3d [torch.compile] fast inductor (#11108)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-16 16:15:22 -08:00
c301616ed2 [ci][tests] add gh200 tests (#11244)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-16 15:53:18 -08:00
35ffa682b1 [Docs] hint to enable use of GPU performance counters in profiling tools for multi-node distributed serving (#11235)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-12-16 22:20:39 +00:00
551603feff [core] overhaul memory profiling and fix backward compatibility (#10511)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-16 13:32:25 -08:00
efbce85f4d [misc] Layerwise profile updates (#10242)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-12-16 18:14:57 +00:00
2ca830dbaa [Doc] Reorder vision language examples in alphabet order (#11228)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-16 11:23:33 +00:00
d927dbcd88 [Model] Refactor Ultravox to use merged input processor (#11198)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-16 10:09:53 +00:00
bddbbcb132 [Model] Support Cohere2ForCausalLM (Cohere R7B) (#11203) 2024-12-16 09:56:19 +00:00
b3b1526f03 WIP: [CI/Build] simplify Dockerfile build for ARM64 / GH200 (#11212)
Signed-off-by: drikster80 <ed.sealing@gmail.com>
Co-authored-by: drikster80 <ed.sealing@gmail.com>
2024-12-16 09:20:49 +00:00
17138af7c4 [Bugfix] Fix the default value for temperature in ChatCompletionRequest (#11219) 2024-12-16 00:15:40 -08:00
69ba344de8 [Bugfix] Fix block size validation (#10938) 2024-12-15 16:38:40 -08:00
da6f409246 Update deploying_with_k8s.rst (#10922) 2024-12-15 16:33:58 -08:00
25ebed2f8c [V1][Minor] Cache np arange to reduce input preparation overhead (#11214)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-15 13:33:00 -08:00
d263bd9df7 [Core] Support disaggregated prefill with Mooncake Transfer Engine (#10884)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2024-12-15 21:28:18 +00:00
38e599d6a8 [Doc] add documentation for disaggregated prefilling (#11197)
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2024-12-15 13:31:16 -06:00
96d673e0f8 [Bugfix] Fix error handling of unsupported sliding window (#11213)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-15 10:59:42 -07:00
b10609e6a1 [Misc] Clean up multi-modal processor (#11207)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-15 06:30:28 +00:00
a1c02058ba [torch.compile] allow tracking forward time (#11081)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-14 19:45:00 -08:00
15859f2357 [Misc] Upgrade bitsandbytes to the latest version 0.45.0 (#11201) 2024-12-15 03:03:06 +00:00
886936837c [Performance][Core] Optimize the performance of evictor v1 and v2 by applying a priority queue and lazy deletion (#7209) 2024-12-14 11:38:10 -08:00
6d917d0eeb Enable mypy checking on V1 code (#11105)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2024-12-14 09:54:04 -08:00
93abf23a64 [VLM] Fully dynamic prompt replacement in merged input processor (#11199)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-14 17:52:18 +00:00
9c3dadd1c9 [Frontend] Add logits_processors as an extra completion argument (#11150)
Signed-off-by: Brad Hilton <brad.hilton.nw@gmail.com>
2024-12-14 16:46:42 +00:00
3cb5769883 [Misc] Minor improvements to the readability of PunicaWrapperBase (#11200)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-14 16:38:27 +00:00
ea7bd68d10 [V1][Bugfix] Fix V1 TP trust-remote-code (#11182)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-14 08:21:23 +00:00
48259264a4 [Core] Update outlines and increase its threadpool size (#11140)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-14 07:46:18 +00:00
24a3d12b82 update compressed-tensors to latest version (#11183)
Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>
2024-12-14 03:22:44 +00:00
9855aea21b [Bugfix][V1] Re-compute an entire block when fully cache hit (#11186)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2024-12-13 17:08:23 -08:00
4b5b8a6a3b [V1][Bugfix] Fix EngineCoreProc profile (#11185)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-13 17:02:35 -08:00
4863e5fba5 [Core] V1: Use multiprocessing by default (#11074)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-13 16:27:32 -08:00
0d8451c3a4 [Distributed] Allow the placement group more time to wait for resources to be ready (#11138)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
2024-12-13 20:17:37 +00:00
0a56bcc03d [Bugfix][Hardware][CPU] Enable Gemma2 with SDPA on CPU backend (#11169) 2024-12-13 18:00:40 +00:00
0920ab9131 [Doc] Reorganize online pooling APIs (#11172)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-14 00:22:22 +08:00
238c0d93b4 [Misc] Add tokenizer_mode param to benchmark_serving.py (#11174)
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com>
2024-12-13 16:19:10 +00:00
5b0ed8391d [Bugfix] using len(tokenizer) instead of tokenizer.vocab_size in AllowedTokenIdsLogitsProcessor (#11156) 2024-12-13 15:56:19 +00:00
c31d4a57a6 [Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching (#8240) 2024-12-13 07:51:25 -08:00
d1fa714cb1 [Refactor]A simple device-related refactor (#11163)
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: noemotiovon <noemotiovon@gmail.com>
2024-12-13 13:39:00 +00:00
969da7d70b [V1][VLM] Fix edge case bug for InternVL2 (#11165)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-13 11:09:30 +00:00
eeec9e3390 [Frontend] Separate pooling APIs in offline inference (#11129)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-13 10:40:07 +00:00
f93bf2b189 [Bugfix][CI][CPU] add missing datasets package to requirements-cpu.txt (#11159)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2024-12-13 08:50:35 +00:00
7cd7409142 PaliGemma 2 support (#11142) 2024-12-13 07:40:07 +00:00
be39e3cd18 [core] clean up cudagraph batchsize padding logic (#10996)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-13 06:57:50 +00:00
34f1a806d5 [Bugfix][V1] Fix 'NoneType' object has no attribute 'hash_value' (#11157)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2024-12-13 06:30:06 +00:00
00c1bde5d8 [ROCm][AMD] Disable auto enabling chunked prefill on ROCm (#11146)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2024-12-13 05:31:26 +00:00
3989a79824 [Bugfix] Update starcoder2 to remap k/v scale names for kv_cache quantization (#11148) 2024-12-13 05:07:20 +00:00
1efce68605 [Bugfix] Use runner_type instead of task in GritLM (#11144)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2024-12-13 04:09:53 +00:00
30870b4f66 [torch.compile] Dynamic fp8 + rms_norm fusion (#10906)
Signed-off-by: luka <luka@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-12-13 03:19:23 +00:00
78ed8f57d8 [Misc][V1] Fix type in v1 prefix caching (#11151) 2024-12-13 00:57:40 +00:00
db6c264a1e [Bugfix] Fix value unpack error of simple connector for KVCache transfer. (#11058)
Signed-off-by: ShangmingCai <csmthu@gmail.com>
2024-12-12 21:19:17 +00:00
9f3974a319 Fix logging of the vLLM Config (#11143) 2024-12-12 12:05:57 -08:00
2c97eca1ff [Misc] Validate grammar and fail early (#11119) 2024-12-12 18:34:26 +00:00
5d712571af [Bugfix] Quick fix to make Pixtral-HF load correctly again after 39e227c7ae. (#11024) 2024-12-12 18:09:20 +00:00
d4d5291cc2 fix(docs): typo in helm install instructions (#11141)
Signed-off-by: Ramon Ziai <ramon.ziai@bettermarks.com>
2024-12-12 17:36:32 +00:00
4816d20aa4 [V1] Fix torch profiling for offline inference (#11125)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-12 15:51:53 +00:00
85362f028c [Misc][LoRA] Ensure Lora Adapter requests return adapter name (#11094)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-12 09:25:16 +00:00
62de37a38e [core][distributed] initialization from StatelessProcessGroup (#10986)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-12 09:04:19 +00:00
8195824206 [Hardware][Intel-Gaudi] Enable LoRA support for Intel Gaudi (HPU) (#10565)
Signed-off-by: Sanju C Sudhakaran <scsudhakaran@habana.ai>
2024-12-12 08:09:28 +00:00
f092153fbe [V1] Use more persistent buffers to optimize input preparation overheads (#11111)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-11 23:14:20 -08:00
1da8f0e1dd [Model] Add support for embedding model GritLM (#10816)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2024-12-12 06:39:16 +00:00
ccede2b264 [Core] cleanup zmq ipc sockets on exit (#11115)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-11 19:12:24 -08:00
24a36d6d5f Update link to LlamaStack remote vLLM guide in serving_with_llamastack.rst (#11112)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2024-12-12 02:39:21 +00:00
8fb26dac61 [Docs] Add media kit (#11121) 2024-12-11 17:33:11 -08:00
7439a8b5fc [Bugfix] Multiple fixes to tool streaming with hermes and mistral (#10979)
Signed-off-by: cedonley <clayton@donley.io>
2024-12-12 01:10:12 +00:00
4e11683368 [V1] VLM preprocessor hashing (#11020)
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Alexander Matveev <alexm@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-12-12 00:55:30 +00:00
452a723bf2 [V1][Core] Remove should_shutdown to simplify core process termination (#11113)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-11 23:34:54 +00:00
d1e21a979b [CI/Build] Split up VLM tests (#11083)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-12 06:18:16 +08:00
72ff3a9686 [core] Bump ray to use _overlap_gpu_communication in compiled graph tests (#10410)
Signed-off-by: Rui Qiao <ubuntu@ip-172-31-15-128.us-west-2.compute.internal>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Rui Qiao <ubuntu@ip-172-31-15-128.us-west-2.compute.internal>
2024-12-11 11:36:35 -08:00
66aaa7722d [torch.compile] remove graph logging in ci (#11110)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-11 10:59:50 -08:00
d643c2aba1 [V1] Use input_ids as input for text-only models (#11032)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-11 10:49:23 -08:00
91642db952 [torch.compile] use depyf to dump torch.compile internals (#10972)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-11 10:43:05 -08:00
fd22220687 [Doc] Installed version of llmcompressor for int8/fp8 quantization (#11103)
Signed-off-by: Guangda Liu <bingps@users.noreply.github.com>
Co-authored-by: Guangda Liu <bingps@users.noreply.github.com>
2024-12-11 15:43:24 +00:00
b2f775456e [CI/Build] Enable prefix caching test for AMD (#11098)
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
2024-12-11 15:23:37 +00:00
cad5c0a6ed [Doc] Update docs to refer to pooling models (#11093)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-11 13:36:27 +00:00
8f10d5e393 [Misc] Split up pooling tasks (#10820)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-11 01:28:00 -08:00
40766ca1b8 [Bugfix]: Clamp -inf logprob values in prompt_logprobs (#11073)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-12-11 01:27:39 -08:00
2e32f5d28d [Bugfix] Fix Idefics3 fails during multi-image inference (#11080)
Signed-off-by: B-201 <Joy25810@foxmail.com>
2024-12-11 01:27:07 -08:00
61b1d2f6ae [Core] v1: Use atexit to handle engine core client shutdown (#11076)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-11 01:26:36 -08:00
9974fca047 [ci/build] Fix entrypoints test and pin outlines version (#11088) 2024-12-11 01:01:53 -08:00
3fb4b4f163 [ci/build] Fix AMD CI dependencies (#11087) 2024-12-11 00:39:53 -08:00
2e33fe4191 [CI/Build] Check transformers v4.47 (#10991)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-11 05:02:02 +00:00
e39400a4b6 Fix streaming for granite tool call when <|tool_call|> is present (#11069)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2024-12-11 04:51:40 +00:00
ffa48c9146 [Model] PP support for Mamba-like models (#10992)
Signed-off-by: mzusman <mor.zusmann@gmail.com>
2024-12-10 21:53:37 -05:00
d5c5154fcf [Misc] LoRA + Chunked Prefill (#9057) 2024-12-11 10:09:20 +08:00
9a93973708 [Bugfix] Fix Mamba multistep (#11071)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-11 00:16:22 +00:00
134810b3d9 [V1][Bugfix] Always set enable_chunked_prefill = True for V1 (#11061)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-10 14:41:23 -08:00
75f89dc44c [torch.compile] add a flag to track batchsize statistics (#11059)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-10 12:40:52 -08:00
e739194926 [Core] Update to outlines >= 0.1.8 (#10576)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-10 12:08:16 -08:00
250ee65d72 [BUG] Remove token param #10921 (#11022)
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
2024-12-10 17:38:15 +00:00
9b9cef3145 [Bugfix] Backport request id validation to v0 (#11036)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-12-10 16:38:23 +00:00
d05f88679b [Misc][LoRA] Add PEFTHelper for LoRA (#11003)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-10 11:12:01 +00:00
beb16b2c81 [Bugfix] Handle <|tool_call|> token in granite tool parser (#11039)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-12-10 10:27:11 +00:00
fe2e10c71b Add example of helm chart for vllm deployment on k8s (#9199)
Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com>
2024-12-10 09:19:27 +00:00
82c73fd510 [Bugfix] cuda error running llama 3.2 (#11047) 2024-12-10 07:41:11 +00:00
bfd610430c Update README.md (#11034) 2024-12-09 23:08:10 -08:00
e35879c276 [Bugfix] Fix xgrammar failing to read a vocab_size from LlavaConfig on PixtralHF. (#11043) 2024-12-10 14:54:22 +08:00
ebf778061d monitor metrics of tokens per step using cudagraph batchsizes (#11031)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-09 22:35:36 -08:00
28b3a1c7e5 [V1] Multiprocessing Tensor Parallel Support for v1 (#9856)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-10 06:28:14 +00:00
bc192a2b09 [Pixtral] Improve loading (#11040) 2024-12-10 06:09:32 +00:00
980ad394a8 [Frontend] Use request id from header (#10968)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-12-10 13:46:29 +08:00
391d7b2763 [Bugfix] Fix usage of deprecated decorator (#11025)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-10 13:45:47 +08:00
d1f6d1c8af [Model] Add has_weight to RMSNorm and re-enable weights loading tracker for Mamba (#10739)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-10 10:23:07 +08:00
6d525288c1 [Docs] Add dedicated tool calling page to docs (#10554)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-09 20:15:34 -05:00
6faec54505 [V1] Do not store None in self.generators (#11038)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-09 15:08:19 -08:00
5ed5d5f128 Build tpu image in release pipeline (#10936)
Signed-off-by: Richard Liu <ricliu@google.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
2024-12-09 23:07:48 +00:00
b63ba84832 [ROCm][bugfix] speculative decoding worker class (#11035)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2024-12-09 14:00:29 -08:00
9c6459e4cb [Neuron] Upgrade neuron to 2.20.2 (#11016)
Signed-off-by: Jerzy Zagorski <jzagorsk@amazon.com>
Co-authored-by: Jerzy Zagorski <jzagorsk@amazon.com>
2024-12-09 13:53:24 -08:00
1a2f8fb828 [v1] fix use compile sizes (#11000)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-09 13:47:24 -08:00
cbcbdb1ceb [Bugfix][Hardware][Gaudi] Bump vllm_hpu_extension version (#11028)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2024-12-09 13:21:06 -08:00
a811dd6608 [Model] merged input processor for Phi-3-Vision models (#10977)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-09 12:55:10 -08:00
ca871491ed [Misc][LoRA] Abstract PunicaWrapper (#10955)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-09 12:54:44 -08:00
3b61cb450d [V1] Further reduce CPU overheads in flash-attn (#10989)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-09 12:38:46 -08:00
edc4fa3188 [ci/build] Recompile CI dependencies list with Python 3.12 (#11013)
Signed-off-by: kevin <kevin@anyscale.com>
2024-12-09 11:46:58 -08:00
25b79d9fd3 [V1] Input Batch Relocation (#10962)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-12-09 09:33:41 -08:00
aea2fc38c3 [Platform] Move async output check to platform (#10768)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-12-09 17:24:46 +00:00
e691b26f6f [Core] Require xgrammar >= 0.1.6 (#11021)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-09 16:44:27 +00:00
c690357928 [V1] Fix Detokenizer loading in AsyncLLM (#10997)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-09 16:27:10 +00:00
d1c2e15eb3 [torch.compile] add dynamo time tracking (#11005)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-08 23:09:04 -08:00
af7c4a92e6 [Doc][V1] Add V1 support column for multimodal models (#10998)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-08 22:29:16 -08:00
46004e83a2 [misc] clean up and unify logging (#10999)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-08 17:28:27 -08:00
43b05fa314 [torch.compile][misc] fix comments (#10993)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-08 11:18:18 -08:00
a11f326528 [V1] Initial support of multimodal models for V1 re-arch (#10699)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-12-08 12:50:51 +00:00
fd57d2b534 [torch.compile] allow candidate compile sizes (#10984)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-08 11:05:21 +00:00
7be15d9356 [core][misc] remove use_dummy driver for _run_workers (#10920)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-07 12:06:08 -08:00
1b62745b1d [core][executor] simplify instance id (#10976)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-07 09:33:45 -08:00
78029b34ed [BugFix][Kernel]: fix illegal memory access in causal_conv1d when conv_states is None (#10928)
Signed-off-by: xffxff <1247714429@qq.com>
2024-12-08 01:21:18 +08:00
c889d5888b [Doc] Explicitly state that PP isn't compatible with speculative decoding yet (#10975)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-07 17:20:49 +00:00
39e227c7ae [Model] Update multi-modal processor to support Mantis(LLaVA) model (#10711)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-07 17:10:05 +00:00
1c768fe537 [Doc] Explicitly state that InternVL 2.5 is supported (#10978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-07 16:58:02 +00:00
bf0e382e16 [Model] Composite weight loading for multimodal Qwen2 (#10944)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-07 07:22:52 -07:00
b26b4cd03c [Misc][LoRA] Refactor and clean MergedQKVParallelLinearWithLora implementation (#10958)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-07 18:33:49 +08:00
f13cf9ad50 [Build] Fix for the Wswitch-bool clang warning (#10060)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2024-12-07 09:03:44 +00:00
955fa9533a [3/N] Support and implement merged input processor for LLaVA model (#10676)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-12-07 00:50:58 -08:00
acf092d348 [Bugfix] Fix test-pipeline.yaml (#10973)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-07 12:08:54 +08:00
69d357ba12 [Core] Cleanup startup logging a bit (#10961)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-12-07 02:30:23 +00:00
dcdc3fafe5 [ci] fix broken tests (#10956)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-06 11:25:47 -08:00
c05cfb67da [misc] fix typo (#10960)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-06 11:25:20 -08:00
7406274041 [Doc] add KubeAI to serving integrations (#10837)
Signed-off-by: Sam Stoelinga <sammiestoel@gmail.com>
2024-12-06 17:03:56 +00:00
8b59631855 [Core] Support Lark grammars for XGrammar (#10870)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-06 08:34:29 -07:00
a1887f2c96 [torch.compile] fix deprecated code (#10948)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-06 11:01:23 +00:00
222f5b082a [CI/Build] Fix broken multimodal test (#10950) 2024-12-06 10:41:23 +00:00
b031a455a9 [torch.compile] add logging for compilation time (#10941)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-06 10:07:15 +00:00
db87eb6c67 [torch.compile] use size tuning for specific sizes (#10933)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-05 20:30:41 -08:00
9743d64e4e [ci][build] add tests for python only compilation (#10915)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-05 08:54:47 -08:00
a43065272f [Misc][Gaudi] Avoid torch.compile and enable lazy collectives (#10897)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
2024-12-05 08:47:46 -08:00
998eeafe58 [CI/Build] Bump test transformers version (#10106)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-05 16:05:52 +00:00
571da8fc43 [Misc][LoRA] Clean up the function interface of Punica (#10917)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-05 13:22:28 +00:00
39c89e71a8 [Misc] Update llama 3.2 template to support system prompt with images (#10901)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-12-05 05:54:06 +00:00
1f958a7d52 [Bugfix] Fix BNB loader target_modules (#10720)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-05 13:20:26 +08:00
aa39a8e175 [Doc] Create a new "Usage" section (#10827)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-05 11:19:35 +08:00
8d370e91cb [Bugfix] Fallback to outlines for complex json schemas (#10899)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-05 11:14:06 +08:00
7883c2bbe7 [benchmark] Make H100 benchmark optional (#10908) 2024-12-04 17:02:17 -08:00
2a56e1264f [V1] Fix when max_model_len is not divisible by block_size (#10903)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-04 16:54:05 -08:00
e4c34c23de [CI/Build] improve python-only dev setup (#9621)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2024-12-04 21:48:13 +00:00
82eb5ea8f3 Benchmark serving structured output (#10880)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-12-04 16:28:21 -05:00
10398b4706 [Model] Consolidate ViTs attention implementation without mask (#10893)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-04 18:11:08 +00:00
01d079fd8e [LoRA] Change lora_tokenizers capacity (#10796)
Signed-off-by: Xin Yang <xyang19@gmail.com>
2024-12-04 17:40:16 +00:00
c92acb9693 [ci/build] Update vLLM postmerge ECR repo (#10887) 2024-12-04 09:01:20 +00:00
8db957ee3a [bugfix] fix parameter “n” when parameter “best_of” > 1 is set (#10854)
Signed-off-by: jianzheng <57654625+o2363286@users.noreply.github.com>
2024-12-04 08:48:22 +00:00
c9ca4fce3f [ci/build] Job to build and push release image (#10877) 2024-12-04 15:02:40 +08:00
fa2dea61df [ci/build] Change queue name for Release jobs (#10875) 2024-12-04 15:02:16 +08:00
b5b647b084 Drop ROCm load format check (#10767)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-12-04 04:32:21 +00:00
d2bd88b122 [CI/Build] Replace mean with torch.all in test_pynccl.py (#10876)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-12-04 03:23:21 +00:00
381ac93bb5 [Benchmark] Benchmark structured output with datasets (#10557)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2024-12-03 17:21:06 -07:00
a061fe601e [Build][Bugfix] Using the correct type hint (#10866)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2024-12-03 15:47:55 -05:00
7c32b6861e [Frontend] correctly record prefill and decode time metrics (#10853)
Signed-off-by: Tomer Asida <tomera@ai21.com>
2024-12-03 19:13:31 +00:00
7090c27bb2 [Bugfix] Only require XGrammar on x86 (#10865)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-12-03 10:32:21 -08:00
2f2cdc745a [MISC][XPU] quick fix for XPU CI (#10859)
Signed-off-by: yan ma <yan.ma@intel.com>
2024-12-03 17:16:31 +00:00
3bc94cab69 [V1] VLM - Run the mm_mapper preprocessor in the frontend process (#10640)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-12-03 10:33:10 +00:00
f6084f6324 [Speculative Decoding] Move indices to device before filtering output (#10850)
Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com>
2024-12-03 17:01:39 +08:00
9323a3153b [Core][Performance] Add XGrammar support for guided decoding and set it as default (#10785)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-12-03 15:17:00 +08:00
3257d449fa [Misc] Remove deprecated names (#10817)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-03 06:52:57 +00:00
ef51831ee8 [Doc] Add github links for source code references (#10672)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-03 06:46:07 +00:00
dc5ce861bf [torch.compile] remove compilation_context and simplify code (#10838)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-03 06:19:02 +00:00
21fe7b481a [core][distributed] add pynccl broadcast (#10843)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-03 04:53:23 +00:00
a4cf256159 [Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-03 12:10:29 +08:00
d746268e92 [Model] support bitsandbytes quantization with minicpm model (#10842)
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com>
2024-12-03 03:06:41 +00:00
4433195ab7 [Bugfix] Prevent benchmark_throughput.py from using duplicated random prompts (#10753) 2024-12-03 02:26:15 +00:00
4c05edb33a [Model] Add TP and BNB quantization support to LlavaMultiModalProjector (#10834)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-12-02 23:06:09 +00:00
9b14d978aa Fix openvino on GPU (#10793) 2024-12-02 18:52:19 +00:00
519cc6ca12 [Misc][XPU] Avoid torch compile for XPU platform (#10747)
Signed-off-by: yan ma <yan.ma@intel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2024-12-02 17:53:55 +00:00
b45f0d7946 [Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-12-02 17:53:36 +00:00
a4c4daf364 [misc] use out argument for flash attention (#10822)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-02 10:50:10 +00:00
e95f275f57 [CI/Build] Update mistral_common version for tests and docs (#10825)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-02 10:26:10 +00:00
ef31eabc68 [Model]: add some tests for aria model (#10770)
Signed-off-by: xffxff <1247714429@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2024-12-02 05:36:36 +00:00
995a148575 [doc]Update config docstring (#10732)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-12-02 04:14:45 +00:00
63a164172d [misc] remove xverse modeling file (#10814)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-02 03:27:13 +00:00
e25810ae29 Fill TorchSDPAAttentionMetadata seq_lens_field for prefill (#10799)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2024-12-02 10:05:32 +08:00
073a4bd1c0 [Kernel] Use out arg in flash_attn_varlen_func (#10811)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-01 17:55:39 -08:00
b7954776fd [core] Avoid metrics log noise when idle - include speculative decodi… (#10809) 2024-12-02 01:49:48 +00:00
b18c9bbaba [Model] Add BNB support to Llava and Pixtral-HF (#10795)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-12-02 01:31:09 +00:00
0590ec3fd9 [Core] Implement disagg prefill by StatelessProcessGroup (#10502)
This PR provides initial support for single-node disaggregated prefill in 1P1D scenario.
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Co-authored-by: ApostaC <yihua98@uchicago.edu>
Co-authored-by: YaoJiayi <120040070@link.cuhk.edu.cn>
2024-12-01 19:01:00 -06:00
c11f172187 [Misc] Adding MMMU-Pro vision dataset to serving benchmark (#10804)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2024-12-01 08:47:05 +00:00
169a0ff911 [doc] add warning about comparing hf and vllm outputs (#10805)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-12-01 00:41:38 -08:00
d2f058e76c [Misc] Rename embedding classes to pooling (#10801)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-01 14:36:51 +08:00
f877a7d12a [Misc] Improve type annotations for support_torch_compile (#10763)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-30 17:48:35 -08:00
133707123e [Model] Replace embedding models with pooling adapter (#10769)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-12-01 08:02:54 +08:00
7e4bbda573 [doc] format fix (#10789)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-11-30 11:38:40 +00:00
e7cfc4ef4c [Interleaved ATTN] Support for Mistral-8B (#10591)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2024-11-30 07:45:50 +00:00
16ee07f22a [Model] Refactor Molmo weights loading to use AutoWeightsLoader (#10771)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-30 04:19:14 +00:00
40bc242579 [Bugfix] Fix OpenVino/Neuron driver_worker init (#10779)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-11-30 12:07:13 +08:00
661175bc82 [platform] Add verify_quantization in platform. (#10757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2024-11-29 15:22:21 +00:00
3132aac043 [Bugfix] Fix Idefics3 bug (#10778)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-29 13:56:46 +00:00
c82b432d4a [Misc] typo find in sampling_metadata.py (#10740) 2024-11-29 05:17:57 +00:00
fa6ecb9aa7 [Model] Clean up MiniCPMV (#10751)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-29 04:47:06 +00:00
c83919c7a6 [Model] Add Internlm2 LoRA support (#5064)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-28 17:29:04 +00:00
98f47f2a40 [V1] Optimize the CPU overheads in FlashAttention custom op (#10733)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-28 09:01:02 -08:00
8c1e77fb58 [Kernel] Update vllm-flash-attn version to reduce CPU overheads (#10742)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-28 08:31:28 -08:00
5fc5ce0fe4 [Model] Added GLM-4 series hf format model support vllm==0.6.4 (#10561)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-11-28 14:53:31 +00:00
3ed5e73146 [TPU] Update requirements-tpu (#10726)
Signed-off-by: Richard Liu <ricliu@google.com>
2024-11-28 02:30:48 -08:00
9a8bff0285 [Kernel] Update vllm-flash-attn version (#10736)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-28 02:25:59 -08:00
a79b122400 [V1] Do not allocate beyond the max_model_len (#10730)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-28 00:13:15 -08:00
d9b4b3f069 [Bug][CLI] Allow users to disable prefix caching explicitly (#10724)
Signed-off-by: rickyx <rickyx@anyscale.com>
2024-11-27 23:59:28 -08:00
278be671a3 [Doc] Update model in arch_overview.rst to match comment (#10701)
Signed-off-by: spacewander <spacewanderlzx@gmail.com>
2024-11-27 23:58:39 -08:00
70dc14fbd0 [Model] support bitsandbytes quantization with minicpm3 model (#10682)
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com>
2024-11-27 23:58:02 -08:00
cb4e1c3f3a [misc] upgrade filelock version (#10731)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-27 19:54:58 -08:00
395b1c7454 [Frontend] don't block event loop in tokenization (preprocess) in OpenAI compatible server (#10635)
Signed-off-by: Tomer Asida <tomera@ai21.com>
2024-11-27 13:21:10 -08:00
9b4b150395 [Bugfix] Ignore lm_head when loading embedding models (#10719)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-27 19:05:29 +00:00
197b4484a3 [Bugfix][Mamba] Fix Multistep on Mamba-like models (#10705)
Signed-off-by: mzusman <mor.zusmann@gmail.com>
2024-11-27 19:02:27 +00:00
b98c62ba49 [Bugfix] Fix GGUF inference with FP16 unquantized checkpoint (#10675)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-27 10:43:17 -08:00
c411def234 [torch.compile] fix shape specialization (#10722)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-27 10:16:10 -08:00
308cc5e21e [ci] fix slow tests (#10698)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-27 09:26:14 -08:00
9e0a147d50 [V1] Update interface for mistral-format Pixtral (#10703)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-11-27 12:26:27 +00:00
418cb3b93f [Bugfix][Hardware][CPU] Fix intel-omp version to avoid segfault (#10700)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2024-11-27 11:55:38 +00:00
1209261e93 [Model] Support telechat2 (#10311)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: xiangw2 <xiangw2@chinatelecom.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
2024-11-27 11:32:35 +00:00
e2251109c7 [Kernel] Remove if-else with identical branches in marlin 2:4 (#10687)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2024-11-26 22:55:32 -08:00
15cc2a9f1a [Misc]Further reduce BNB static variable (#10597)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-26 22:54:12 -08:00
e85250b1d1 [Hardware][Gaudi]add get_name method for HPUAttentionBackend (#10667)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2024-11-26 22:49:40 -08:00
cfb3bf25fb [bugfix] fix the default value of llm_int8_threshold in BitsAndBytesConfig (#10657) 2024-11-27 13:55:23 +08:00
1bf905ddaa [Bugfix][SpecDecode] apply sampling parameters to target probabilities for consistency in rejection sampling. (#10198)
Signed-off-by: jeongin601 <0200angela@gmail.com>
Signed-off-by: jeong_in.bae <jeong_in.bae@navercorp.com>
2024-11-27 05:07:30 +00:00
0a4d968500 [V1] Update interface for idefics3 (#10680)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-11-27 10:04:01 +08:00
0a71900bc9 Remove hard-dependencies of Speculative decode to CUDA workers (#10587)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2024-11-26 17:57:11 -08:00
2f0a0a17a4 [V1] Refactor model executable interface for multimodal models (#10570)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-11-26 20:46:11 +00:00
7576cd38df [Bugfix] Check bnb_4bit_quant_storage for bitsandbytes (#10642) 2024-11-26 12:29:00 -08:00
9a99273b48 [Bugfix] Fix using -O[0,3] with LLM entrypoint (#10677)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-11-26 10:44:01 -08:00
f5792c7c4a [Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
2024-11-26 10:26:28 -08:00
db66e018ea [Bugfix] Fix for Spec model TP + Chunked Prefill (#10232)
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>
Signed-off-by: Sourashis Roy <sroy@roblox.com>
Co-authored-by: Sourashis Roy <sroy@roblox.com>
2024-11-26 09:11:16 -08:00
1f6584ee85 [V1] Enable profile for LLMEngine (#10665) 2024-11-26 10:36:45 +00:00
334d64d1e8 [ci] add vllm_test_utils (#10659)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-26 00:20:04 -08:00
940635343a [Misc] Remove outdated init protocols (#10655)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-26 14:55:00 +08:00
9a88f89799 custom allreduce + torch.compile (#10121)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2024-11-25 22:00:16 -08:00
519e8e4182 [v1] EngineArgs for better config handling for v1 (#10382)
Signed-off-by: rickyx <rickyx@anyscale.com>
2024-11-25 21:09:43 -08:00
a6760f6456 [Feature] vLLM ARM Enablement for AARCH64 CPUs (#9228)
Signed-off-by: Sanket Kale <sanketk.kale@fujitsu.com>
Co-authored-by: Sanket Kale <sanketk.kale@fujitsu.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-11-25 18:32:39 -08:00
45ac4ff270 [bugfix] fix aria model and add torch.compile (#10645)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-25 18:32:09 -08:00
6e9ff050c8 [misc] do not read HOST_IP (#10644)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-25 17:04:50 -08:00
9db713a1dc [Model] Add OLMo November 2024 model (#10503) 2024-11-25 17:26:40 -05:00
1b583cfefa [Doc] Fix typos in docs (#10636)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-25 10:15:45 -08:00
cf73f0c95e [Model] Enable optional prefix when loading embedding models (#10639)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-25 18:14:33 +00:00
b1d920531f [Model]: Add support for Aria model (#10514)
Signed-off-by: xffxff <1247714429@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2024-11-25 18:10:55 +00:00
452a4e80c3 [Docs] Add Snowflake Slides (#10641)
Signed-off-by: simon-mo <simon.mo@hey.com>
2024-11-25 09:34:46 -08:00
c27df94e1f [Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-11-25 12:23:32 -05:00
d04b13a380 [Bug]: Authorization ignored when root_path is set (#10606)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2024-11-25 16:21:41 +00:00
2b0879bfc2 Super tiny little typo fix (#10633) 2024-11-25 13:08:30 +00:00
ed46f14321 [Model] Support is_causal HF config field for Qwen2 model (#10621)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-25 09:51:20 +00:00
05d1f8c9c6 [misc] move functions to config.py (#10624)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-25 09:27:30 +00:00
25d806e953 [misc] add torch.compile compatibility check (#10618)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-24 23:40:08 -08:00
65813781a2 [torch.compile] add warning for unsupported models (#10622)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-24 23:27:51 -08:00
7c2134beda [torch.compile] force inductor threads (#10620)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-24 23:04:21 -08:00
a30a605d21 [Doc] Add encoder-based models to Supported Models page (#10616)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-25 06:34:07 +00:00
571841b7fc [torch.compile] support encoder based models (#10613)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-25 05:24:33 +00:00
7ea3cd7c3e [Refactor][MISC] del redundant code in ParallelConfig.postinit (#10614)
Signed-off-by: MengqingCao <cmq0113@163.com>
2024-11-25 05:14:56 +00:00
214efc2c3c Support Cross encoder models (#10400)
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
Co-authored-by: Flavia Beo <flavia.beo@ibm.com>
2024-11-24 18:56:20 -08:00
49628fe13e [Doc] Update README.md with Ray Summit talk links (#10610) 2024-11-24 16:45:09 -08:00
e4fbb14414 [doc] update the code to add models (#10603)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-11-24 11:21:40 -08:00
c055747867 [model][utils] add extract_layer_index utility function (#10599)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-23 22:22:54 -08:00
eda2b3589c Revert "Print running script to enhance CI log readability" (#10601) 2024-11-23 21:31:47 -08:00
1c445dca51 [CI/Build] Print running script to enhance CI log readability (#10594)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-24 03:57:13 +00:00
1700c543a5 [Bugfix] Fix LoRA weight sharding (#10450)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-11-23 17:23:17 -08:00
17d8fc1806 [bugfix] Fix example/tensorize_vllm_model tests (#10595)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-23 17:22:33 -08:00
04668ebe7a [Bugfix] Avoid import AttentionMetadata explicitly in Mllama (#10593)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-23 18:12:20 +00:00
651f6c31ac For ppc64le, disabled tests for now and addressed space issues (#10538) 2024-11-23 09:33:53 +00:00
86a44fb896 [Platforms] Refactor openvino code (#10573)
Signed-off-by: statelesshz <hzji210@gmail.com>
2024-11-22 22:23:12 -08:00
4cfe5d2bca [Bugfix] multi_modal_kwargs broadcast for CPU tensor parallel (#10541)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-22 21:25:46 -08:00
c8acd80548 [2/N] handling placeholders in merged multi-modal processor (#10485)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-22 21:25:09 -08:00
4634a89d18 Prefix Cache Aware Scheduling [1/n] (#10128)
Signed-off-by: rickyx <rickyx@anyscale.com>
2024-11-22 21:15:55 -08:00
7c25fe45a6 [AMD] Add support for GGUF quantization on ROCm (#10254) 2024-11-22 21:14:49 -08:00
02a43f82a9 Update default max_num_batch_tokens for chunked prefill to 2048 (#10544) 2024-11-22 21:14:19 -08:00
cfea9c04ef [Model] Fix Baichuan BNB online quantization (#10572)
Signed-off-by: Chen Wu <cntryroa@gmail.com>
2024-11-22 21:13:59 -08:00
7d8ffb344f [Bugfix] Internal Server Error when tool_choice is incorrect. (#10567)
Signed-off-by: Varun Shenoy <varun.vinayak.shenoy@oracle.com>
2024-11-22 21:13:29 -08:00
4aba6e3d1a [core] gemma2 full context length support (#10584)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-22 20:13:54 -08:00
978b39744b [Misc] Add pynccl wrappers for all_gather and reduce_scatter (#9432) 2024-11-22 22:14:03 -05:00
ebda51968b [Core] Fix broken log configuration (#10458)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-23 10:23:51 +08:00
9195dbdbca [Bugfix][Frontend] Update Llama Chat Templates to also support Non-Tool use (#10164)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-11-23 10:17:38 +08:00
d559979c54 [bugfix] fix cpu tests (#10585)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-22 17:34:03 -08:00
d345f409b7 [V1] EngineCore supports profiling (#10564)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2024-11-22 17:16:15 -08:00
28598f3939 [Core] remove temporary local variables in LLMEngine.__init__ (#10577)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-22 16:22:53 -08:00
948c859571 support bitsandbytes quantization with qwen model (#10549)
Signed-off-by: Ubuntu <zixuanzhang@bytedance.com>
2024-11-22 16:16:14 -08:00
97814fbf0f [v1] Refactor KVCacheManager for more hash input than token ids (#10507)
Signed-off-by: rickyx <rickyx@anyscale.com>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-11-22 23:27:25 +00:00
eebad39f26 [torch.compile] support all attention backends (#10558)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-22 14:04:42 -08:00
db100c5cde [bugfix] fix full graph tests (#10581)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-22 10:02:14 -08:00
11fcf0e066 Remove token-adding chat embedding params (#10551)
Signed-off-by: Noam Gat <noamgat@gmail.com>
2024-11-21 23:59:47 -08:00
b6374e09b0 [Bugfix] Fix Phi-3 BNB quantization with tensor parallel (#9948)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-22 15:01:56 +08:00
a111d0151f [platforms] absorb worker cls difference into platforms folder (#10555)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2024-11-21 21:00:32 -08:00
446c7806b2 [Minor] Fix line-too-long (#10563)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-21 19:40:40 -08:00
33e0a2540a [9/N] torch.compile LLM usage (#10552)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-21 19:13:31 -08:00
aed074860a [Benchmark] Add new H100 machine (#10547) 2024-11-21 18:27:20 -08:00
9afa014552 Add small example to metrics.rst (#10550) 2024-11-21 23:43:43 +00:00
46fe9b46d8 [Minor] Revert change in offline inference example (#10545)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-21 21:28:16 +00:00
cf656f5a02 [misc] improve error message (#10553)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-21 13:13:17 -08:00
edec3385b6 [CI][Installation] Avoid uploading CUDA 11.8 wheel (#10535)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-11-21 13:03:58 -08:00
f9310cbd0c [V1] Fix Compilation config & Enable CUDA graph by default (#10528)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-21 12:53:39 -08:00
7560ae5caf [8/N] enable cli flag without a space (#10529)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-21 12:30:42 -08:00
e7a8341c7c [Bugfix] Allow token ID-only inputs in Qwen2-Audio (#10536)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-21 18:09:43 +00:00
c51e397fe8 [Misc] Suppress duplicated logging regarding multimodal input pipeline (#10530)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-11-21 09:21:31 -08:00
2385b60d83 [Kernel] Register punica ops directly (#10522)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-21 09:18:11 -08:00
da7e702c6f [Bug]: When apply continue_final_message for OpenAI server, the "echo":false is ignored (#10180)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2024-11-21 16:24:32 +00:00
4d676f0852 [Bugfix] Embedding model pooling_type equals ALL and multi input's bug (#10494) 2024-11-21 14:40:02 +00:00
d5ec121f95 [Model] Expose dynamic_image_size as mm_processor_kwargs for InternVL2 models (#10518)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-21 14:20:08 +00:00
8a93a598d9 fix the issue that len(tokenizer(prompt)["input_ids"]) > prompt_len (#10524)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
2024-11-21 11:15:36 +00:00
1cfde82ffd [Model] Add Support for Multimodal Granite Models (#10291)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-11-21 10:46:20 +00:00
f0e0238016 [Doc] fix a small typo in docstring of llama_tool_parser (#10513) 2024-11-21 09:05:23 +00:00
aaddce5d26 [platforms] improve error message for unspecified platforms (#10520)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 23:07:56 -08:00
3430857b64 [Misc] Increase default video fetch timeout (#10495)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-20 23:06:42 -08:00
8b0fe06c89 [torch.compile] Inductor code caching fix (#10273)
Signed-off-by: luka <luka@neuralmagic.com>
Signed-off-by: Luka Govedic <luka.govedic@gmail.com>
2024-11-20 21:44:57 -08:00
9d827170a3 [Platforms] Add device_type in Platform (#10508)
Signed-off-by: MengqingCao <cmq0113@163.com>
2024-11-21 04:44:20 +00:00
6c1208d083 [Core] Add Sliding Window Support with Flashinfer (#10462)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2024-11-20 19:56:47 -08:00
388ee3de66 [torch.compile] limit inductor threads and lazy import quant (#10482)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 18:36:33 -08:00
2f77b6cfec [TPU] Implement prefix caching for TPUs (#10307)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-20 13:54:15 -08:00
c68f7ede6a [Bugfix]: allow extra fields in requests to openai compatible server (#10463)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2024-11-20 16:42:21 -05:00
0cd3d9717e [7/N] torch.compile, reduce compilation time (#10460)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 11:20:38 -08:00
5f1d6af2b6 [perf bench] H200 development (#9768)
Signed-off-by: simon-mo <simon.mo@hey.com>
2024-11-20 11:06:56 -08:00
772a66732d [platforms] restore xpu check for parallel config (#10479)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 17:13:28 +00:00
63f1fde277 [Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (#10355)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2024-11-20 10:57:39 +00:00
d5b28447e0 [Platforms] Refactor xpu code (#10468)
Signed-off-by: MengqingCao <cmq0113@163.com>
2024-11-19 22:52:13 -08:00
09dbf9ff16 [Bugfix] Handle conflicts between modern and legacy fields (#10471)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-20 14:45:08 +08:00
343041c4c4 [model] Reduce medusa weight (#10454)
Signed-off-by: skylee-01 <497627264@qq.com>
2024-11-20 06:05:55 +00:00
ed701ca963 [ci/build] Combine nightly and optional (#10465) 2024-11-19 21:36:03 -08:00
7629a9c6e5 [CI/Build] Support compilation with local cutlass path (#10423) (#10424) 2024-11-19 21:35:50 -08:00
709c9f1f25 [CI/Build] Add sphinx/rst linter for docs (#10366) 2024-11-19 21:35:31 -08:00
b4be5a8adb [Bugfix] Enforce no chunked prefill for embedding models (#10470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-20 05:12:51 +00:00
ad44437ba3 [Bugfix] Fix Mamba model initialization and MLP Speculator weights loading (#10456)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-20 05:04:05 +00:00
9e05252b46 [Misc] Add __setitem__ for LazyDict (#10469)
Signed-off-by: Yanyi Liu <wolfsonliu@163.com>
2024-11-20 04:44:57 +00:00
d200972e7f [Bugfix] Marlin 2:4 temp fix for large M dim (>256) (#10464)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-11-19 19:40:33 -08:00
d5b68aba2f [CI/Build] Update Dockerfile.rocm (#10434)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2024-11-19 17:19:59 -08:00
a324d3a1a7 Change granite chat template to keep json list formatting for tool calls (#10452)
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
2024-11-19 18:16:54 -07:00
b00b33d77e [Model][Quantization] HQQ support through Marlin kernel expansion (#9766)
Signed-off-by: ElizaWszola <eliza@neuralmagic.com>
2024-11-19 13:31:12 -08:00
efa9084628 [Core] Avoid metrics log noise when idle (#8868)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-19 21:05:25 +00:00
803f37eaaa [6/N] torch.compile rollout to users (#10437)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-19 10:09:03 -08:00
fd9f124971 [Doc] fix link for page that was renamed (#10455)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-19 09:48:30 -08:00
1ea291a417 Fix: Build error seen on Power Architecture (#10421)
Signed-off-by: Manjul Mohan <manjul.mohan@ibm.com>
Signed-off-by: B-201 <Joy25810@foxmail.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: ismael-dm <ismaeldm99@gmail.com>
Signed-off-by: Andrew Nesbitt <andrewnez@gmail.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: yan ma <yan.ma@intel.com>
Signed-off-by: Angus Wang <wangjadehao@gmail.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: rickyx <rickyx@anyscale.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Manjul Mohan manjul.mohan@ibm.com <manjulmohan@ltcd97-lp2.aus.stglabs.ibm.com>
Co-authored-by: B-201 <Joy25810@foxmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: ismael-dm <ismaeldm99@gmail.com>
Co-authored-by: Andrew Nesbitt <andrewnez@gmail.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Angus Wang <wangjadehao@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Ricky Xu <rickyx@anyscale.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2024-11-19 09:34:57 -08:00
11fd7ea639 [Pixtral-Large] Pixtral actually has no bias in vision-lang adapter (#10449) 2024-11-19 17:33:06 +00:00
f028dff33d [BugFix] Fix hermes tool parser output error stream arguments in some cases (#10395) (#10398)
Signed-off-by: xiyuan lee <lixiyuan@haier.com>
2024-11-19 13:42:50 +00:00
b4614656b8 [CI][CPU] adding numa node number as container name suffix (#10441)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
2024-11-19 13:16:43 +00:00
25f9c78961 [misc][plugin] improve plugin loading (#10443)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-19 10:43:21 +00:00
5390d6664f [Doc] Add the start of an arch overview page (#10368) 2024-11-19 09:52:11 +00:00
382b6a4852 [Misc] Avoid misleading warning messages (#10438)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-19 08:54:58 +00:00
272e31c0bd [Bugfix] Guard for negative counter metrics to prevent crash (#10430)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-11-19 04:57:10 +00:00
74f8c2cf5f Add openai.beta.chat.completions.parse example to structured_outputs.rst (#10433) 2024-11-19 04:37:46 +00:00
8c1fb50705 [Platform][Refactor] Extract func get_default_attn_backend to Platform (#10358)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2024-11-19 11:22:26 +08:00
7eb719df13 [Bugfix]Fix Phi-3 BNB online quantization (#10417)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2024-11-19 03:21:42 +00:00
284203f171 [ci/build] Have dependabot ignore all patch update (#10436)
We have too many dependencies and all patch updates can be a little noisy. This is to have dependabot ignore all patch version updates.
2024-11-19 01:04:25 +00:00
90a6c759ca [misc] partial prefix & random input generation benchmark (#9929)
Signed-off-by: rickyx <rickyx@anyscale.com>
2024-11-18 15:39:14 -08:00
2298e69b5f [ci][bugfix] fix kernel tests (#10431)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-18 15:29:37 -08:00
a03ea40792 [3/N][torch.compile] consolidate custom op logging (#10399)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-18 15:14:59 -08:00
96d999fbe8 [Kernel] Initial Machete W4A8 support + Refactors (#9855)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-11-18 12:59:29 -07:00
c2170a5b39 [Kernel] Explicitly specify other value in tl.load calls (#9014)
Signed-off-by: Angus Wang <wangjadehao@gmail.com>
2024-11-18 11:39:40 -08:00
6b2d25efc7 [Hardware][XPU] AWQ/GPTQ support for xpu backend (#10107)
Signed-off-by: yan ma <yan.ma@intel.com>
2024-11-18 11:18:05 -07:00
281cc4b3cd [Model][Bugfix] Support TP for PixtralHF ViT (#10405)
Signed-off-by: mgoin <michael@neuralmagic.com>
2024-11-18 10:04:14 -08:00
4f686d139f Fix open_collective value in FUNDING.yml (#10426)
Signed-off-by: Andrew Nesbitt <andrewnez@gmail.com>
2024-11-18 09:52:42 -08:00
31894a2155 [Doc] Add documentation for Structured Outputs (#9943)
Signed-off-by: ismael-dm <ismaeldm99@gmail.com>
2024-11-18 09:52:12 -08:00
7851b45196 [5/N][torch.compile] torch.jit.script --> torch.compile (#10406)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-18 23:20:06 +08:00
4186be8111 [Doc] Update doc for LoRA support in GLM-4V (#10425)
Signed-off-by: B-201 <Joy25810@foxmail.com>
2024-11-18 15:08:30 +00:00
e7ebb662d7 [Model] Remove transformers attention porting in VITs (#10414)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-18 21:45:21 +08:00
5be4e52b65 [Model][LoRA]LoRA support added for glm-4v (#10418)
Signed-off-by: B-201 <Joy25810@foxmail.com>
2024-11-18 12:57:10 +00:00
01aae1cc68 [Model] Remove redundant softmax when using PoolingType.STEP (#10415) 2024-11-18 10:05:36 +00:00
c7dec926f6 [VLM] Report multi_modal_placeholders in output (#10407)
Signed-off-by: Linkun Chen <lkchen+anyscale@github.com>
2024-11-18 16:06:16 +08:00
51bb12d17b [4/N][torch.compile] clean up set_torch_compile_backend (#10401)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-17 23:57:20 -08:00
47826cacf0 [Bugfix] Ignore ray reinit error when current platform is ROCm or XPU (#10375)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2024-11-18 11:29:26 +08:00
c4e464333e [Misc] Add uninitialized params tracking for AutoWeightsLoader (#10327)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-18 09:07:46 +08:00
d1557e66d3 [Misc] Enhance offline_inference to support user-configurable paramet… (#10392)
Signed-off-by: wchen61 <wchen61@foxmail.com>
2024-11-17 11:32:40 +00:00
80d85c5d7b [Bugfix] Fix mrope_position_delta in non-last prefill chunk (#10403)
Signed-off-by: imkero <kerorek@outlook.com>
2024-11-17 08:50:24 +00:00
76aab90ab6 [Hardware] [HPU]add mark_step for hpu (#10239)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2024-11-17 00:44:44 -08:00
8d74b5aee9 [platforms] refactor cpu code (#10402)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-16 23:14:23 -08:00
cf349c4a97 [Bugfix][CPU] Fix CPU embedding runner with tensor parallel (#10394)
Signed-off-by: Isotr0py <2037008807@qq.com>
2024-11-16 23:12:04 -08:00
905d0f0af4 [CI/Build] Fix IDC hpu [Device not found] issue (#10384)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2024-11-17 14:58:22 +08:00
643ecf7b11 [V1] Refactor model executable interface for all text-only language models (#10374)
Signed-off-by: Roger Wang <ywang@roblox.com>
2024-11-17 05:18:46 +00:00
4fd9375028 [2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-16 18:02:14 -08:00
661a34fd4f [V1] Add code owners for V1 (#10397)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-11-16 10:45:26 -08:00
361c29e174 [Bugfix] Fix M-RoPE position calculation when chunked prefill is enabled (#10388)
Signed-off-by: imkero <kerorek@outlook.com>
2024-11-17 02:10:00 +08:00
b98d89efd4 [Misc] Medusa supports custom bias (#10361) 2024-11-16 16:33:01 +00:00
8b6725b0cf [Misc] Update benchmark to support image_url file or http (#10287)
Signed-off-by: rbbang <anjaehyun87@gmail.com>
2024-11-16 18:15:40 +08:00
1d75472626 [BugFix] [Kernel] Fix GPU SEGV occuring in fused_moe kernel (#10385)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2024-11-16 09:55:05 +00:00
2f427c2d16 [misc][plugin] improve log messages (#10386)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-16 01:23:20 -08:00
755b85359b [doc] add doc for the plugin system (#10372)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-15 21:46:27 -08:00
32e46e000f [Frontend] Automatic detection of chat content format from AST (#9919)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-16 13:35:40 +08:00
4f168f69a3 [Docs] Misc updates to TPU installation instructions (#10165) 2024-11-15 13:26:17 -08:00
3e8d14d8a1 [Doc] Move PR template content to docs (#10159)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-15 13:20:20 -08:00
a067f85e08 [Frontend] Add --version flag to CLI (#10369)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2024-11-15 13:13:53 -08:00
c76ac49d26 [Docs] Add Nebius as sponsors (#10371)
Signed-off-by: simon-mo <simon.mo@hey.com>
2024-11-15 12:47:40 -08:00
1195 changed files with 99201 additions and 38608 deletions

View File

@@ -2,8 +2,11 @@ import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
def print_top_10_largest_files(zip_file):
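For orientation, here is a minimal, self-contained sketch of how a wheel-size gate driven by VLLM_MAX_SIZE_MB can be implemented. This is not the actual check script; the helper name and the top-10 listing are assumptions based on the function name above.

import os
import sys
import zipfile

# Same default as in the diff above; override via the environment.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))

def check_wheel_size(wheel_path: str) -> int:
    # Compare the on-disk wheel size against the configured limit.
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb <= VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.1f} MB, within the {VLLM_MAX_SIZE_MB} MB limit.")
        return 0
    print(f"{wheel_path} is {size_mb:.1f} MB, exceeding the {VLLM_MAX_SIZE_MB} MB limit.")
    # List the largest members so the size regression is easy to spot.
    with zipfile.ZipFile(wheel_path) as zf:
        largest = sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
        for info in largest:
            print(f"  {info.file_size / (1024 * 1024):8.1f} MB  {info.filename}")
    return 1

if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))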

View File

@@ -0,0 +1,24 @@
import argparse
import os
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1/>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""
parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()
filename = os.path.basename(args.wheel)
with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))
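As a quick illustration of the escaping rule mentioned in the comment above (the wheel filename below is hypothetical, not taken from the repository):

filename = "vllm-0.0.0+cu118-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical name
print(filename.replace("+", "%2B"))
# -> vllm-0.0.0%2Bcu118-cp38-abi3-manylinux1_x86_64.whl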

View File

@@ -1,5 +1,6 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
@@ -9,16 +10,18 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
@@ -41,20 +44,49 @@ steps:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@@ -157,6 +157,18 @@ if __name__ == "__main__":
throughput_results,
serving_results)
    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue
        # Sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
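To make the GPU-column rewrite above easier to follow, a tiny standalone sketch with toy data (the real dataframes come from the benchmark results, not from this snippet):

import pandas as pd

df = pd.DataFrame({"Test name": ["b", "a"], "GPU": ["H100\nH100\nH100\nH100"] * 2})
df.sort_values(by="Test name", inplace=True)

def collapse_gpu(gpu: str) -> str:
    # "GPUTYPE\nGPUTYPE\n..." -> "4xGPUTYPE"
    parts = gpu.split("\n")
    return f"{len(parts)}x{parts[0]}"

df["GPU"] = df["GPU"].apply(collapse_gpu)
print(df)  # both rows now show "4xH100"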

View File

@@ -43,7 +43,7 @@ main() {
# The figures should be genereated by a separate process outside the CI/CD pipeline
# The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib

View File

@@ -301,6 +301,104 @@ run_serving_tests() {
kill_gpu_processes
}
run_genai_perf_tests() {
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
genai_perf_test_file=$1
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
#TODO: add output dir.
client_command="genai-perf profile \
-m $model \
--service-kind openai \
--backend vllm \
--endpoint-type chat \
--streaming \
--url localhost:$port \
--request-rate $qps \
--num-prompts $num_prompts \
"
echo "Client command: $client_command"
eval "$client_command"
#TODO: process/record outputs
done
done
kill_gpu_processes
}
prepare_dataset() {
@@ -328,12 +426,17 @@ main() {
pip install -U transformers
pip install -r requirements-dev.txt
which genai-perf
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
# genai-perf dependency
ensure_installed libb64-0d
prepare_dataset
@@ -345,6 +448,10 @@ main() {
# run the test
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
# run genai-perf tests
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
mv artifacts/ $RESULTS_FOLDER/
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"

View File

@@ -6,6 +6,7 @@
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pkill -f pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
pgrep python3 | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \

View File

@@ -1,6 +1,6 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
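For readers unfamiliar with the ECR Public API flow in this script, a rough Python equivalent of the same two requests (the registry, repository and timeout are taken from the diff; everything else is illustrative and not part of the actual script):

import json
import os
import urllib.error
import urllib.request

commit = os.environ.get("BUILDKITE_COMMIT", "latest")  # illustrative fallback
token_url = ("https://public.ecr.aws/token?service=public.ecr.aws"
             "&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull")
with urllib.request.urlopen(token_url, timeout=10) as resp:
    token = json.load(resp)["token"]

manifest_url = f"https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/{commit}"
request = urllib.request.Request(manifest_url, headers={"Authorization": f"Bearer {token}"})
try:
    with urllib.request.urlopen(request, timeout=10) as resp:
        print("Image manifest found, HTTP", resp.status)
except urllib.error.HTTPError as err:
    print("Image not available yet, HTTP", err.code)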

View File

@@ -0,0 +1,23 @@
[
    {
        "test_name": "llama8B_tp1_genai_perf",
        "qps_list": [4,8,16,32],
        "common_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "tp": 1,
            "port": 8000,
            "num_prompts": 500,
            "reuse_server": false
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        "genai_perf_input_parameters": {
        }
    }
]
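A short sketch of how a runner might consume this file, mirroring the jq extraction in run_genai_perf_tests above (the file path and the print statement are illustrative):

import json

with open(".buildkite/nightly-benchmarks/tests/genai-perf-tests.json") as f:
    test_cases = json.load(f)

for case in test_cases:
    name = case["test_name"]
    common = case["common_parameters"]
    for qps in case["qps_list"]:
        # The shell script substitutes num_prompts when qps is "inf".
        if qps == "inf":
            qps = common["num_prompts"]
        print(f'{name}: model={common["model"]} tp={common["tp"]} qps={qps}')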

View File

@@ -1,7 +1,7 @@
steps:
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -18,7 +18,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
@@ -26,3 +26,47 @@ steps:
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image"
depends_on: ~
key: block-release-image-build
- label: "Build release image"
depends_on: block-release-image-build
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllm
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
- label: "Build and publish CPU release image"
depends_on: block-cpu-release-image-build
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
env:
DOCKER_BUILDKIT: "1"

View File

@@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \

View File

@@ -4,49 +4,11 @@
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
function cpu_tests() {
set -e
# Run basic model test
docker exec cpu-test bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
}
# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests"

View File

@@ -9,35 +9,33 @@ CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# offline inference
docker exec cpu-test-avx2 bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"
python3 examples/offline_inference/basic.py"
# Run basic model test
docker exec cpu-test bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pip install -r vllm/requirements-test.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -45,20 +43,26 @@ function cpu_tests() {
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
docker exec cpu-test bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# online inference
docker exec cpu-test bash -c "
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -71,8 +75,14 @@ function cpu_tests() {
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
}
# All of CPU tests are expected to be finished less than 25 mins.
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests $CORE_RANGE"
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@@ -0,0 +1,28 @@
#!/bin/bash
# This script build the GH200 docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py
'

View File

@@ -8,9 +8,17 @@ set -ex
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
EXITCODE=$?

View File

@@ -3,6 +3,18 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
set -v
image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,33 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
# Remove huggingface model artifacts and compiler cache
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"

View File

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py

11
.buildkite/run-tpu-test.sh Normal file → Executable file
View File

@ -14,4 +14,13 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
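For local debugging it can be useful to re-run a single step from the chained command above in a fresh container; a hedged sketch, assuming the vllm-tpu image has already been built by this pipeline and HF_TOKEN is exported:
# Sketch: rerun just the offline-inference example against the same image.
docker run --privileged --net host --shm-size=16G --rm \
    -e "HF_TOKEN=$HF_TOKEN" vllm-tpu \
    python3 /workspace/vllm/examples/offline_inference/tpu.py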

View File

@ -12,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
'

View File

@ -9,8 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware platforms to also run the test on. currently only supports [amd]
@ -39,7 +38,7 @@ steps:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
@ -51,7 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/standalone_tests/lazy_torch_compile.py
commands:
- python3 standalone_tests/lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@ -60,6 +61,13 @@ steps:
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Python-only Installation Test
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
@ -68,7 +76,9 @@ steps:
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
@ -98,14 +108,12 @@ steps:
source_file_dependencies:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -119,11 +127,15 @@ steps:
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
- examples/offline_inference/rlhf.py
commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
@ -171,29 +183,41 @@ steps:
- vllm/
- tests/v1
commands:
- pytest -v -s v1
# split the tests to avoid interference
- VLLM_USE_V1=1 pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
# TODO: accuracy does not match on H100, whether or not
# VLLM_USE_FLASHINFER_SAMPLER is set.
- VLLM_USE_V1=1 pytest -v -s v1/e2e
- label: Examples Test # 15min
- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
- pip install awscli tensorizer # for llava example and tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- python3 offline_profile.py --model facebook/opt-125m
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
@ -205,6 +229,7 @@ steps:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
- tests/samplers
- tests/conftest.py
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@ -213,23 +238,29 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 30min
- label: Speculative decoding tests # 40min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
- vllm/model_executor/models/eagle.py
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
@ -313,17 +344,15 @@ steps:
##### models test #####
- label: Basic Models Test # 30min
- label: Basic Models Test # 24min
source_file_dependencies:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 42min
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@ -333,10 +362,9 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- pytest -v -s models/embedding/vision_language -m core_model
- label: Language Models Test (Extended) # 50min
nightly: true
- label: Language Models Test (Extended) # 1h10min
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
@ -345,24 +373,28 @@ steps:
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 26min
- label: Multi-Modal Models Test (Standard) # 40min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) # 1h15m
nightly: true
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
@ -370,14 +402,26 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
# This test is used only in the PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
@ -412,11 +456,11 @@ steps:
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
@ -429,19 +473,45 @@ steps:
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# other tests continue here:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
@ -458,7 +528,9 @@ steps:
- vllm/engine
- tests/multi_step
commands:
- pytest -v -s multi_step/test_correctness_async_llm.py
# this test is quite flaky
# TODO: investigate and fix.
# - pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min
@ -474,18 +546,23 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.
- label: LoRA TP Test (Distributed)
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora/test_long_context
- tests/lora
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
@ -513,6 +590,7 @@ steps:
- label: Distributed Tests (A100) # optional
gpu: a100
optional: true
num_gpus: 4
source_file_dependencies:
- vllm/
@ -521,11 +599,12 @@ steps:
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:

View File

@ -23,16 +23,49 @@ wheel="$new_wheel"
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
normal_wheel="$wheel" # Save the original wheel filename
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
new_version="1.0.0.dev"
suffix="${version##*.}"
if [[ $suffix == cu* ]]; then
new_version="1.0.0.dev+${suffix}"
else
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
# use cp to keep both files in the artifacts directory
cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
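A small self-contained illustration of the renaming rule above, using hypothetical version strings, shows how a trailing cu* component is preserved while everything else collapses to 1.0.0.dev:
# Illustration only (hypothetical version strings), mirroring the logic above.
for version in "0.6.6.dev173+g1234abcd.cu118" "0.6.6.dev173+g1234abcd"; do
    suffix="${version##*.}"
    if [[ $suffix == cu* ]]; then
        new_version="1.0.0.dev+${suffix}"
    else
        new_version="1.0.0.dev"
    fi
    echo "${version} -> ${new_version}"   # prints 1.0.0.dev+cu118, then 1.0.0.dev
done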
# Upload the wheel to S3
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
# generate index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

30
.github/CODEOWNERS vendored
View File

@ -2,29 +2,35 @@
# for more info about CODEOWNERS file
# This list covers the "core" components of vLLM that require careful review
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
CMakeLists.txt @tlrmchlsmth @WoosukKwon
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin
/vllm/multimodal @DarkLight1337 @ywang96
CMakeLists.txt @tlrmchlsmth
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
# Test ownership
/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-neuralmagic
/tests/quantization @mgoin @robertgshaw2-redhat
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/multi_step @alexm-neuralmagic @comaniac
/tests/multi_step @alexm-redhat @comaniac
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

2
.github/FUNDING.yml vendored
View File

@ -1,2 +1,2 @@
github: [vllm-project]
open_collective: [vllm]
open_collective: vllm

View File

@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

View File

@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
---
<details>
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
<summary><b> PR Checklist (Click to Expand) </b></summary>
<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
<h3>PR Title and Classification</h3>
<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
<ul>
<li><code>[Bugfix]</code> for bug fixes.</li>
<li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
<li><code>[Doc]</code> for documentation fixes and improvements.</li>
<li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
<li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
<li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
<li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
<li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
<li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
</ul>
<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
<h3>Code Quality</h3>
<p>The PR needs to meet the following code quality standards:</p>
<ul>
<li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
<li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
<li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
<li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
</ul>
<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.library.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
</ul>
<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through with the PR.</p>
<h3>What to Expect for the Reviews</h3>
<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
<ul>
<li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
<li> After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
<li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
<li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
</li>
</ul>
<h3>Thank You</h3>
<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
</details>
**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **

View File

@ -15,6 +15,8 @@ updates:
allow:
- dependency-type: "all"
ignore:
- dependency-name: "*"
update-types: ["version-update:semver-patch"]
- dependency-name: "torch"
- dependency-name: "torchvision"
- dependency-name: "xformers"
@ -24,9 +26,6 @@ updates:
- dependency-name: "ray[adag]"
- dependency-name: "lm-eval"
groups:
patch-update:
applies-to: version-updates
update-types: ["patch"]
minor-update:
applies-to: version-updates
update-types: ["minor"]

View File

@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF
import re
with open("${NEW}", "r") as file:
content = file.read()
pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
content = re.sub(pattern, '', content)
with open("${NEW}", "w") as file:
file.write(content)
EOF
# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${NEW}"; then
echo "Updating PR body"
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
echo
echo "Updated PR body:"
echo
cat "${NEW}"
else
echo "No changes needed"
fi
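To preview the effect of this cleanup without editing a live PR, the same sed transformations can be applied to a saved copy of the body; a hedged sketch (12345 is a placeholder PR number, gh must be authenticated, and the HTML <details> removal step is omitted for brevity):
# Dry-run sketch: apply the same edits to a copy and inspect the difference.
gh pr view --json body --template "{{.body}}" 12345 > /tmp/body_old.txt
cp /tmp/body_old.txt /tmp/body_new.txt
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' /tmp/body_new.txt
sed -i '/FIX #xxxx.*$/d' /tmp/body_new.txt
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' /tmp/body_new.txt
diff -u /tmp/body_old.txt /tmp/body_new.txt || true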

View File

@ -1,40 +0,0 @@
name: Lint GitHub Actions workflows
on:
push:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color

View File

@ -1,53 +0,0 @@
name: clang-format
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- '**/*.h'
- '**/*.cpp'
- '**/*.cu'
- '**/*.cuh'
- '.github/workflows/clang-format.yml'
pull_request:
branches:
- main
paths:
- '**/*.h'
- '**/*.cpp'
- '**/*.cu'
- '**/*.cuh'
- '.github/workflows/clang-format.yml'
jobs:
clang-format:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install clang-format==18.1.5
- name: Running clang-format
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/quantization/gguf/ggml-common.h'
'csrc/quantization/gguf/dequantize.cuh'
'csrc/quantization/gguf/vecdotq.cuh'
'csrc/quantization/gguf/mmq.cuh'
'csrc/quantization/gguf/mmvq.cuh'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror

View File

@ -1,45 +0,0 @@
name: codespell
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- "**/*.md"
- "**/*.rst"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/codespell.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- "**/*.md"
- "**/*.rst"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/codespell.yml
jobs:
codespell:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml

82
.github/workflows/lint-and-deploy.yaml vendored Normal file
View File

@ -0,0 +1,82 @@
name: Lint and Deploy Charts
on: pull_request
jobs:
lint-and-deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
with:
version: v3.14.4
# Python is required because ct lint runs Yamale and yamllint, which require Python.
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.13'
- name: Set up chart-testing
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
with:
version: v3.10.1
- name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
- name: Setup minio
run: |
docker network create vllm-net
docker run -d -p 9000:9000 --name minio --net vllm-net \
-e "MINIO_ACCESS_KEY=minioadmin" \
-e "MINIO_SECRET_KEY=minioadmin" \
-v /tmp/data:/data \
-v /tmp/config:/root/.minio \
minio/minio server /data
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_EC2_METADATA_DISABLED=true
mkdir opt-125m
cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
- name: Create kind cluster
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |
docker pull amazon/aws-cli:2.6.4
kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
kind load docker-image vllm-cpu-env:latest --name chart-testing
docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
kubectl create ns ns-vllm
- name: Run chart-testing (install)
run: |
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
sleep 10
CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
--header "Content-Type: application/json" \
--data '{
"model": "opt-125m",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'):$CODE"
echo "$CODE"

View File

@ -1,17 +0,0 @@
{
"problemMatcher": [
{
"owner": "ruff",
"pattern": [
{
"regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
"file": 1,
"line": 2,
"column": 3,
"code": 4,
"message": 5
}
]
}
]
}

View File

@ -1,51 +0,0 @@
name: mypy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- '**/*.py'
- '.github/workflows/mypy.yaml'
- 'tools/mypy.sh'
- 'pyproject.toml'
pull_request:
branches:
- main
# This workflow is only relevant when one of the following files changes.
# However, we have GitHub configured to expect and require this workflow
# to run and pass before GitHub will auto-merge a pull request. Until GitHub
# allows a more flexible auto-merge policy, we can just run this on every PR.
# It doesn't take that long to run, anyway.
#paths:
# - '**/*.py'
# - '.github/workflows/mypy.yaml'
# - 'tools/mypy.sh'
# - 'pyproject.toml'
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.11.1
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1 ${{ matrix.python-version }}

19
.github/workflows/pre-commit.yml vendored Normal file
View File

@ -0,0 +1,19 @@
name: pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:
extra_args: --all-files --hook-stage manual

View File

@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
wheel:
name: Build Wheel
runs-on: ${{ matrix.os }}
needs: release
# NOTE(simon): We no longer build the wheel using GitHub Actions. See Buildkite's release workflow.
# wheel:
# name: Build Wheel
# runs-on: ${{ matrix.os }}
# needs: release
strategy:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
# strategy:
# fail-fast: false
# matrix:
# os: ['ubuntu-20.04']
# python-version: ['3.9', '3.10', '3.11', '3.12']
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
# cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# steps:
# - name: Checkout
# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Setup ccache
uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
# - name: Setup ccache
# uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
# with:
# create-symlink: true
# key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
bash -x .github/workflows/scripts/env.sh
# - name: Set up Linux Env
# if: ${{ runner.os == 'Linux' }}
# run: |
# bash -x .github/workflows/scripts/env.sh
- name: Set up Python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
# - name: Set up Python
# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
# with:
# python-version: ${{ matrix.python-version }}
- name: Install CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
# - name: Install CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
- name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
# - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
# - name: Build wheel
# shell: bash
# env:
# CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
# run: |
# bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
# wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
# asset_name=${wheel_name//"linux"/"manylinux1"}
# echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
- name: Upload Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
# - name: Upload Release Asset
# uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ needs.release.outputs.upload_url }}
# asset_path: ./dist/${{ env.wheel_name }}
# asset_name: ${{ env.asset_name }}
# asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package

View File

@ -1,52 +0,0 @@
name: ruff
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- pyproject.toml
- requirements-lint.txt
- .github/workflows/matchers/ruff.json
- .github/workflows/ruff.yml
pull_request:
branches:
- main
# This workflow is only relevant when one of the following files changes.
# However, we have GitHub configured to expect and require this workflow
# to run and pass before GitHub will auto-merge a pull request. Until GitHub
# allows a more flexible auto-merge policy, we can just run this on every PR.
# It doesn't take that long to run, anyway.
#paths:
# - "**/*.py"
# - pyproject.toml
# - requirements-lint.txt
# - .github/workflows/matchers/ruff.json
# - .github/workflows/ruff.yml
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only

View File

@ -1,37 +0,0 @@
name: Lint shell scripts
on:
push:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
pull_request:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check shell scripts"
run: |
tools/shellcheck.sh

View File

@ -1,38 +0,0 @@
name: yapf
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
pull_request:
branches:
- main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .

3
.gitignore vendored
View File

@ -79,8 +79,7 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
docs/source/getting_started/examples/
# PyBuilder
.pybuilder/

93
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,93 @@
default_stages:
- pre-commit # Run locally
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27
hooks:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
hooks:
- id: mypy-local
name: Run mypy for local Python installation
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh
language: script
types: [shell]
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh
language: script
types: [png]
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false

141
CMakeLists.txt Normal file → Executable file
View File

@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@ -34,7 +31,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
@ -181,6 +178,31 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
# Define other extension targets
#
#
# cumem_allocator extension
#
set(VLLM_CUMEM_EXT_SRC
"csrc/cumem_allocator.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_CUMEM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling cumem allocator extension.")
# link against cuda driver library
list(APPEND CUMEM_LIBS cuda)
define_gpu_extension_target(
cumem_allocator
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_CUMEM_EXT_SRC}
LIBRARIES ${CUMEM_LIBS}
USE_SABI 3.8
WITH_SOABI)
endif()
#
# _C extension
#
@ -196,6 +218,8 @@ set(VLLM_EXT_SRC
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@ -204,19 +228,32 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
FetchContent_Declare(
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
endif()
if(VLLM_CUTLASS_SRC_DIR)
if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
endif()
message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
else()
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1
GIT_TAG v3.6.0
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
)
)
endif()
FetchContent_MakeAvailable(cutlass)
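As a hedged usage sketch (the local path is hypothetical), an existing CUTLASS checkout can be supplied through the environment variable read above instead of letting FetchContent download it; the variable is picked up by CMake from the build environment:
# Sketch: build against a local CUTLASS tree via VLLM_CUTLASS_SRC_DIR.
VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .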
list(APPEND VLLM_EXT_SRC
@ -224,10 +261,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -236,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@ -256,10 +295,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures")
endif()
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
@ -288,7 +326,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -309,6 +347,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
#
# 2:4 Sparse Kernels
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.")
else()
message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels
@ -390,7 +453,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
@ -414,7 +477,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@ -469,7 +532,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
endif()
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
return()
endif ()
@ -492,7 +555,7 @@ endif()
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
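# Illustrative (assumed invocation, not part of this file): with a configured build tree,
# a single component can be installed via `cmake --install <build-dir> --component _vllm_fa2_C`
# (or `--component _vllm_fa3_C`).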
@ -504,43 +567,41 @@ if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
# Copy over the vllm-flash-attn python files
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3 in
# case only one is built; if both are built, the copy is redundant but harmless)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above


@ -2,8 +2,8 @@
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
@ -44,12 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
# The arm64 (GH200) build follows the "use existing pytorch" practice: we install
# torch and torchvision from the nightly builds first, so that pytorch does not
# appear as a vLLM dependency in any of the steps after this one.
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
@ -63,6 +73,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
@ -115,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# Default max size of the wheel is 250MB
ARG VLLM_MAX_SIZE_MB=250
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=300
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -134,15 +145,17 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@ -151,7 +164,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
@ -168,17 +181,45 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# install vllm wheel first, so that torch etc will be installed
# The arm64 (GH200) build follows the "use existing pytorch" practice: we install
# torch and torchvision from the nightly builds first, so that pytorch does not
# appear as a vLLM dependency in any of the steps after this one.
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
COPY examples examples
#################### vLLM installation IMAGE ####################
# How to build this FlashInfer wheel:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
# Although we build FlashInfer with AOT mode, there are still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
@ -191,6 +232,10 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
@ -205,18 +250,30 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image
# define sagemaker first, so it is not the default target of `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
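# Illustrative (assumed invocation, not part of the Dockerfile): a specific stage can be
# selected at build time, e.g. `docker build --target vllm-sagemaker -f Dockerfile .`
# or `docker build --target vllm-openai -f Dockerfile .`.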

Dockerfile.arm (new file, 62 lines)

@ -0,0 +1,62 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
FROM ubuntu:22.04 AS cpu-test-arm
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
# Set LD_PRELOAD for tcmalloc on ARM
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
RUN echo 'ulimit -c 0' >> ~/.bashrc
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
FROM cpu-test-arm AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Disabling AVX512 specific optimizations for ARM
ARG VLLM_CPU_DISABLE_AVX512="true"
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -16,7 +16,7 @@ RUN --mount=type=cache,target=/var/cache/apt \
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
@ -62,4 +62,8 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm
@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks


@ -1,5 +1,6 @@
# default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE
@ -14,16 +15,17 @@ RUN apt-get update && \
ffmpeg libsm6 libxext6 libgl1
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
# When launching the container, mount the code directory to /workspace
ARG APP_MOUNT=/workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
COPY . .
ARG GIT_REPO_CHECK=0
@ -38,4 +40,10 @@ ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation -v -e .
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
CMD ["/bin/bash"]


@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install -U pip
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
@ -22,4 +23,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
CMD ["/bin/bash"]


@ -4,7 +4,7 @@ USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
@ -29,6 +28,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks


@ -1,171 +1,119 @@
# Default ROCm 6.2 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
# default base image
ARG REMOTE_VLLM="0"
ARG USE_CYTHON="0"
ARG BUILD_RPD="1"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
FROM ${BASE_IMAGE} AS base
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="3cea2fb"
# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e192dba"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
# Remove sccache
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
# Install torch == 2.6.0 on ROCm
RUN --mount=type=cache,target=/root/.cache/pip \
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
torch==2.6.0.dev20240918 \
'setuptools-scm>=8' \
torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
# -----------------------
# vLLM fetch stages
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git checkout ${VLLM_BRANCH}
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG USE_CYTHON
# Build vLLM
RUN cd vllm \
&& python3 -m pip install -r requirements-rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
# -----------------------
# Test vLLM image
FROM base AS test
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing)
RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install pytest-shard
# -----------------------
# Final vLLM image
FROM base AS final
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
RUN case "$(which python3)" in \
*"/opt/conda/envs/py_3.9"*) \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
RUN python3 -m pip install --upgrade huggingface-hub[cli]
ARG BUILD_RPD
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
&& cd rocmProfileData/rpd_tracer \
&& pip install -r requirements.txt && cd ../ \
&& make && make install \
&& cd hipMarker && python3 setup.py install ; fi
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
# Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
ARG COMMON_WORKDIR
### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
&& python3 -m pip wheel . --wheel-dir=/install
# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& python3 -m pip install ninja cmake wheel pybind11 \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install --upgrade pip
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=bind,source=.git,target=.git \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& python3 setup.py clean --all \
&& python3 setup.py develop
# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"]

Dockerfile.rocm_base (new file, 158 lines)

@ -0,0 +1,158 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="8d4926e"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12
RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN cd hipBLAS-common \
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make package \
&& dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
FROM base AS build_rccl
ARG RCCL_BRANCH
ARG RCCL_REPO
RUN git clone ${RCCL_REPO}
RUN cd rccl \
&& git checkout ${RCCL_BRANCH} \
&& ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
RUN cd triton \
&& git checkout ${TRITON_BRANCH} \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
FROM base AS build_amdsmi
RUN cd /opt/rocm/share/amd_smi \
&& pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
ARG BASE_IMAGE
ARG HIPBLASLT_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
&& echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
&& echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt


@ -1,4 +1,4 @@
ARG NIGHTLY_DATE="20241017"
ARG NIGHTLY_DATE="20250124"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
-r requirements-tpu.txt
RUN python3 setup.py develop
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
CMD ["/bin/bash"]


@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -16,9 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
---
*Latest News* 🔥
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
@ -33,10 +36,12 @@ Easy, fast, and cheap LLM serving for everyone
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
vLLM is fast with:
- State-of-the-art serving throughput
- Efficient management of attention key and value memory with **PagedAttention**
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
@ -59,7 +64,7 @@ vLLM is flexible and easy to use with:
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
@ -67,16 +72,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod
## Getting Started
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
```bash
pip install vllm
```
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
## Contributing
@ -89,27 +94,33 @@ vLLM is a community project. Our compute resources for development and testing a
<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
Cash Donations:
- a16z
- Dropbox
- Sequoia Capital
- Skywork AI
- ZhenFund
Compute Resources:
- AMD
- Anyscale
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- Nebius
- Novita AI
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Skywork AI
- Trainy
- UC Berkeley
- UC San Diego
- ZhenFund
Slack Sponsor: Anyscale
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
@ -132,3 +143,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
## Media Kit
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).


@ -4,7 +4,7 @@
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
---


@ -22,8 +22,10 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
best_of: int = 1
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@ -33,9 +35,11 @@ class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@ -47,13 +51,15 @@ async def async_request_tgi(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
# TGI does not accept ignore_eos flag.
}
payload = {
@ -75,7 +81,7 @@ async def async_request_tgi(
continue
chunk_bytes = chunk_bytes.decode("utf-8")
#NOTE: Sometimes TGI returns a ping response without
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
@ -118,7 +124,8 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
@ -152,7 +159,7 @@ async def async_request_trt_llm(
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
@ -182,7 +189,8 @@ async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
@ -230,17 +238,25 @@ async def async_request_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model,
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
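A minimal sketch of how the new `model_name`, `ignore_eos`, and `extra_body` fields are folded into the request payload assembled above (illustrative values only, other fields such as `max_tokens` omitted; not part of the benchmark code):

```python
from typing import Optional


def build_payload(prompt: str,
                  model: str,
                  model_name: Optional[str] = None,
                  ignore_eos: bool = False,
                  extra_body: Optional[dict] = None) -> dict:
    # The served model name (if given) overrides the weights path in the payload.
    payload = {
        "model": model_name if model_name else model,
        "prompt": prompt,
        "temperature": 0.0,
        "stream": True,
        # Ask the server to append a final usage chunk carrying completion_tokens.
        "stream_options": {"include_usage": True},
    }
    if ignore_eos:
        payload["ignore_eos"] = True
    if extra_body:
        # Extra fields (e.g. guided-decoding options) are passed through verbatim.
        payload.update(extra_body)
    return payload


print(build_payload("Hello", "meta-llama/Llama-3.1-8B-Instruct",
                    model_name="my-served-model", extra_body={"top_k": 5}))
```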
@ -249,7 +265,6 @@ async def async_request_openai_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
@ -264,15 +279,16 @@ async def async_request_openai_completions(
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
if chunk != "[DONE]":
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if data["choices"][0]["text"]:
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
@ -286,7 +302,10 @@ async def async_request_openai_completions(
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
@ -295,7 +314,7 @@ async def async_request_openai_completions(
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = latency
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
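For reference, a small self-contained sketch of the two kinds of `data:` chunks the loop above now distinguishes, assuming an OpenAI-compatible server with `stream_options.include_usage` enabled (the chunks are made up, not captured server output):

```python
import json

# A normal token chunk, and the final usage-only chunk emitted when include_usage is set.
token_chunk = '{"choices": [{"index": 0, "text": " Hello"}]}'
usage_chunk = '{"choices": [], "usage": {"prompt_tokens": 12, "completion_tokens": 64}}'

for raw in (token_chunk, usage_chunk):
    data = json.loads(raw)
    if choices := data.get("choices"):
        # Token chunk: text may be empty (e.g. for special tokens), hence `or ""`.
        print("text:", choices[0].get("text") or "")
    elif usage := data.get("usage"):
        # Usage chunk: report the server-side completion token count.
        print("completion_tokens:", usage.get("completion_tokens"))
```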
@ -318,12 +337,14 @@ async def async_request_openai_chat_completions(
"chat/completions"
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": request_func_input.model,
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"messages": [
{
"role": "user",
@ -333,8 +354,14 @@ async def async_request_openai_chat_completions(
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@ -358,17 +385,15 @@ async def async_request_openai_chat_completions(
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
delta = data["choices"][0]["delta"]
if delta.get("content", None):
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
@ -376,13 +401,16 @@ async def async_request_openai_chat_completions(
output.itl.append(timestamp -
most_recent_timestamp)
generated_text += delta["content"]
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = latency
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
@ -410,14 +438,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:
def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
pretrained_model_name_or_path: str,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path)
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
trust_remote_code=trust_remote_code)
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError(
"Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.transformers_utils.tokenizer import MistralTokenizer
except ImportError as e:
raise ImportError("MistralTokenizer requires vllm package.\n"
"Please install it with `pip install vllm` "
"to use mistral tokenizer mode.") from e
return MistralTokenizer.from_pretrained(
str(pretrained_model_name_or_path))
else:
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
**kwargs,
)
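# Illustrative usage (not part of this file): the new tokenizer_mode argument lets callers
# pick the tokenizer flavour, e.g.
#   get_tokenizer("mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")  # needs vLLM installed
#   get_tokenizer("/path/to/model", tokenizer_mode="slow")  # forces use_fast=False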
ASYNC_REQUEST_FUNCS = {


@ -0,0 +1,494 @@
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
from typing import List
import datasets
import pandas as pd
import uvloop
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.sampling_params import GuidedDecodingParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
schema: The guided-decoding target (JSON schema, grammar, regex, or choice list).
structure_type: The GuidedDecodingParams field the schema is passed as
(e.g. 'json', 'grammar', 'regex').
completion: Optional reference completion recorded as the expected output.
"""
prompt: str
prompt_len: int
expected_output_len: int
schema: dict
structure_type: str = 'json'
completion: str = None
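# Illustrative (not part of the benchmark file): a JSON-schema request could be constructed as
#   SampleRequest(prompt="Generate a user profile for the schema.", prompt_len=12,
#                 expected_output_len=512, schema={"type": "object"},
#                 structure_type="json")
# where the prompt_len value is a made-up token count.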
def run_vllm(requests: List[SampleRequest],
engine_args: EngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**vars(engine_args))
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
# randomly select the indices of the requests that will use guided decoding
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if guided_decoding_rate > 0 else None,
))
llm.generate(prompts, sampling_params, use_tqdm=False)
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
**{request.structure_type: request.schema})
if i in guided_decoding_req_idx else None,
))
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
ret = []
for output, request in zip(outputs, requests):
generated_text = output.outputs[0].text
ret.append({
"generated": generated_text,
"expected": request.completion
})
end = time.perf_counter()
return end - start, ret
async def run_vllm_async(
requests: List[SampleRequest],
engine_args: AsyncEngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False,
disable_frontend_multiprocessing: bool = False) -> float:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
json=request.schema)
if guided_decoding_rate > 0 else None,
))
generators = []
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if i in guided_decoding_req_idx else None,
))
generators = []
start_time = []
latencies = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
start_time.append(time.perf_counter())
latencies.append([])
all_gens = merge_async_iterators(*generators)
generated_texts = [''] * len(requests)
async for i, res in all_gens:
generated_texts[i] = res.outputs[0].text
lat = time.perf_counter() - start_time[i]
latencies[i].append(lat)
ret = [{
'generated': gt,
'expected': req.completion
} for gt, req in zip(generated_texts, requests)]
end = time.perf_counter()
first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
for lat in latencies])
return end - start, ret, (first_latency, next_latency)
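# Worked example of the latency bookkeeping above (hypothetical timings):
# if a request's streamed results arrive 0.2s, 0.3s, 0.4s and 0.5s after it
# was submitted, then latencies[i] == [0.2, 0.3, 0.4, 0.5], so its
# first-token latency = 0.2 * 1000 = 200 ms and its mean next-token latency
# = (0.5 - 0.2) / 3 * 1000 = 100 ms, matching the two pd.Series above.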
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join(dir_path,
"structured_schemas",
"structured_schema_1.json")
with open(args.json_schema_path) as f:
schema = json.load(f)
prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "grammar":
schema = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "regex":
regex = r"\w+@\w+\.com\n"
args.regex = regex
prompt = "Generate an email address for Alan Turing, \
who works in Enigma. End in .com and new line. \
Example result: alan.turing@enigma.com\n"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=regex,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "choice":
choice = ["Positive", "Negative"]
args.choice = choice
prompt = "Classify this sentiment: vLLM is wonderful!"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=choice,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "xgrammar_bench":
args.warmup = False
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts):
idx = data_point_idx
while idx >= len_dataset:
idx -= len_dataset
schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
tokenize=False)
input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx]
requests.append(
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
completion=completion))
return requests
def evaluate(ret, args):
def _eval_correctness_json(expected, actual):
# extract json string from string using regex
import re
actual = actual.replace('\n', '').replace(' ', '').strip()
try:
actual = re.search(r'\{.*\}', actual).group()
actual = json.loads(actual)
except Exception:
return False
return True
def _eval_correctness_choice(expected, actual):
return actual in args.choice
def _eval_correctness_regex(expected, actual):
import re
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == 'json':
return _eval_correctness_json(expected, actual)
elif args.structure_type == 'regex':
return _eval_correctness_regex(expected, actual)
elif args.structure_type == 'choice':
return _eval_correctness_choice(expected, actual)
else:
return None
scores = []
for res in ret:
score = _eval_correctness(res['expected'], res['generated'])
res['correctness'] = score
scores.append(score)
not_none_scores = [score for score in scores if score is not None]
return (sum(not_none_scores) / len(not_none_scores) *
100) if len(not_none_scores) > 0 else None
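# A minimal sketch of the evaluation contract (hypothetical outputs), assuming
# args.structure_type == 'json':
#
#     ret = [{"generated": '{"name": "Ada"}', "expected": '{"name": "Ada"}'},
#            {"generated": "not json at all", "expected": '{"name": "Ada"}'}]
#     evaluate(ret, args)  # -> 50.0, and each entry gains a 'correctness' key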
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# The async engine path below only supports 'json' guided decoding,
# so fall back to the synchronous engine for 'grammar', 'regex' and 'choice'.
if args.dataset == 'grammar':
args.structure_type = 'grammar'
args.async_engine = False
elif args.dataset == 'regex':
args.structure_type = 'regex'
args.async_engine = False
elif args.dataset == 'choice':
args.structure_type = 'choice'
args.async_engine = False
else:
args.structure_type = 'json'
if args.no_guided_decoding:
args.guided_decoding_ratio = 0
if args.save_results:
result_file_name = f'{args.guided_decoding_ratio}guided'
result_file_name += f"_{args.model.split('/')[-1]}"
result_file_name += f"_{args.dataset}"
result_file_name += f"_{args.num_prompts}"
result_file_name += f"_out{args.output_len}"
result_file_name += f"_async{args.async_engine}"
result_file_name += f"_warmup{args.warmup}"
result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
result_file_name += ".txt"
else:
result_file_name = None
# Synthesize a prompt with the given input length.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
requests = sample_requests(tokenizer, args)
if args.async_engine:
engine_args = AsyncEngineArgs.from_cli_args(args)
elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
run_vllm_async(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup,
args.disable_frontend_multiprocessing))
else:
engine_args = EngineArgs.from_cli_args(args)
elapsed_time, ret = run_vllm(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup)
first_latency, next_latency = None, None
score = evaluate(ret, args)
total_num_tokens = sum(request.prompt_len + request.expected_output_len
for request in requests)
total_output_tokens = sum(request.expected_output_len
for request in requests)
if first_latency is not None:
latency_breakdown = "\nFirst token latency(msecs):\n"
latency_breakdown += f"{first_latency.describe()}"
latency_breakdown += "\nNext token latency(msecs):\n"
latency_breakdown += f"{next_latency.describe()}"
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
f"Correct rate is {score} %",
f"{latency_breakdown if first_latency is not None else ''}")
# Output JSON results if specified
if args.output_json or result_file_name:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"total_output_tokens": total_output_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
"output_tokens_per_second":
f"{total_output_tokens / elapsed_time:.2f}",
"correct_rate(%)": score
}
results = {"outputs": ret, **results}
if first_latency is not None:
results["first_token_latency(msecs)"] = first_latency.describe(
).to_dict()
results["next_token_latency(msecs)"] = next_latency.describe(
).to_dict()
if args.output_json:
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
elif result_file_name:
with open(result_file_name, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
parser = AsyncEngineArgs.add_cli_args(parser)
parser.add_argument("--output-len",
type=int,
default=512,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument(
"--dataset",
default='json',
choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
parser.add_argument("--json_schema_path",
type=str,
default=None,
help="Path to json schema.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=10,
help="Number of prompts to process.")
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--no-guided-decoding",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--guided-decoding-ratio",
type=float,
default=1.0,
help="Ratio of Guided Decoding requests")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument("--warmup",
action="store_true",
default=False,
help="Run warmup prompts before benchmark.")
parser.add_argument("--save-results",
action="store_true",
default=False,
help="save output results.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
main(args)

View File

@ -13,6 +13,7 @@ from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser
@ -40,6 +41,20 @@ def main(args: argparse.Namespace):
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,
BeamSearchParams(
beam_width=args.n,
max_tokens=args.output_len,
ignore_eos=True,
))
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
@ -49,15 +64,11 @@ def main(args: argparse.Namespace):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else:
start_time = time.perf_counter()
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
llm_generate()
end_time = time.perf_counter()
latency = end_time - start_time
return latency

View File

@ -0,0 +1,183 @@
"""
Offline benchmark to test the long document QA throughput.
Example usage:
# This workload samples 8 different prompts with a default input
# length of 20000 tokens, then replicates each prompt 2 times
# in random order.
python benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--repeat-count 2
Commandline arguments:
--num-documents: The number of documents to sample prompts from.
--document-length: The length of each document in tokens.
(Optional, default: 20000)
--output-len: The number of tokens to generate for each prompt.
(Optional, default: 10)
--repeat-count: The number of times to repeat each prompt.
(Optional, default: 2)
--repeat-mode: The mode to repeat prompts. The supported modes are:
- 'random': shuffle the prompts randomly. (Default)
- 'tile': the entire prompt list is repeated in sequence. (Potentially
lowest cache hit)
- 'interleave': each prompt is repeated consecutively before
moving to the next element. (Highest cache hit)
--shuffle-seed: Random seed when the repeat mode is "random".
(Optional, default: 0)
In the meantime, it also supports all the vLLM engine args to initialize the
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
details.
"""
import dataclasses
import random
import time
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
"""
Test long document QA with the given prompts and sampling parameters.
Print the time spent in processing all the prompts.
Args:
llm: The language model used for generating responses.
sampling_params: Sampling parameter used to generate the response.
prompts: A list of prompt strings to be processed by the LLM.
"""
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
end_time = time.time()
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
def repeat_prompts(prompts, repeat_count, mode: str):
"""
Repeat each prompt in the list for a specified number of times.
The order of prompts in the output list depends on the mode.
Args:
prompts: A list of prompts to be repeated.
repeat_count: The number of times each prompt is repeated.
mode: The mode of repetition. Supported modes are:
- 'random': Shuffle the prompts randomly after repetition.
- 'tile': Repeat the entire prompt list in sequence.
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
- 'interleave': Repeat each prompt consecutively before moving to
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
Returns:
A list of repeated prompts in the specified order.
Raises:
ValueError: If an invalid mode is provided.
"""
print("Repeat mode: ", mode)
if mode == 'random':
repeated_prompts = prompts * repeat_count
random.shuffle(repeated_prompts)
return repeated_prompts
elif mode == 'tile':
return prompts * repeat_count
elif mode == 'interleave':
repeated_prompts = []
for prompt in prompts:
repeated_prompts.extend([prompt] * repeat_count)
return repeated_prompts
else:
raise ValueError(f"Invalid mode: {mode}, only support "
"'random', 'tile', 'interleave'")
def main(args):
random.seed(args.shuffle_seed)
# Prepare the prompts:
# prepend the document id so that no document is a prefix of another
prompts = [
str(i) + ' '.join(['hi'] * args.document_length)
for i in range(args.num_documents)
]
prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
warmup_prompts = [
"This is warm up request " + str(i) + \
' '.join(['hi'] * args.document_length)
for i in range(args.num_documents)]
# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
test_long_document_qa(
llm=llm,
prompts=warmup_prompts,
sampling_params=sampling_params,
)
print("------start generating------")
test_long_document_qa(
llm=llm,
prompts=prompts,
sampling_params=sampling_params,
)
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description=
'Benchmark the performance with or without automatic prefix caching.')
parser.add_argument(
'--document-length',
type=int,
# Roughly the number of tokens in a systems paper,
# excluding images
default=20000,
help='Length of each document in tokens.')
parser.add_argument('--num-documents',
type=int,
default=8,
help='Number of documents to sample prompts from.')
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--repeat-count',
type=int,
default=2,
help='Number of times to repeat each prompt')
parser.add_argument("--repeat-mode",
type=str,
default='random',
help='The mode to repeat prompts. The supported '
'modes are "random", "tile", and "interleave". '
'See repeat_prompts() in the source code for details.')
parser.add_argument("--shuffle-seed",
type=int,
default=0,
help='Random seed when the repeat mode is "random"')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)

View File

@ -10,7 +10,8 @@ Fixed example usage:
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100
--repeat-count 100 \
--input-length-range 128:256
ShareGPT example usage:
# This command samples 20 prompts with input lengths
@ -54,13 +55,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
print(f"cost time {end_time - start_time}")
def sample_requests(
@dataclasses.dataclass
class Request:
prompt: str
prompt_len: int
output_len: int
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> List[int]:
vocab = tokenizer.get_vocab()
# Remove the special tokens (compare token ids, not the token strings).
vocab = {
k: v
for k, v in vocab.items() if v not in tokenizer.all_special_ids
}
return random.choices(list(vocab.values()), k=length)
def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
) -> List[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -77,31 +95,55 @@ def sample_requests(
random.shuffle(dataset)
min_len, max_len = input_length_range
assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
filtered_requests: List[Request] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
if len(filtered_requests) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
prompt_token_ids = tokenizer(dataset[i][0]).input_ids
prompt = tokenizer.decode(prompt_token_ids)
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
output_len = (len(completion_token_ids)
if fixed_output_len is None else fixed_output_len)
if min_len <= prompt_len <= max_len:
filtered_dataset.append((prompt, prompt_len, output_len))
filtered_requests.append(Request(prompt, prompt_len, output_len))
return filtered_dataset
return filtered_requests
def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
prefix_len: int,
) -> List[Request]:
requests = []
prefix_token_ids = sample_tokens(tokenizer, prefix_len)
min_len, max_len = input_length_range
for i in range(num_requests):
unique_part_token_ids = sample_tokens(
tokenizer,
random.randint(min_len - prefix_len, max_len - prefix_len))
prompt_token_ids = prefix_token_ids + unique_part_token_ids
prompt = tokenizer.decode(prompt_token_ids)
prompt_len = len(prompt_token_ids)
assert (min_len <= prompt_len <= max_len
), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
requests.append(Request(prompt, prompt_len, fixed_output_len))
return requests
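# Sketch of the shared-prefix layout produced above (hypothetical sizes):
# with prefix_len=32 and input_length_range=(128, 256), every prompt is
#   [32 shared prefix tokens] + [96..224 unique tokens]
# so automatic prefix caching can reuse the KV cache of the common prefix
# across all sampled requests.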
def repeat_and_sort_requests(requests: List[Request],
repeat_count: int,
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
@ -109,7 +151,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
return [req[0] for req in repeated_requests]
return [req.prompt for req in repeated_requests]
def main(args):
@ -117,9 +159,12 @@ def main(args):
input_length_range = tuple(map(int, args.input_length_range.split(':')))
random.seed(args.seed)
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts"
if args.prefix_len > 0:
raise ValueError("prefix-len is not supported when "
"dataset-path is provided.")
print(f"Start to sample {args.num_prompts} prompts "
f"from {args.dataset_path}")
filtered_datasets = sample_requests(
filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
@ -127,9 +172,22 @@ def main(args):
fixed_output_len=args.output_len,
)
else:
prompt_len = len(tokenizer(PROMPT).input_ids)
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts
print(f"Start to sample {args.num_prompts} prompts from random")
filtered_requests = sample_requests_from_random(
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
prefix_len=args.prefix_len,
)
# Print some helpful stats of the requests.
print(f"Sampled {len(filtered_requests)} requests.")
prompt_lens = [req.prompt_len for req in filtered_requests]
print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
print(f"Min Prompt Length: {min(prompt_lens)}")
print(f"Max Prompt Length: {max(prompt_lens)}")
engine_args = EngineArgs.from_cli_args(args)
@ -137,8 +195,8 @@ def main(args):
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("Testing filtered datasets")
prompts = repeat_and_sort_requests(filtered_datasets,
print("Testing filtered requests")
prompts = repeat_and_sort_requests(filtered_requests,
repeat_count=args.repeat_count,
sort=args.sort)
@ -161,20 +219,29 @@ if __name__ == "__main__":
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--num-prompts',
type=int,
default=1,
required=True,
help="Number of the prompts sampled from dataset")
parser.add_argument('--repeat-count',
type=int,
default=100,
default=1,
help='Number of times to repeat each prompt')
parser.add_argument('--sort',
action='store_true',
help='Sort prompts by input length')
parser.add_argument('--input-length-range',
type=str,
default='128:256',
required=True,
help='Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").')
parser.add_argument(
"--prefix-len",
type=int,
default=0,
help="Specifies the length of a common prefix to be "
"added to the input prompt. The input-length-range will "
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided.",
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

View File

@ -25,6 +25,7 @@ On the client side, run:
import argparse
import asyncio
import base64
import gc
import io
import json
import os
@ -199,15 +200,72 @@ def sample_sonnet_requests(
return sampled_requests
def sample_vision_arena_requests(
dataset,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in dataset:
if len(sampled_requests) == num_requests:
break
prompt = data["turns"][0][0]['content']
prompt_token_ids = tokenizer(prompt).input_ids
if fixed_output_len is None:
# Default max output len is set to 128
print("--hf-output-len is not provided. Using default value 128.")
fixed_output_len = 128
prompt_len = len(prompt_token_ids)
output_len = fixed_output_len
assert isinstance(
data["images"][0],
Image), ("Input image format must be `PIL.Image.Image`, "
f"given {type(data['image'])}.")
image: Image = data["images"][0]
image = image.convert("RGB")
image_data = io.BytesIO()
image.save(image_data, format='JPEG')
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
mm_content = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
return sampled_requests
def sample_hf_requests(
dataset_path: str,
dataset_subset: str,
dataset_subset: Optional[str],
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
# Special case for vision_arena dataset
if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
and dataset_subset is None:
assert dataset_split == "train"
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
streaming=True)
dataset = dataset.shuffle(seed=random_seed)
return sample_vision_arena_requests(dataset, num_requests, tokenizer,
fixed_output_len)
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
@ -251,6 +309,19 @@ def sample_hf_requests(
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
elif "image" in data and isinstance(data["image"], str):
if (data["image"].startswith("http://") or \
data["image"].startswith("file://")):
image_url = data["image"]
else:
image_url = f"file://{data['image']}"
mm_content = {
"type": "image_url",
"image_url": {
"url": image_url
},
}
else:
mm_content = None
@ -345,7 +416,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
gootput_config_dict: Dict[str, float],
goodput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
@ -358,19 +429,23 @@ def calculate_metrics(
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
output_len = outputs[i].output_tokens
if output_len is None:
# We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be
# bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
tpot = 0
if output_len > 1:
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
1)
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
tpot = latency_minus_ttft / (output_len - 1)
tpots.append(tpot)
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots.append(tpot)
@ -381,21 +456,21 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
if gootput_config_dict:
if goodput_config_dict:
valid_metrics = []
slo_values = []
if "ttft" in gootput_config_dict:
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(gootput_config_dict["ttft"] /
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in gootput_config_dict:
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(gootput_config_dict["tpot"] /
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in gootput_config_dict:
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(gootput_config_dict["e2el"] /
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
@ -447,6 +522,7 @@ async def benchmark(
api_url: str,
base_url: str,
model_id: str,
model_name: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
@ -458,7 +534,7 @@ async def benchmark(
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
gootput_config_dict: Dict[str, float],
goodput_config_dict: Dict[str, float],
max_concurrency: Optional[int],
):
if backend in ASYNC_REQUEST_FUNCS:
@ -475,6 +551,7 @@ async def benchmark(
"Multi-modal content is only supported on 'openai-chat' backend.")
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
@ -495,6 +572,7 @@ async def benchmark(
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
@ -538,6 +616,7 @@ async def benchmark(
async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
@ -579,7 +658,7 @@ async def benchmark(
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
gootput_config_dict=gootput_config_dict,
goodput_config_dict=goodput_config_dict,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -591,7 +670,7 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
if gootput_config_dict:
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
@ -606,7 +685,7 @@ async def benchmark(
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"request_goodput:":
metrics.request_goodput if gootput_config_dict else None,
metrics.request_goodput if goodput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@ -662,11 +741,11 @@ async def benchmark(
def check_goodput_args(args):
# Check and parse goodput arguments
gootput_config_dict = {}
goodput_config_dict = {}
VALID_NAMES = ["ttft", "tpot", "e2el"]
if args.goodput:
gootput_config_dict = parse_goodput(args.goodput)
for slo_name, slo_val in gootput_config_dict.items():
goodput_config_dict = parse_goodput(args.goodput)
for slo_name, slo_val in goodput_config_dict.items():
if slo_name not in VALID_NAMES:
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
@ -677,22 +756,22 @@ def check_goodput_args(args):
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
return gootput_config_dict
return goodput_config_dict
def parse_goodput(slo_pairs):
gootput_config_dict = {}
goodput_config_dict = {}
try:
for slo_pair in slo_pairs:
slo_name, slo_val = slo_pair.split(":")
gootput_config_dict[slo_name] = float(slo_val)
goodput_config_dict[slo_name] = float(slo_val)
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
return gootput_config_dict
return goodput_config_dict
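# Minimal sketch of how --goodput flows through these helpers (hypothetical
# SLO values):
#   args.goodput = ["ttft:500", "tpot:50"]
#   parse_goodput(args.goodput)  -> {"ttft": 500.0, "tpot": 50.0}
#   check_goodput_args(args)     -> same dict, after validating the metric
#                                   names against ["ttft", "tpot", "e2el"] and
#                                   rejecting negative values.
# calculate_metrics() later divides each SLO by
# MILLISECONDS_TO_SECONDS_CONVERSION before comparing it with the measured
# per-request latencies.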
def main(args: argparse.Namespace):
@ -702,7 +781,9 @@ def main(args: argparse.Namespace):
backend = args.backend
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@ -712,6 +793,7 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
if args.dataset is not None:
@ -789,7 +871,11 @@ def main(args: argparse.Namespace):
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
gootput_config_dict = check_goodput_args(args)
goodput_config_dict = check_goodput_args(args)
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
benchmark_result = asyncio.run(
benchmark(
@ -797,6 +883,7 @@ def main(args: argparse.Namespace):
api_url=api_url,
base_url=base_url,
model_id=model_id,
model_name=model_name,
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
@ -810,7 +897,7 @@ def main(args: argparse.Namespace):
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
gootput_config_dict=gootput_config_dict,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
))
@ -839,8 +926,8 @@ def main(args: argparse.Namespace):
)
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@ -1132,5 +1219,22 @@ if __name__ == "__main__":
"from the sampled HF dataset.",
)
parser.add_argument(
'--tokenizer-mode',
type=str,
default="auto",
choices=['auto', 'slow', 'mistral'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer.')
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,881 @@
r"""Benchmark online serving throughput with guided decoding.
On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
--model <your_model> \
--dataset json \
--guided-decoding-ratio 1.0 \
--guided-decoding-backend xgrammar \
--request-rate 10 \
--num-prompts 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
"""
import argparse
import asyncio
import dataclasses
import json
import os
import random
import time
import warnings
from dataclasses import dataclass
from typing import AsyncGenerator, List, Optional, Tuple
import datasets
import numpy as np
import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@dataclass
class BenchmarkMetrics:
completed: int
total_input: int
total_output: int
request_throughput: float
request_goodput: float
output_throughput: float
total_token_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
"""
prompt: str
prompt_len: int
expected_output_len: int
schema: dict
structure_type: str
completion: str = None
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join(dir_path,
"structured_schemas",
"structured_schema_1.json")
with open(args.json_schema_path) as f:
schema = json.load(f)
prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "grammar":
schema = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "regex":
regex = r"\w+@\w+\.com\n"
args.regex = regex
prompt = "Generate an email address for Alan Turing, \
who works in Enigma. End in .com and new line. \
Example result: alan.turing@enigma.com\n"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=regex,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "choice":
choice = ["Positive", "Negative"]
args.choice = choice
prompt = "Classify this sentiment: vLLM is wonderful!"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=choice,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "xgrammar_bench":
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts):
idx = data_point_idx
while idx >= len_dataset:
idx -= len_dataset
schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
tokenize=False)
input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx]
requests.append(
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type,
completion=completion))
return requests
async def get_request(
input_requests: List[SampleRequest],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
Args:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}.")
theta = 1.0 / (request_rate * burstiness)
for i, request in enumerate(input_requests):
yield i, request
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution.
interval = np.random.gamma(shape=burstiness, scale=theta)
# The next request will be sent after the interval.
await asyncio.sleep(interval)
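# Worked example of the pacing math above (hypothetical settings): with
# request_rate=10 req/s and burstiness=1.0, theta = 1 / (10 * 1.0) = 0.1 and
# np.random.gamma(shape=1.0, scale=0.1) is an exponential distribution with
# mean 0.1 s, i.e. Poisson arrivals at 10 req/s. With burstiness=0.5,
# theta = 0.2 and gamma(0.5, 0.2) keeps the 0.1 s mean interval but has a
# larger variance, producing burstier arrivals.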
def calculate_metrics(
input_requests: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
good_completed = 0
itls: List[float] = []
tpots: List[float] = []
all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len
tpot = 0
if output_len > 1:
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
1)
tpots.append(tpot)
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots.append(tpot)
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
completed += 1
else:
actual_output_lens.append(0)
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
)
return metrics, actual_output_lens
async def benchmark(
backend: str,
api_url: str,
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[SampleRequest],
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
max_concurrency: Optional[int],
guided_decoding_ratio: float,
guided_decoding_backend: str,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
else:
raise ValueError(f"Unknown backend: {backend}")
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
extra_body[request.structure_type] = request.schema
# Add the specific guided_decoding_backend
extra_body["guided_decoding_backend"] = guided_decoding_backend
return extra_body
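# For a 'json' dataset request this yields something like (schema contents
# are illustrative):
#     {"guided_json": {"type": "object", "properties": {...}},
#      "guided_decoding_backend": "xgrammar"}
# because main() maps --dataset json to structure_type 'guided_json' and the
# default --guided-decoding-backend is xgrammar.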
print("Starting initial single prompt test run...")
guided_decoding_req_idx = random.sample(
range(len(input_requests)),
int(len(input_requests) * guided_decoding_ratio))
test_request = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=api_url,
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
ignore_eos=ignore_eos,
extra_body=prepare_extra_body(test_request),
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/start_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
ignore_eos=ignore_eos,
extra_body=prepare_extra_body(test_request),
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
if burstiness == 1.0:
distribution = "Poisson process"
else:
distribution = "Gamma distribution"
print(f"Traffic request rate: {request_rate}")
print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input,
pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input,
pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
expected: List[str] = []
async for i, request in get_request(input_requests, request_rate,
burstiness):
extra_body = prepare_extra_body(
request) if i in guided_decoding_req_idx else None
request_func_input = RequestFuncInput(
model=model_id,
prompt=request.prompt,
api_url=api_url,
prompt_len=request.prompt_len,
output_len=request.expected_output_len,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
expected.append(request.completion)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
extra_body={test_request.structure_type: test_request.schema},
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:",
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
result = {
"duration":
benchmark_duration,
"completed":
metrics.completed,
"total_input_tokens":
metrics.total_input,
"total_output_tokens":
metrics.total_output,
"request_throughput":
metrics.request_throughput,
"output_throughput":
metrics.output_throughput,
"total_token_throughput":
metrics.total_token_throughput,
"ttft_description":
pd.Series([output.ttft for output in outputs]).describe().to_dict(),
"tpot_description":
pd.Series([output.tpot for output in outputs]).describe().to_dict(),
"input_lens": [output.prompt_len for output in outputs],
"output_lens":
actual_output_lens,
"ttfts": [output.ttft for output in outputs],
"itls": [output.itl for output in outputs],
"errors": [output.error for output in outputs],
}
ret = [{
'generated': output.generated_text,
'expected': gt
} for output, gt in zip(outputs, expected)]
def process_one_metric(
# E.g., "ttft"
metric_attribute_name: str,
# E.g., "TTFT"
metric_name: str,
# E.g., "Time to First Token"
metric_header: str,
):
# This function prints and adds statistics of the specified
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
print("=" * 50)
return result, ret
def evaluate(ret, args):
def _eval_correctness_json(expected, actual):
# extract json string from string using regex
import re
actual = actual.replace('\n', '').replace(' ', '').strip()
try:
actual = re.search(r'\{.*\}', actual).group()
actual = json.loads(actual)
except Exception:
return False
return True
def _eval_correctness_choice(expected, actual):
return actual in args.choice
def _eval_correctness_regex(expected, actual):
import re
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == 'guided_json':
return _eval_correctness_json(expected, actual)
elif args.structure_type == 'guided_regex':
return _eval_correctness_regex(expected, actual)
elif args.structure_type == 'guided_choice':
return _eval_correctness_choice(expected, actual)
else:
return None
scores = []
for res in ret:
score = _eval_correctness(res['expected'], res['generated'])
res['correctness'] = score
scores.append(score)
not_none_scores = [score for score in scores if score is not None]
return (sum(not_none_scores) / len(not_none_scores) *
100) if len(not_none_scores) > 0 else None
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
np.random.seed(args.seed)
backend = args.backend
model_id = args.model
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}"
else:
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
if args.dataset == 'grammar':
args.structure_type = 'guided_grammar'
elif args.dataset == 'regex':
args.structure_type = 'guided_regex'
elif args.dataset == 'choice':
args.structure_type = 'guided_choice'
else:
args.structure_type = 'guided_json'
if args.no_guided_decoding:
args.guided_decoding_ratio = 0
if args.save_results:
result_file_name = f'{args.guided_decoding_ratio}guided'
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
result_file_name += f"_{args.dataset}"
result_file_name += f"_{args.num_prompts}"
result_file_name += f"_out{args.output_len}"
result_file_name += ".txt"
else:
result_file_name = None
input_requests = sample_requests(tokenizer, args)
benchmark_result, ret = asyncio.run(
benchmark(
backend=backend,
api_url=api_url,
base_url=base_url,
model_id=model_id,
tokenizer=tokenizer,
input_requests=input_requests,
request_rate=args.request_rate,
burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
max_concurrency=args.max_concurrency,
guided_decoding_ratio=args.guided_decoding_ratio,
guided_decoding_backend=args.guided_decoding_backend,
))
# Save config and results to json
score = evaluate(ret, args)
print("correct_rate(%)", score, '\n')
if args.save_results:
results = {
"backend":
backend,
"model_id":
model_id,
"tokenizer_id":
tokenizer_id,
"num_prompts":
args.num_prompts,
"request_rate":
args.request_rate if args.request_rate < float("inf") else "inf",
"burstiness":
args.burstiness,
"max_concurrency":
args.max_concurrency,
"correct_rate(%)":
score
}
results = {"outputs": ret, **results, **benchmark_result}
# Save to file
if args.result_filename:
result_file_name = args.result_filename
if args.result_dir:
result_file_name = os.path.join(args.result_dir, result_file_name)
with open(result_file_name, "w", encoding='utf-8') as outfile:
json.dump(results, outfile, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
type=str,
default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
parser.add_argument(
"--base-url",
type=str,
default=None,
help="Server or API base url if not using http host and port.",
)
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--endpoint",
type=str,
default="/v1/completions",
help="API endpoint.",
)
parser.add_argument(
"--dataset",
default='json',
choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
parser.add_argument("--json_schema_path",
type=str,
default=None,
help="Path to json schema.")
parser.add_argument(
"--max-concurrency",
type=int,
default=None,
help="Maximum number of concurrent requests. This can be used "
"to help simulate an environment where a higher level component "
"is enforcing a maximum number of concurrent requests. While the "
"--request-rate argument controls the rate at which requests are "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.")
parser.add_argument(
"--model",
type=str,
required=True,
help="Name of the model.",
)
parser.add_argument(
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.",
)
parser.add_argument(
"--output-len",
type=int,
default=128,
help="Number of output tokens.",
)
parser.add_argument(
"--request-rate",
type=float,
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process or gamma distribution "
"to synthesize the request arrival times.",
)
parser.add_argument(
"--burstiness",
type=float,
default=1.0,
help="Burstiness factor of the request generation. "
"Only take effect when request_rate is not inf. "
"Default value is 1, which follows Poisson process. "
"Otherwise, the request intervals follow a gamma distribution. "
"A lower burstiness value (0 < burstiness < 1) results in more "
"bursty requests. A higher burstiness value (burstiness > 1) "
"results in a more uniform arrival of requests.",
)
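# For a given burstiness b and request rate r, inter-arrival gaps can be drawn
# from a gamma distribution with shape b and scale 1 / (r * b), e.g.
# (illustrative sketch, not this benchmark's own code):
#     gap = random.gammavariate(burstiness, 1.0 / (request_rate * burstiness))
# The mean gap stays 1 / r; b = 1 reduces to exponential gaps (Poisson
# arrivals), while b < 1 clusters requests into bursts.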
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Trust remote code from huggingface",
)
parser.add_argument(
"--disable-tqdm",
action="store_true",
help="Specify to disable tqdm progress bar.",
)
parser.add_argument(
"--save-results",
action="store_true",
help="Specify to save benchmark results to a json file",
)
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--result-dir",
type=str,
default=None,
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
parser.add_argument(
"--result-filename",
type=str,
default=None,
help="Specify the filename to save benchmark json results."
"If not specified, results will be saved in "
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
parser.add_argument(
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
parser.add_argument("--no-guided-decoding",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--guided-decoding-ratio",
type=float,
default=1.0,
help="Ratio of Guided Decoding requests")
parser.add_argument("--guided-decoding-backend",
type=str,
choices=["outlines", "lm-format-enforcer", "xgrammar"],
default="xgrammar",
help="Backend to use for guided decoding")
args = parser.parse_args()
main(args)

View File

@ -4,7 +4,8 @@ import dataclasses
import json
import random
import time
from typing import List, Optional
from functools import cache
from typing import Dict, List, Optional, Tuple
import torch
import uvloop
@ -17,8 +18,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@ -28,15 +32,17 @@ class SampleRequest:
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt: str
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[MultiModalDataDict] = None
lora_request: Optional[LoRARequest] = None
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
@ -60,8 +66,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise ValueError(f"Unsupported model {model}")
@cache
def lora_path_on_disk(lora_path: str) -> str:
return get_adapter_absolute_path(lora_path)
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
def get_random_lora_request(
args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
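# Pick a random adapter id in [1, max_loras]; every id resolves to the same
# on-disk adapter path (args.lora_path), and its tokenizer is cached so each
# id's tokenizer is only loaded once across sampled requests.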
global lora_tokenizer_cache
lora_id = random.randint(1, args.max_loras)
lora_request = LoRARequest(lora_name=str(lora_id),
lora_int_id=lora_id,
lora_path=lora_path_on_disk(args.lora_path))
if lora_id not in lora_tokenizer_cache:
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
return lora_request, lora_tokenizer_cache[lora_id]
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
dataset_path: str = args.dataset
num_requests: int = args.num_prompts
fixed_output_len: Optional[int] = args.output_len
@ -79,7 +107,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short
filtered_dataset: List[SampleRequest] = []
for data in dataset:
for data in tqdm(dataset,
total=num_requests,
desc="sampling requests"):
if len(filtered_dataset) == num_requests:
break
@ -102,9 +132,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue
prompt = _get_prompt_for_image_model(question=prompt, model=model)
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Tokenize the prompts and completions.
prompt_token_ids = tokenizer(prompt).input_ids
completion_token_ids = tokenizer(completion).input_ids
prompt_token_ids = request_tokenizer(prompt).input_ids
completion_token_ids = request_tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
@ -118,7 +155,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest(prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=multi_modal_data))
multi_modal_data=multi_modal_data,
lora_request=lora_request))
return filtered_dataset
@ -146,14 +184,21 @@ def run_vllm(
ignore_eos=True,
max_tokens=request.expected_output_len,
))
lora_requests: Optional[List[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
use_beam_search = False
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0].expected_output_len
@ -185,6 +230,7 @@ async def run_vllm_async(
# Add the requests to the engine.
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
lora_requests: List[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TextPrompt(prompt=request.prompt,
@ -197,11 +243,16 @@ async def run_vllm_async(
ignore_eos=True,
max_tokens=request.expected_output_len,
))
lora_requests.append(request.lora_request)
generators = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
@ -294,23 +345,45 @@ def main(args: argparse.Namespace):
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for i in range(-10, 10):
prompt = "hi " * (args.input_len + i)
tokenized_prompt = tokenizer(prompt).input_ids
if len(tokenized_prompt) == args.input_len:
break
else:
raise ValueError(
f"Failed to synthesize a prompt with {args.input_len} tokens.")
requests = [
SampleRequest(prompt=prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len)
for _ in range(args.num_prompts)
]
vocab_size = tokenizer.vocab_size
requests = []
for _ in range(args.num_prompts):
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Synthesize a prompt with the given input length.
candidate_ids = [
random.randint(0, vocab_size - 1)
for _ in range(args.input_len)
]
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for _ in range(5): # Max attempts to correct
candidate_prompt = request_tokenizer.decode(candidate_ids)
tokenized_len = len(request_tokenizer.encode(candidate_prompt))
if tokenized_len == args.input_len:
break
# Adjust length based on difference
diff = args.input_len - tokenized_len
if diff > 0:
candidate_ids.extend([
random.randint(100, vocab_size - 100)
for _ in range(diff)
])
else:
candidate_ids = candidate_ids[:diff]
requests.append(
SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len,
lora_request=lora_request))
else:
requests = sample_requests(tokenizer, args)
@ -409,6 +482,14 @@ if __name__ == "__main__":
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
@ -418,6 +499,8 @@ if __name__ == "__main__":
assert args.output_len is not None
else:
assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
@ -427,6 +510,9 @@ if __name__ == "__main__":
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
@ -439,4 +525,7 @@ if __name__ == "__main__":
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
if args.enable_lora:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
main(args)

View File

@ -0,0 +1,384 @@
import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
**kwargs) -> TMeasurement:
min_run_time = 1
globals = {
"args": args,
"kwargs": kwargs,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(*args, **kwargs)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
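# blocked_autorange keeps collecting measurements until at least
# `min_run_time` seconds have been spent timing, which smooths per-call jitter.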
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
torch.bfloat16)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
if not torch.allclose(out, out_ref):
print("Incorrect results")
print(out)
print(out_ref)
else:
print("Correct results")
timers = []
# pytorch impl - bfloat16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16)))
# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
# cutlass impl
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass sparse impl
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.bfloat16))
# cutlass sparse with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.bfloat16, bias))
return timers
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn
b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
torch.bfloat16)
out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
if not torch.allclose(out, out_ref):
print("Incorrect results")
print(out)
print(out_ref)
else:
print("Correct results")
timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda")))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16,
use_fast_accum=True))
# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.bfloat16))
# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.float16))
# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label,
"cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.bfloat16, bias))
# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label,
"cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
scale_b, torch.float16, bias.to(dtype=torch.float16)))
return timers
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
n = len(dim_sizes)
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['int8', 'fp8']")
subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument("--dim-start", type=int, required=True)
range_parser.add_argument("--dim-end", type=int, required=True)
range_parser.add_argument("--dim-increment", type=int, required=True)
range_parser.add_argument("--m-constant", type=int, default=None)
range_parser.add_argument("--n-constant", type=int, default=None)
range_parser.add_argument("--k-constant", type=int, default=None)
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument("--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys())
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)

View File

@ -0,0 +1,96 @@
# Cutlass bench utils
from typing import Iterable, Tuple
import torch
import vllm._custom_ops as ops
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(dtype=torch.bfloat16)
def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(dtype=torch.float16)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
if dtype == torch.int8:
return to_int8(a), to_int8(b)
if dtype == torch.float8_e4m3fn:
return to_fp8(a), to_fp8(b)
raise ValueError("unsupported dtype")
def prune_to_2_4(tensor):
# Reshape tensor to [N, 4] where N is number of groups of 4
original_shape = tensor.shape
reshaped = tensor.reshape(-1, 4)
# Get indices of top 2 absolute values in each group of 4
_, indices = torch.topk(torch.abs(reshaped), k=2, dim=1)
# Create binary mask
mask = torch.zeros_like(reshaped)
mask.scatter_(dim=1,
index=indices,
src=torch.ones_like(indices, dtype=mask.dtype))
# Apply mask and reshape back
pruned = reshaped * mask
# Turn all -0.0 to 0.0
pruned[pruned == -0.0] = 0.0
return pruned.reshape(original_shape)
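# Sanity-check sketch (illustrative, not executed here): after pruning, every
# group of four consecutive values keeps at most two nonzeros, e.g.
#     x = torch.randn(8, 16)
#     assert ((prune_to_2_4(x).reshape(-1, 4) != 0).sum(dim=1) <= 2).all()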
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
b = prune_to_2_4(b.t()).t()
if dtype == torch.int8:
a, b = to_int8(a), to_int8(b)
elif dtype == torch.float8_e4m3fn:
a, b = to_fp8(a), to_fp8(b)
elif dtype == torch.float16:
a, b = to_fp16(a), to_fp16(b)
elif dtype == torch.bfloat16:
a, b = to_bf16(a), to_bf16(b)
else:
raise ValueError("unsupported dtype")
b_compressed, e = ops.cutlass_sparse_compress(b.t())
# Compressed B, Metadata, Original A, B
return b_compressed, e, a, b
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
m: int, n: int, k: int) -> \
Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor], Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = []
for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
if b_comp is not None:
# Reuse the tensors we just generated instead of creating a second set.
ABs.append((b_comp, e, a, b))
BComps, Es, As, Bs = zip(*ABs)
return list(BComps), list(Es), list(As), list(Bs)

View File

@ -8,6 +8,7 @@ from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
@ -17,31 +18,6 @@ DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
# helpers
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
if dtype == torch.int8:
return to_int8(a), to_int8(b)
if dtype == torch.float8_e4m3fn:
return to_fp8(a), to_fp8(b)
raise ValueError("unsupported dtype")
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
@ -386,4 +362,4 @@ Benchmark Cutlass GEMM.
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)
args.func(args)

View File

@ -40,4 +40,4 @@ WEIGHT_SHAPES = {
([8192, 57344], 1),
([28672, 8192], 0),
],
}
}

View File

@ -0,0 +1,145 @@
#!/bin/bash
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all requests to the prefill vLLM instance. It will buffer the KV cache.
# - then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead.
set -ex
kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
benchmark() {
export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# compare chunked prefill with disaggregated prefill
results_folder="./results"
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset_name="sonnet"
dataset_path="../sonnet_4x.txt"
num_prompts=10
qps=$1
prefix_len=50
input_len=2048
output_len=$2
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
# let the prefill instance finish prefill
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes
}
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
pip install quart httpx datasets
cd "$(dirname "$0")"
cd ..
# create sonnet_4x.txt
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
cd disagg_benchmarks
rm -rf results
mkdir results
default_qps=1
default_output_len=1
benchmark $default_qps $default_output_len
}
main "$@"

View File

@ -0,0 +1,163 @@
#!/bin/bash
# Requirement: 2x GPUs.
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
# Resource: 2x GPU
# Approaches:
# 1. Chunked prefill: 2 vLLM instances (tp=1, one per GPU), with requests round-robined between them
# 2. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
#    Prefilling instance: max_output_token=1
#    Decoding instance: force the input tokens to be the same across requests to bypass prefilling
set -ex
kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local port=$1
timeout 1200 bash -c "
until curl -s localhost:${port}/v1/completions > /dev/null; do
sleep 1
done" && return 0 || return 1
}
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
wait_for_server 8100
wait_for_server 8200
python3 round_robin_proxy.py &
sleep 1
}
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100
wait_for_server 8200
python3 disagg_prefill_proxy_server.py &
sleep 1
}
benchmark() {
results_folder="./results"
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset_name="sonnet"
dataset_path="../sonnet_4x.txt"
num_prompts=100
qps=$1
prefix_len=50
input_len=1024
output_len=$2
tag=$3
python3 ../benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"
sleep 2
}
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
(which lsof) || (apt-get -y install lsof)
pip install quart httpx matplotlib aiohttp datasets
cd "$(dirname "$0")"
cd ..
# create sonnet_4x.txt so that we can sample enough tokens for the input
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
cd disagg_benchmarks
rm -rf results
mkdir results
default_output_len=6
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
launch_chunked_prefill
for qps in 2 4 6 8; do
benchmark $qps $default_output_len chunked_prefill
done
kill_gpu_processes
launch_disagg_prefill
for qps in 2 4 6 8; do
benchmark $qps $default_output_len disagg_prefill
done
kill_gpu_processes
python3 visualize_benchmark_results.py
}
main "$@"

View File

@ -0,0 +1,61 @@
import os
import aiohttp
from quart import Quart, make_response, request
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
async def forward_request(url, data):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
async with session.post(url=url, json=data,
headers=headers) as response:
if response.status == 200:
# if response.headers.get('Transfer-Encoding') == 'chunked':
if True:
async for chunk_bytes in response.content.iter_chunked(
1024):
yield chunk_bytes
else:
content = await response.read()
yield content
@app.route('/v1/completions', methods=['POST'])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request['max_tokens'] = 1
# finish prefill
async for _ in forward_request('http://localhost:8100/v1/completions',
prefill_request):
continue
# return decode
generator = forward_request('http://localhost:8200/v1/completions',
original_request_data)
response = await make_response(generator)
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == '__main__':
app.run(port=8000)
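For reference, a hypothetical client call against this proxy (the model name and prompt are placeholders, and the `requests` package is assumed to be installed; it is not part of the scripts above). The proxy first replays the request against the prefill instance with max_tokens=1 and then streams the decode instance's output back:

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder
        "prompt": "San Francisco is a",
        "max_tokens": 64,
        "stream": True,
    },
    stream=True,
)
# Print the streamed completion chunks as they arrive.
for chunk in resp.iter_content(chunk_size=None):
    print(chunk.decode(), end="")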

View File

@ -0,0 +1,60 @@
import asyncio
import itertools
import aiohttp
from aiohttp import web
class RoundRobinProxy:
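# Alternates incoming requests between the target ports (8100 and 8200 in
# main()), so the two chunked-prefill instances appear behind the single
# port 8000 used by the serving benchmark.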
def __init__(self, target_ports):
self.target_ports = target_ports
self.port_cycle = itertools.cycle(self.target_ports)
async def handle_request(self, request):
target_port = next(self.port_cycle)
target_url = f"http://localhost:{target_port}{request.path_qs}"
async with aiohttp.ClientSession() as session:
try:
# Forward the request
async with session.request(
method=request.method,
url=target_url,
headers=request.headers,
data=request.content,
) as response:
# Start sending the response
resp = web.StreamResponse(status=response.status,
headers=response.headers)
await resp.prepare(request)
# Stream the response content
async for chunk in response.content.iter_any():
await resp.write(chunk)
await resp.write_eof()
return resp
except Exception as e:
return web.Response(text=f"Error: {str(e)}", status=500)
async def main():
proxy = RoundRobinProxy([8100, 8200])
app = web.Application()
app.router.add_route('*', '/{path:.*}', proxy.handle_request)
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, 'localhost', 8000)
await site.start()
print("Proxy server started on http://localhost:8000")
# Keep the server running
await asyncio.Event().wait()
if __name__ == '__main__':
asyncio.run(main())

View File

@ -0,0 +1,46 @@
import json
import matplotlib.pyplot as plt
import pandas as pd
if __name__ == "__main__":
data = []
for name in ['disagg_prefill', 'chunked_prefill']:
for qps in [2, 4, 6, 8]:
with open(f"results/{name}-qps-{qps}.json") as f:
x = json.load(f)
x['name'] = name
x['qps'] = qps
data.append(x)
df = pd.DataFrame.from_dict(data)
dis_df = df[df['name'] == 'disagg_prefill']
chu_df = df[df['name'] == 'chunked_prefill']
plt.style.use('bmh')
plt.rcParams['font.size'] = 20
for key in [
'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
'median_itl_ms', 'p99_itl_ms'
]:
fig, ax = plt.subplots(figsize=(11, 7))
plt.plot(dis_df['qps'],
dis_df[key],
label='disagg_prefill',
marker='o',
linewidth=4)
plt.plot(chu_df['qps'],
chu_df[key],
label='chunked_prefill',
marker='o',
linewidth=4)
ax.legend()
ax.set_xlabel('QPS')
ax.set_ylabel(key)
ax.set_ylim(bottom=0)
fig.savefig(f'results/{key}.png')
plt.close(fig)

View File

@ -0,0 +1,173 @@
import pickle as pkl
import time
from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.model_executor.layers.layernorm import RMSNorm
@dataclass
class bench_params_t:
num_tokens: int
hidden_size: int
add_residual: bool
dtype: torch.dtype
def description(self):
return (f'N {self.num_tokens} '
f'x D {self.hidden_size} '
f'x R {self.add_residual} '
f'x DT {self.dtype}')
def get_bench_params() -> List[bench_params_t]:
## Test Fixtures
NUM_TOKENS = [2**x for x in range(11)]
HIDDEN_SIZES = list(range(1024, 8193, 1024))  # 1024 .. 8192
ADD_RESIDUAL = [True, False]
DTYPES = [torch.bfloat16, torch.float]
combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
bench_params = [bench_params_t(*combo) for combo in combinations]
return bench_params
# Reference impls
def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
residual: Optional[torch.Tensor],
quant_dtype: torch.dtype):
# Norm
torch_out = None
if residual is None:
torch_out = rms_norm_layer.forward_cuda(x, residual)
else:
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
# Quant
torch_out, _, _ = ops.scaled_int8_quant(torch_out)
def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
residual: Optional[torch.Tensor],
quant_dtype: torch.dtype):
# Norm
torch_out = None
if residual is None:
torch_out = rms_norm_layer.forward_cuda(x, residual)
else:
torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
# Quant
torch_out, _ = ops.scaled_fp8_quant(torch_out)
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: Optional[torch.Tensor],
quant_dtype: torch.dtype):
out, _ = ops.rms_norm_dynamic_per_token_quant(x,
rms_norm_layer.weight,
1e-6,
quant_dtype,
residual=residual)
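# The fused path above does the RMSNorm and the dynamic per-token quantization
# in a single kernel call, while the unfused reference impls launch the norm
# (forward_cuda) and the quant (scaled_int8_quant / scaled_fp8_quant) as
# separate ops; the benchmark below measures the latency gap between the two.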
# Bench functions
def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
quant_dtype: torch.dtype, label: str, sub_label: str,
fn: Callable, description: str) -> TMeasurement:
min_run_time = 1
globals = {
"rms_norm_layer": rms_norm_layer,
"x": x,
"residual": residual,
"quant_dtype": quant_dtype,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def bench(params: bench_params_t, label: str, sub_label: str) \
-> Iterable[TMeasurement]:
# Make inputs
layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
# Make weights
layer.weight.data.normal_(mean=1.0, std=0.1)
# Make inputs
scale = 1 / params.hidden_size
x = torch.randn(params.num_tokens,
params.hidden_size,
dtype=params.dtype,
device='cuda') * scale
residual = (torch.randn_like(x) * scale).to(device='cuda') \
if params.add_residual else None
timers = []
# unfused int8 impl.
timers.append(
bench_fn(layer, x, residual, torch.int8, label, sub_label,
unfused_int8_impl, "unfused_int8_impl"))
# unfused fp8 impl.
timers.append(
bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
unfused_fp8_impl, "unfused_fp8_impl"))
# fused int8 impl.
timers.append(
bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
"fused_int8_impl"))
# fused fp8 impl.
timers.append(
bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
fused_impl, "fused_fp8_impl"))
print_timers(timers)
return timers
# launch bench
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def main():
torch.set_default_device('cuda')
bench_params = get_bench_params()
timers = []
for bp in tqdm(bench_params):
timers.extend(
bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
print_timers(timers)
# pickle all the results
timestamp = int(time.time())
with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f:
pkl.dump(timers, f)
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

View File

@ -2,8 +2,10 @@ import argparse
import copy
import itertools
import math
import os
import pickle as pkl
import time
from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
@ -15,11 +17,12 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales,
marlin_zero_points)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, pack_rows, quantize_weights)
pack_rows, quantize_weights)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
@ -27,149 +30,350 @@ DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
DEFAULT_TP_SIZES = [1]
NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
w_q = w_q.t().contiguous().t() # make col major
return ops.machete_prepack_B(w_q, wtype)
if NVTX_PROFILE:
import nvtx
def make_bench_tensors(
atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
k: int
) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
torch.tensor]]]:
def terse_type_name(dt):
return {
torch.bfloat16: "bf16",
torch.float16: "fp16",
torch.int8: "int8",
torch.float8_e4m3fn: "fp8",
torch.bfloat16: "bf16",
torch.float: "float",
torch.int: "int",
}[dt]
@dataclass
class BenchmarkTensors:
w_ref: torch.Tensor
a: torch.Tensor
w_q: torch.Tensor
group_size: Optional[int]
wtype: ScalarType
w_g_s: torch.Tensor
w_g_zp: Optional[torch.Tensor]
w_ch_s: Optional[torch.Tensor]
w_tok_s: Optional[torch.Tensor]
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
group_zero_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
def rand_data(shape, dtype=torch.float16, scale=1):
if dtype.is_floating_point:
return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
else:
return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
def quantize_and_pack(atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
zero_points: bool = False):
assert wtype.is_integer(), "TODO: support floating point weights"
w_ref, w_q, w_s, w_zp = quantize_weights(
w,
wtype,
group_size=group_size,
zero_points=zero_points,
# to match how the kernel applies zps
ref_zero_points_after_scales=True)
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
return w_ref, w_q, w_s, w_zp
def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> List[BenchmarkTensors]:
m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
# so we target total weight size > 2*50mb
num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
num_weights = math.ceil(2 * 50 * 1024**2 * 8 /
(k * n * types.weight_type.size_bits))
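# e.g. for hypothetical shapes k = n = 8192 with 4-bit weights, each packed
# matrix is ~32 MiB, so num_weights = ceil(100 MiB / 32 MiB) = 4 weight sets
# are cycled through.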
a = torch.randn((m, k), device="cuda", dtype=atype) * 5
weights = [
torch.randn((k, n), device="cuda", dtype=atype)
for _ in range(num_weights)
]
quanitized_weights = [
quantize_weights(w, wtype, group_size) for w in weights
]
a = rand_data((m, k), types.act_type, scale=5)
return a, quanitized_weights
benchmark_tensors: List[BenchmarkTensors] = []
for _ in range(num_weights):
w = rand_data((k, n), types.act_type, scale=5)
if types.group_scale_type is not None:
w = w.to(types.group_scale_type)
if w.dtype.itemsize == 1:
w = w.to(torch.float16)
w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
a.dtype, w, types.weight_type, types.group_scale_type, group_size,
types.group_zero_type is not None)
if not a.dtype.is_floating_point:
aiinfo = torch.iinfo(a.dtype)
w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
w_ref = w_ref.to(torch.float32)
w_ch_s = None if types.channel_scale_type is None else\
rand_data((n,), types.channel_scale_type)
w_tok_s = None if types.token_scale_type is None else\
rand_data((m,), types.token_scale_type)
benchmark_tensors.append(
BenchmarkTensors(w_ref=w_ref,
a=a,
w_q=w_q_packed,
wtype=types.weight_type,
w_g_s=w_s,
w_g_zp=w_zp,
group_size=group_size,
w_ch_s=w_ch_s,
w_tok_s=w_tok_s))
return benchmark_tensors
def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
a = bt.a
w = bt.w_ref.to(bt.a.dtype) # use float reference tensor
if a.dtype not in [torch.float16, torch.bfloat16]:
a = a.to(torch.float16)
w = w.to(torch.float16)
return lambda: torch.matmul(a, w)
def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
if bt.w_ch_s is not None and bt.w_tok_s is not None:
scale_a = bt.w_tok_s.to(torch.float32)
scale_b = bt.w_ch_s.to(torch.float32)
else:
scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
return lambda: ops.cutlass_scaled_mm(
bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16)
def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
device = bt.a.device
workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
if bt.w_g_zp is None:
w_zp = torch.empty(0, dtype=torch.int, device=device)
else:
w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0],
bt.w_ref.shape[1], bt.wtype.size_bits)
if bt.group_size is None:
w_s = torch.tensor([], device="cuda", dtype=torch.half)
else:
w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0],
bt.w_ref.shape[1], bt.group_size)
sort_indices = torch.empty(0, dtype=torch.int, device=device)
g_idx = torch.empty(0, dtype=torch.int, device=device)
w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0],
bt.w_ref.shape[1], bt.wtype.size_bits)
if bt.a.dtype.is_floating_point:
assert bt.w_ch_s is None
assert bt.w_tok_s is None
assert bt.group_size is not None
fn = lambda: ops.gptq_marlin_gemm(a=bt.a,
b_q_weight=w_q,
b_scales=w_s,
b_zeros=w_zp,
g_idx=g_idx,
perm=sort_indices,
workspace=workspace.scratch,
b_q_type=bt.wtype,
size_m=bt.a.shape[0],
size_n=bt.w_ref.shape[1],
size_k=bt.w_ref.shape[0],
is_k_full=True,
is_zp_float=False)
else:
assert bt.a.dtype == torch.int8
assert bt.wtype == scalar_types.uint4b8
if bt.w_ch_s is not None:
s_ch = bt.w_ch_s.to(torch.float32)
else:
s_ch = torch.ones(bt.w_ref.shape[1],
dtype=torch.float32,
device=device)
if bt.w_tok_s is not None:
s_tok = bt.w_tok_s.to(torch.float32)
else:
s_tok = torch.ones(bt.a.shape[0],
dtype=torch.float32,
device=device)
fn = lambda: ops.marlin_qqq_gemm(a=bt.a,
b_q_weight=w_q,
s_group=w_s,
s_tok=s_tok,
s_ch=s_ch,
workspace=workspace.scratch,
size_m=bt.a.shape[0],
size_n=bt.w_ref.shape[1],
size_k=bt.w_ref.shape[0])
return fn
def machete_create_bench_fn(bt: BenchmarkTensors,
out_type=torch.dtype,
schedule=None) -> Callable:
w_q = bt.w_q.t().contiguous().t() # make col major
w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype,
None if bt.w_g_s is None else bt.w_g_s.dtype)
w_g_zp = bt.w_g_zp
if w_g_zp is not None:
w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
return lambda: ops.machete_mm(
a=bt.a,
b_q=w_q,  # use the prepacked weights from above
b_type=bt.wtype,
b_group_scales=bt.w_g_s,
b_group_zeros=w_g_zp,
b_group_size=bt.group_size,
b_channel_scales=bt.w_ch_s,
a_token_scales=bt.w_tok_s,
out_type=out_type,
schedule=schedule,
)
# impl
# bench
def bench_fn(label: str, sub_label: str, description: str,
fn: Callable) -> TMeasurement:
min_run_time = 1
return TBenchmark.Timer(
stmt="fn()",
def bench_fns(label: str, sub_label: str, description: str,
fns: List[Callable]):
min_run_time = 1 if not NVTX_PROFILE else 0.1
res = TBenchmark.Timer(
stmt="""
for fn in fns:
fn()
""",
globals={
"fn": fn
"fns": fns
},
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
if NVTX_PROFILE:
with nvtx.annotate("mm-bench"), nvtx.annotate(
f"{label}|{sub_label}|{description}"):
fns[0]()
def loop_over_weights(
a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
torch.tensor, torch.tensor]],
fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
None]):
for w_ref, w_q, w_s, _ in weights:
fn(a, w_ref, w_q, w_s)
return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
def bench(atype: torch.dtype,
wtype: ScalarType,
def bench(types: TypeConfig,
group_size: int,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
benchmark_marlinv1: bool = True,
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
global _SWEEP_SCHEDULES_RESULTS
sweep_schedules: bool = True) -> List[TMeasurement]:
benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
sub_label += f", L={len(benchmark_tensors)}"
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
sub_label += f", L={len(weights)}"
weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
for w_ref, w_q, w_s, w_zp in weights]
name_type_string = f"W{types.weight_type}"+\
f"-A{terse_type_name(types.act_type)}"
if types.group_scale_type is not None:
name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
if types.group_zero_type is not None:
name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
if group_size is not None:
name_type_string += f"-G{group_size}"
if types.channel_scale_type is not None:
name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
if types.token_scale_type is not None:
name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"
timers = []
# pytorch impl
timers.append(
bench_fn(
label, sub_label, "torch.matmul", lambda: loop_over_weights(
a,
weights,
lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
)))
bench_fns(
label, sub_label, "torch.matmul (fp16)",
[torch_matmul_f16_create_bench_fn(bt)
for bt in benchmark_tensors]))
if benchmark_marlinv1:
w_ref = weights[0][0]
w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
wtype.size_bits)
def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
return marlin_permute_scales(w_s, *w_ref.shape, group_size)
weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
marlinv1_permute_scales(w_s), w_zp)
for w_ref, w_q, w_s, w_zp in weights]
workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
# marlinv1
if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
timers.append(
bench_fn(
label, sub_label, "marlin_orig", lambda: loop_over_weights(
a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
gptq_marlin_gemm(a,
w_q,
w_s,
w_zp_empty,
g_idx,
sort_indices,
workspace.scratch,
wtype,
size_m=a.shape[0],
size_n=w_ref.shape[1],
size_k=w_ref.shape[0],
is_k_full=True))))
bench_fns(
label, sub_label,
f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [
cutlass_scaled_mm_create_bench_fn(bt)
for bt in benchmark_tensors
]))
if types.act_type != torch.float8_e4m3fn:
timers.append(
bench_fns(label, sub_label, f"marlin ({name_type_string})",
[marlin_create_bench_fn(bt)
for bt in benchmark_tensors]))
# machete
timers.append(
bench_fn(
label, sub_label, "machete_heuristic", lambda: loop_over_weights(
a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
bench_fns(label, sub_label, f"machete ({name_type_string})", [
machete_create_bench_fn(bt, out_type=types.output_type)
for bt in benchmark_tensors
]))
if sweep_schedules:
global _SWEEP_SCHEDULES_RESULTS
print("Finding best schedule for machete")
best = None
best_schedule = None
schedules = ops.machete_supported_schedules(wtype)
schedules = ops.machete_supported_schedules(
a_type=types.act_type,
b_type=types.weight_type,
group_scales_type=types.group_scale_type,
group_zeros_type=types.group_zero_type,
token_scales_type=types.token_scale_type,
channel_scales_type=types.channel_scale_type,
out_type=types.output_type)
if schedules is None or len(schedules) == 0:
raise ValueError("No schedules found to sweep")
for schedule in reversed(schedules):
schedule_M = int(schedule.split("_")[0].split("x")[1])
@ -177,16 +381,11 @@ def bench(atype: torch.dtype,
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
continue
def run(a, _, w_q, w_s, schedule=schedule):
ops.machete_gemm(a,
w_q,
wtype,
w_s,
b_group_size=group_size,
schedule=schedule)
res = bench_fn(label, sub_label, "machete_best",
lambda: loop_over_weights(a, weights_machete, run))
res = bench_fns(label, sub_label, "machete_best", [
machete_create_bench_fn(
bt, out_type=types.output_type, schedule=schedule)
for bt in benchmark_tensors
])
results_row = {
"M": m,
@ -213,25 +412,33 @@ def bench(atype: torch.dtype,
# runner
def print_timers(timers: Iterable[TMeasurement]):
def print_timers(timers: List[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype, sweep_schedules: bool,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
types = TypeConfig(
act_type=args.act_type,
weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
else scalar_types.uint4,
output_type=args.out_type,
group_scale_type=args.group_scale_type,
group_zero_type=args.group_zero_type,
channel_scale_type=args.channel_scale_type,
token_scale_type=args.token_scale_type,
)
results = []
results: List[TMeasurement] = []
for m, k, n in MKNs:
timers = bench(dtype,
scalar_types.uint4b8,
128,
timers = bench(types,
args.group_size,
m,
k,
n,
f"{dtype}-gemm",
f"{args.act_type}-gemm",
f"MKN=({m}x{k}x{n})",
sweep_schedules=sweep_schedules)
sweep_schedules=args.sweep_schedules)
print_timers(timers)
results.extend(timers)
@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool,
# output makers
def make_output(
data: Iterable[TMeasurement],
data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
@ -262,7 +469,6 @@ def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, args.sweep_schedules, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
@ -306,33 +512,49 @@ def run_model_bench(args):
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, args.sweep_schedules, MKNs)
data = run(args, MKNs)
model_bench_data.append(data)
type_string = f"{args.act_type}"
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print(f"== Results {type_string} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
timestr = time.strftime("%Y%m%d-%H%M%S")
all_data = []
all_results = []
for d in model_bench_data:
all_data.extend(d)
all_results.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
args_dict = vars(args)
args_dict.pop("func")
pkl.dump({
"args": args_dict,
"results": all_results,
}, f)
if __name__ == "__main__":
def to_torch_dtype(dt):
if dt == "bfloat16":
return torch.bfloat16
if dt == "float16":
return torch.float16
raise ValueError("unsupported dtype")
return {
"bfloat16": torch.bfloat16,
"float16": torch.float16,
"int8": torch.int8,
"float8_e4m3fn": torch.float8_e4m3fn,
"int": torch.int,
"float": torch.float,
}[dt]
class ToTorchDtype(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, to_torch_dtype(values))
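# e.g. passing "--act-type float8_e4m3fn" stores torch.float8_e4m3fn on
# args.act_type via the lookup table above.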
parser = FlexibleArgumentParser(
description="""
@ -352,12 +574,42 @@ Benchmark Machete GEMM.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
"--dtype",
type=to_torch_dtype,
"--act-type",
action=ToTorchDtype,
required=True,
help="Available options are ['bfloat16', 'float16']",
choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'],
)
parser.add_argument(
"--group-scale-type",
action=ToTorchDtype,
choices=['bfloat16', 'float16'],
)
parser.add_argument(
"--group-zero-type",
type=to_torch_dtype,
choices=['bfloat16', 'float16'],
)
parser.add_argument(
"--channel-scale-type",
action=ToTorchDtype,
choices=['float'],
)
parser.add_argument(
"--token-scale-type",
action=ToTorchDtype,
choices=['float'],
)
parser.add_argument(
"--out-type",
action=ToTorchDtype,
choices=['bfloat16', 'float16'],
)
parser.add_argument(
"--group-size",
type=int,
help="Available options are ['None', '-1', '128'], default=128",
default=128,
)
parser.add_argument(
"--sweep-schedules",

View File

@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,

View File

@ -1,6 +1,7 @@
import argparse
import time
from datetime import datetime
from itertools import product
from typing import Any, Dict, List, Tuple, TypedDict
import ray
@ -13,6 +14,9 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
) else torch.float8_e4m3fn
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
@ -80,8 +84,8 @@ def benchmark_config(
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
w1 = w1.to(torch.float8_e4m3fn)
w2 = w2.to(torch.float8_e4m3fn)
w1 = w1.to(FP8_DTYPE)
w2 = w2.to(FP8_DTYPE)
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
@ -141,28 +145,172 @@ def benchmark_config(
return avg
def get_configs_compute_bound() -> List[Dict[str, int]]:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
def get_rocm_tuning_space(use_fp16):
block_mn_range = [16, 32, 64, 128, 256]
block_k_range = [16, 32, 64, 128, 256]
if not use_fp16:
block_k_range.remove(16) # BLOCK_K=16 not supported for fp8
num_warps_range = [1, 2, 4, 8]
group_m_range = [1, 4, 8, 16, 32]
num_stage_range = [2]
waves_per_eu_range = [0]
matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
kpack_range = [1, 2] if use_fp16 else []
param_ranges = {
"BLOCK_SIZE_M": block_mn_range,
"BLOCK_SIZE_N": block_mn_range,
"BLOCK_SIZE_K": block_k_range,
"GROUP_SIZE_M": group_m_range,
"num_warps": num_warps_range,
"num_stages": num_stage_range,
"waves_per_eu": waves_per_eu_range,
}
if use_fp16:
param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
param_ranges["kpack"] = kpack_range
return param_ranges
def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
configs: List[BenchmarkConfig] = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128, 256]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append({
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
})
if current_platform.is_rocm():
param_ranges = get_rocm_tuning_space(use_fp16)
else:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
block_m_range = [16, 32, 64, 128, 256]
block_n_range = [32, 64, 128, 256]
block_k_range = [64, 128, 256]
num_warps_range = [4, 8]
group_m_range = [1, 16, 32, 64]
num_stage_range = [2, 3, 4, 5]
param_ranges = {
"BLOCK_SIZE_M": block_m_range,
"BLOCK_SIZE_N": block_n_range,
"BLOCK_SIZE_K": block_k_range,
"GROUP_SIZE_M": group_m_range,
"num_warps": num_warps_range,
"num_stages": num_stage_range,
}
keys, values = zip(*param_ranges.items())
for config_values in product(*values):
config = dict(zip(keys, config_values))
configs.append(config)
return configs
def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
search_space, is_fp16):
N1, K1 = shard_intermediate_size, hidden_size
N2, K2 = hidden_size, shard_intermediate_size // 2
pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
is_fp16)
search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
return search_space
# The following code is inspired by ROCm/Triton GEMM tuning script:
# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
def prune_rocm_configs(M, N, K, configs, is_fp16=True):
pruned_configs = []
elemBytes_a = 2 if is_fp16 else 1
elemBytes_b = 2 if is_fp16 else 1
mfma = 16 if M < 32 or N < 32 else 32
# TODO (zhanglx): figure out the boundary between large and small gemms
large_gemm = False
if M >= 2048 and N >= 2048:
large_gemm = True
for config in configs:
BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
num_warps = config.get("num_warps")
if is_fp16:
matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
if matrix_instr_nonkdim > mfma:
continue
if mfma == 4 and BLOCK_SIZE_K < 64:
continue
# some layouts could not work properly if the
# number of elements per thread is less than 1
if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
continue
SPLIT_K = config.get("SPLIT_K", 1)
GROUP_M = config.get("GROUP_SIZE_M")
if is_fp16:
if (matrix_instr_nonkdim > BLOCK_SIZE_M
or matrix_instr_nonkdim > BLOCK_SIZE_N):
continue
if (matrix_instr_nonkdim >= M
and matrix_instr_nonkdim != BLOCK_SIZE_M):
continue
if (matrix_instr_nonkdim >= N
and matrix_instr_nonkdim != BLOCK_SIZE_N):
continue
# Skip BLOCK_SIZE that is too large compared to M/N
# unless BLOCK_SIZE is already small enough
if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
continue
if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
continue
# skip large split_k when not necessary
if SPLIT_K != 1 and not need_split_k(M, N, K):
continue
# skip split_k that leads to EVEN_K = false
leap = SPLIT_K * BLOCK_SIZE_K
modv = K % leap
if modv != 0:
continue
# skip large GROUP_M
if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
continue
# out of shared memory resource
# TODO (zhanglx): This does not consider the LDS usage in the epilogue
LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
if LDS > 65536:
continue
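# e.g. for fp16 (2 bytes/elem) a BLOCK_SIZE_M=256, BLOCK_SIZE_N=256,
# BLOCK_SIZE_K=128 tile needs 128*256*2 + 128*256*2 = 131072 bytes of LDS,
# which exceeds the 65536-byte limit above and is therefore pruned.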
# Skip small block sizes and num_warps for large gemm
# For fp16 and fp8, we want to only use BLOCK_SIZE >= 64
if large_gemm:
if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
continue
if BLOCK_SIZE_K < 64:
continue
if num_warps < 4:
continue
pruned_configs.append(config)
return pruned_configs
def need_split_k(SIZE_M, SIZE_N, SIZE_K):
return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
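# e.g. M=32, N=4096, K=8192 -> True: skinny GEMMs with a large K dimension are
# the ones expected to benefit from splitting the K loop across workgroups.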
def merge_unique_dicts(list1, list2):
result = []
combined_list = list1.copy()
combined_list.extend(list2)
for dictionary in combined_list:
if dictionary not in result:
result.append(dictionary)
return result
@ray.remote(num_gpus=1)
class BenchmarkWorker:
@ -170,6 +318,10 @@ class BenchmarkWorker:
torch.set_default_device("cuda")
current_platform.seed_everything(seed)
self.seed = seed
# Get the device ID to allocate tensors and kernels
# on the respective GPU. This is required for Ray to work
# correctly with multi-GPU tuning on the ROCm platform.
self.device_id = int(ray.get_gpu_ids()[0])
def benchmark(
self,
@ -217,25 +369,33 @@ class BenchmarkWorker:
) -> Dict[str, int]:
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=10)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if current_platform.is_rocm():
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = prune_rocm_search_space(num_tokens,
shard_intermediate_size,
hidden_size, search_space,
is_fp16)
if kernel_time < best_time:
best_time = kernel_time
best_config = config
with torch.cuda.device(self.device_id):
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
@ -244,12 +404,27 @@ class BenchmarkWorker:
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return {
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
"num_warps": config["num_warps"],
"num_stages": config["num_stages"],
"BLOCK_SIZE_M":
config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N":
config["BLOCK_SIZE_N"],
"BLOCK_SIZE_K":
config["BLOCK_SIZE_K"],
"GROUP_SIZE_M":
config["GROUP_SIZE_M"],
"num_warps":
config["num_warps"],
"num_stages":
config["num_stages"],
**({
"waves_per_eu": config["waves_per_eu"]
} if "waves_per_eu" in config else {}),
**({
"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
} if "matrix_instr_nonkdim" in config else {}),
**({
"kpack": config["kpack"]
} if "kpack" in config else {}),
}
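    # The ROCm-only tuning keys (waves_per_eu, matrix_instr_nonkdim, kpack) are
    # appended only when present in the config, so CUDA configs keep the original
    # six keys unchanged.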
@ -275,7 +450,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
def main(args: argparse.Namespace):
print(args)
config = AutoConfig.from_pretrained(args.model)
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
@ -286,6 +462,11 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "DeepseekV3ForCausalLM":
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Default: Mixtral.
E = config.num_local_experts
@ -294,7 +475,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
@ -322,7 +503,8 @@ def main(args: argparse.Namespace):
return ray.get(outputs)
if args.tune:
search_space = get_configs_compute_bound()
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16)
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
@ -362,6 +544,7 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
args = parser.parse_args()
main(args)

View File

@ -98,7 +98,9 @@ def main(
start_time = time.perf_counter()
# Using default kv_scale
k_scale = v_scale = 1.0
k_scale = v_scale = torch.tensor(1.0,
dtype=torch.float32,
device=device)
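# NOTE: k_scale/v_scale are now passed as float32 tensors (device-side scalars)
# rather than Python floats, matching the paged_attention/reshape_and_cache
# signature changes elsewhere in this compare.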
for _ in range(num_iters):
if version == "v1":

View File

@ -0,0 +1,262 @@
import itertools
from typing import Optional, Tuple, Union
import torch
import triton
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
from torch import nn
from vllm import _custom_ops as vllm_ops
class HuggingFaceRMSNorm(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
x = x + residual.to(torch.float32)
residual = x.to(orig_dtype)
variance = x.pow(2).mean(dim=-1, keepdim=True)
x = x * torch.rsqrt(variance + self.variance_epsilon)
x = x.to(orig_dtype) * self.weight
if residual is None:
return x
else:
return x, residual
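# Reference semantics: y = x / sqrt(mean(x^2) + eps) * weight, computed in fp32;
# if a residual is given it is added to x first and returned alongside y.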
def rmsnorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
eps: float = 1e-6,
):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
naive_norm.weight = nn.Parameter(weight)
naive_norm = naive_norm.to(x.device)
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
if residual is not None:
residual = residual.view(-1, residual.shape[-1])
output = naive_norm(x, residual)
if isinstance(output, tuple):
output = (output[0].view(orig_shape), output[1].view(orig_shape))
else:
output = output.view(orig_shape)
return output
def rmsnorm_flashinfer(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
if residual is not None:
residual = residual.view(-1, residual.shape[-1])
if residual is not None:
fused_add_rmsnorm(x, residual, weight, eps)
output = (x, residual)
else:
output = rmsnorm(x, weight, eps)
if isinstance(output, tuple):
output = (output[0].view(orig_shape), output[1].view(orig_shape))
else:
output = output.view(orig_shape)
return output
def rmsnorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
if residual is not None:
residual = residual.view(-1, residual.shape[-1])
if residual is not None:
vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
output = (x, residual)
else:
out = torch.empty_like(x)
vllm_ops.rms_norm(out, x, weight, eps)
output = out
if isinstance(output, tuple):
output = (output[0].view(orig_shape), output[1].view(orig_shape))
else:
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
dtype = torch.bfloat16
x = torch.randn(batch_size,
seq_len,
hidden_size,
dtype=dtype,
device="cuda")
weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
residual = torch.randn_like(x) if use_residual else None
output_naive = rmsnorm_naive(
x.clone(), weight,
residual.clone() if residual is not None else None)
output_flashinfer = rmsnorm_flashinfer(
x.clone(), weight,
residual.clone() if residual is not None else None)
output_vllm = rmsnorm_vllm(
x.clone(), weight,
residual.clone() if residual is not None else None)
if use_residual:
output_naive = output_naive[0]
output_flashinfer = output_flashinfer[0]
output_vllm = output_vllm[0]
print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}")
print(f"VLLM output={output_vllm}")
if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose(
output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
head_num_range = [32, 48]
configs = list(
itertools.product(head_num_range, batch_size_range, seq_length_range))
def get_benchmark(use_residual):
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["head_num", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["huggingface", "flashinfer", "vllm"],
line_names=["HuggingFace", "FlashInfer", "vLLM"],
styles=[("blue", "-"), ("green", "-"), ("red", "-")],
ylabel="us",
plot_name=
f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual",
args={},
))
def benchmark(head_num, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_size = head_num * 128 # assuming head_dim = 128
x = torch.randn(batch_size,
seq_len,
hidden_size,
dtype=dtype,
device="cuda")
weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
residual = torch.randn_like(x) if use_residual else None
quantiles = [0.5, 0.2, 0.8]
if provider == "huggingface":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: rmsnorm_naive(
x.clone(),
weight,
residual.clone() if residual is not None else None,
),
quantiles=quantiles,
)
elif provider == "flashinfer":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: rmsnorm_flashinfer(
x.clone(),
weight,
residual.clone() if residual is not None else None,
),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: rmsnorm_vllm(
x.clone(),
weight,
residual.clone() if residual is not None else None,
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
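# Note: triton.testing.do_bench reports milliseconds; the 1000x factor above
# converts the results to the microseconds used for the "us" ylabel.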
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-size",
type=int,
default=4096,
help="Hidden size (2nd dimension) of the sequence",
)
parser.add_argument("--use-residual",
action="store_true",
help="Whether to use residual connection")
parser.add_argument(
"--save-path",
type=str,
default="./configs/rmsnorm/",
help="Path to save rmsnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_size=args.hidden_size,
use_residual=args.use_residual)
# Get the benchmark function with proper use_residual setting
benchmark = get_benchmark(args.use_residual)
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@ -20,10 +20,11 @@ if __name__ == "__main__":
args = parser.parse_args()
with open(args.filename, 'rb') as f:
data: List[TMeasurement] = pickle.load(f)
data = pickle.load(f)
raw_results: List[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
for v in data:
for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)

210
benchmarks/kernels/utils.py Normal file
View File

@ -0,0 +1,210 @@
import dataclasses
from typing import Any, Callable, Iterable, Optional
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
@dataclasses.dataclass
class CudaGraphBenchParams:
num_ops_in_cuda_graph: int
@dataclasses.dataclass
class ArgPool:
"""
When some argument of the benchmarking function is annotated with this type,
the benchmarking class (Bench) will collapse the argument by picking a
single value from the given list of values for each function invocation.
For every invocation during a benchmarking run, it will choose a
different value from the list.
"""
values: Iterable[Any]
def __getitem__(self, index):
return self.values[index]
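    # e.g. (hypothetical) passing ArgPool([A0, A1, A2]) as an argument of Bench
    # makes each timed call use a different Ai, cycling through the pool.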
class Bench:
class ArgsIterator:
def __init__(self, args_list, kwargs_list):
assert len(args_list) == len(kwargs_list)
self.args_list = args_list
self.kwargs_list = kwargs_list
self.n = len(self.args_list)
self.idx = 0
def __next__(self):
while True:
yield (self.args_list[self.idx], self.kwargs_list[self.idx])
self.idx += 1
self.idx = self.idx % self.n
def reset(self):
self.idx = 0
@property
def n_args(self):
return self.n
def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
label: str, sub_label: str, description: str, fn: Callable,
*args, **kwargs):
self.cuda_graph_params = cuda_graph_params
self.use_cuda_graph = self.cuda_graph_params is not None
self.label = label
self.sub_label = sub_label
self.description = description
self.fn = fn
# Process args
self._args = args
self._kwargs = kwargs
self.args_list, self.kwargs_list = self.collapse_argpool(
*args, **kwargs)
self.args_iterator = self.ArgsIterator(self.args_list,
self.kwargs_list)
# Cudagraph runner
self.g = None
if self.use_cuda_graph:
self.g = self.get_cuda_graph_runner()
# benchmark run params
self.min_run_time = 1
def collapse_argpool(self, *args, **kwargs):
argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
arg for arg in kwargs.values() if isinstance(arg, ArgPool)
]
if len(argpool_args) == 0:
return [args], [kwargs]
# Make sure all argpools are of the same size
argpool_size = len(argpool_args[0].values)
assert all([argpool_size == len(arg.values) for arg in argpool_args])
# create copies of the args
args_list = []
kwargs_list = []
for _ in range(argpool_size):
args_list.append(args)
kwargs_list.append(kwargs.copy())
for i in range(argpool_size):
# collapse args; Just pick the ith value
args_list[i] = tuple([
arg[i] if isinstance(arg, ArgPool) else arg
for arg in args_list[i]
])
# collapse kwargs
kwargs_i = kwargs_list[i]
arg_pool_keys = [
k for k, v in kwargs_i.items() if isinstance(v, ArgPool)
]
for k in arg_pool_keys:
# again just pick the ith value
kwargs_i[k] = kwargs_i[k][i]
kwargs_list[i] = kwargs_i
return args_list, kwargs_list
def get_cuda_graph_runner(self):
assert self.use_cuda_graph
assert self.args_iterator is not None
num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
# warmup
args_it = self.args_iterator.__next__()
for _ in range(2):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
for _ in range(num_graph_ops):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
return g
def run_cudagraph(self) -> TMeasurement:
assert self.use_cuda_graph
globals = {'g': self.g}
return TBenchmark.Timer(
stmt="g.replay()",
globals=globals,
label=(
f"{self.label}"
f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
),
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run_eager(self) -> TMeasurement:
setup = None
stmt = None
globals = None
has_arg_pool = self.args_iterator.n_args > 1
if has_arg_pool:
setup = '''
args_iterator.reset()
args_it = args_iterator.__next__()
'''
stmt = '''
args, kwargs = next(args_it)
fn(*args, **kwargs)
'''
globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
else:
# no arg pool. Just use the args and kwargs directly
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
args, kwargs = next(args_it)
setup = ""
stmt = '''
fn(*args, **kwargs)
'''
globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
return TBenchmark.Timer(
stmt=stmt,
setup=setup,
globals=globals,
label=self.label,
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run(self) -> TMeasurement:
timer = None
if self.use_cuda_graph: # noqa SIM108
timer = self.run_cudagraph()
else:
timer = self.run_eager()
if not timer.meets_confidence() or timer.has_warnings:
print("Doesn't meet confidence - re-running bench ...")
return self.run()
return timer
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
if exc_type:
print(f"exc type {exc_type}")
print(f"exc value {exc_value}")
print(f"exc traceback {traceback}")

View File

@ -40,4 +40,10 @@ WEIGHT_SHAPES = {
([8192, 57344], 1),
([28672, 8192], 0),
],
"meta-llama/Llama-3.1-405b-hf": [
([16384, 18432], 1),
([16384, 16384], 0),
([16384, 106496], 1),
([53248, 16384], 0),
],
}

View File

@ -0,0 +1,113 @@
{
"$schema":
"https://json-schema.org/draft/2020-12/schema",
"title":
"User Profile",
"type":
"object",
"properties": {
"userId": {
"type": "string",
"description": "Unique identifier for the user."
},
"personalInfo": {
"type": "object",
"properties": {
"firstName": {
"type": "string",
"description": "The user's first name."
},
"lastName": {
"type": "string",
"description": "The user's last name."
},
"age": {
"type": "integer",
"minimum": 0,
"description": "The user's age."
},
"phoneNumbers": {
"type":
"array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["home", "work", "mobile"],
"description": "Type of phone number."
},
"number": {
"type": "string",
"pattern": "^\\+?[1-9]\\d{1,14}$",
"description": "Phone number in E.164 format."
}
},
"required": ["type", "number"]
},
"description":
"List of phone numbers associated with the user."
}
},
"required": ["firstName", "lastName"]
},
"address": {
"type": "object",
"properties": {
"street": {
"type": "string",
"description": "Street address."
},
"city": {
"type": "string",
"description": "City name."
},
"state": {
"type": "string",
"description": "State or province."
},
"postalCode": {
"type": "string",
"pattern": "^\\d{5}(-\\d{4})?$",
"description": "Postal code."
},
"country": {
"type": "string",
"description": "Country name."
}
},
"required": ["street", "city", "state", "postalCode", "country"]
},
"preferences": {
"type": "object",
"properties": {
"newsletterSubscribed": {
"type":
"boolean",
"description":
"Indicates if the user is subscribed to the newsletter."
},
"favoriteCategories": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of user's favorite categories."
}
},
"required": ["newsletterSubscribed"]
},
"accountStatus": {
"type": "string",
"enum": ["active", "inactive", "suspended"],
"description": "Current status of the user's account."
},
"registrationDate": {
"type": "string",
"format": "date-time",
"description": "ISO 8601 formatted date-time of user registration."
}
},
"required":
["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
}

View File

@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX_FOUND TRUE)
endif()
#
# Define environment variables for special configurations
#
@ -13,22 +18,40 @@ endif()
include_directories("${CMAKE_SOURCE_DIR}/csrc")
set (ENABLE_NUMA TRUE)
#
# Check the compile flags
#
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-mf16c"
"-DVLLM_CPU_EXTENSION")
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
list(APPEND CXX_COMPILE_FLAGS
"-mf16c"
)
endif()
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
endif()
if (NOT MACOSX_FOUND)
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()
endif()
function (find_isa CPUINFO TARGET OUT)
string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
if(NOT ISA_FOUND EQUAL -1)
@ -49,10 +72,17 @@ endfunction()
is_avx512_disabled(AVX512_DISABLED)
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
set(APPLE_SILICON_FOUND TRUE)
else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS
@ -72,9 +102,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
@ -82,8 +114,23 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
"-mvsx"
"-mcpu=native"
"-mtune=native")
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")
if(ARM_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
add_compile_definitions(ARM_BF16_SUPPORT)
else()
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
elseif(APPLE_SILICON_FOUND)
message(STATUS "Apple Silicon Detected")
set(ENABLE_NUMA OFF)
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
endif()
#
@ -118,7 +165,12 @@ endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa)
if(ENABLE_NUMA)
list(APPEND LIBS numa)
else()
message(STATUS "NUMA is disabled")
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()
#
# _C extension
@ -153,4 +205,4 @@ define_gpu_extension_target(
WITH_SOABI
)
message(STATUS "Enabling C extension.")
message(STATUS "Enabling C extension.")

View File

@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
#
set(SRCS ${ORIG_SRCS})
set(CXX_SRCS ${ORIG_SRCS})
list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)|(hip)$")
#
# Generate ROCm/HIP source file names from CUDA file names.
@ -259,7 +259,7 @@ endmacro()
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
# 9.0a to the result.
# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS).
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
@ -270,34 +270,47 @@ endmacro()
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
# if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
# remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
set(_CUDA_ARCHS)
if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
if ("9.0" IN_LIST TGT_CUDA_ARCHS)
if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
set(_CUDA_ARCHS "9.0a")
endif()
endif()
list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
# for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
# less or equal to ARCH
foreach(_ARCH ${CUDA_ARCHS})
set(_TMP_ARCH)
foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
set(_TMP_ARCH ${_SRC_ARCH})
else()
break()
# for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
# is less or equal to ARCH (but has the same major version since SASS binary
# compatibility is only forward compatible within the same major version).
foreach(_ARCH ${TGT_CUDA_ARCHS_})
set(_TMP_ARCH)
# Extract the major version of the target arch
string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
# Extract the major version of the source arch
string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
# Check major-version match AND version-less-or-equal
if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
set(_TMP_ARCH "${_SRC_ARCH}")
endif()
else()
# If we hit a version greater than the target, we can break
break()
endif()
endforeach()
# If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
if (_TMP_ARCH)
list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
endif()
endforeach()
if (_TMP_ARCH)
list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
endif()
endforeach()
list(REMOVE_DUPLICATES _CUDA_ARCHS)
set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
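# e.g. SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0a" with TGT_CUDA_ARCHS="8.6;9.0" yields an
# OUT_CUDA_ARCHS containing 8.6 and 9.0a: 8.6 is the best same-major match for
# 8.6, and 9.0a stands in for 9.0.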

View File

@ -9,8 +9,16 @@
namespace vllm {
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
const scalar_t& y) {
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
}
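// e.g. silu_and_mul instantiates this with act_first = true (SILU(x) * y),
// while mul_and_silu uses act_first = false (x * SILU(y)).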
// Activation and gating kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel(
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
out[token_idx * d + idx] = ACT_FN(x) * y;
out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
}
}
@ -55,7 +63,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
} // namespace vllm
// Launch activation and gating kernel.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
// Use ACT_FIRST (bool) indicating whether to apply the activation function
// first.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
@ -64,7 +74,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
@ -72,19 +82,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
}
void mul_and_silu(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
// The difference between mul_and_silu and silu_and_mul is that mul_and_silu
// applies the silu to the latter half of the input.
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
}
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
}
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
}
namespace vllm {

View File

@ -105,7 +105,7 @@ __device__ void paged_attention_kernel(
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
const float k_scale, const float v_scale, const int tp_rank,
const float* k_scale, const float* v_scale, const int tp_rank,
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
const int seq_idx = blockIdx.y;
@ -285,7 +285,7 @@ __device__ void paged_attention_kernel(
Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
k_ptr + offset1 * BLOCK_SIZE * x + offset2);
k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
k_vec_quant, k_scale);
k_vec_quant, *k_scale);
}
}
@ -415,7 +415,7 @@ __device__ void paged_attention_kernel(
*reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
// Vector conversion from V_quant_vec to V_vec.
v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
v_scale);
*v_scale);
}
if (block_idx == num_seq_blocks - 1) {
// NOTE(woosuk): When v_vec contains the tokens that are out of the
@ -513,7 +513,7 @@ __global__ void paged_attention_v1_kernel(
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
const float k_scale, const float v_scale, const int tp_rank,
const float* k_scale, const float* v_scale, const int tp_rank,
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
@ -549,7 +549,7 @@ __global__ void paged_attention_v2_kernel(
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
const float k_scale, const float v_scale, const int tp_rank,
const float* k_scale, const float* v_scale, const int tp_rank,
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,

View File

@ -41,7 +41,7 @@
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
k_scale_ptr, v_scale_ptr, tp_rank, blocksparse_local_blocks, \
blocksparse_vert_stride, blocksparse_block_size, \
blocksparse_head_sliding_step);
@ -53,10 +53,10 @@ void paged_attention_v1_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int tp_rank,
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -80,6 +80,8 @@ void paged_attention_v1_launcher(
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int padded_max_seq_len =
@ -140,13 +142,10 @@ void paged_attention_v1_launcher(
blocksparse_block_size, blocksparse_head_sliding_step);
#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
switch (is_block_sparse) { \
case true: \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
break; \
case false: \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
break; \
if (is_block_sparse) { \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
} else { \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
@ -179,9 +178,10 @@ void paged_attention_v1(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
const bool is_block_sparse = (blocksparse_vert_stride > 1);

View File

@ -37,7 +37,7 @@
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
kv_block_stride, kv_head_stride, k_scale_ptr, v_scale_ptr, tp_rank, \
blocksparse_local_blocks, blocksparse_vert_stride, \
blocksparse_block_size, blocksparse_head_sliding_step); \
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
@ -54,10 +54,10 @@ void paged_attention_v2_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int tp_rank,
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -84,6 +84,8 @@ void paged_attention_v2_launcher(
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
@ -147,13 +149,10 @@ void paged_attention_v2_launcher(
blocksparse_head_sliding_step);
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
switch (is_block_sparse) { \
case true: \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
break; \
case false: \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
break; \
if (is_block_sparse) { \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
} else { \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
@ -190,9 +189,10 @@ void paged_attention_v2(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
const bool is_block_sparse = (blocksparse_vert_stride > 1);

View File

@ -18,15 +18,15 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale);
const std::string& kv_cache_dtype,
torch::Tensor& k_scale, torch::Tensor& v_scale);
void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype,
const double k_scale, const double v_scale);
torch::Tensor& k_scale, torch::Tensor& v_scale);
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,

View File

@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel(
// block_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int key_stride, const int value_stride, const int num_heads,
const int head_size, const int block_size, const int x, const float k_scale,
const float v_scale) {
const int head_size, const int block_size, const int x,
const float* k_scale, const float* v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) {
@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel(
value_cache[tgt_value_idx] = tgt_value;
} else {
key_cache[tgt_key_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}
@ -214,7 +214,7 @@ __global__ void reshape_and_cache_flash_kernel(
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, const int key_stride, const int value_stride,
const int num_heads, const int head_size, const int block_size,
const float k_scale, const float v_scale) {
const float* k_scale, const float* v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
@ -239,9 +239,9 @@ __global__ void reshape_and_cache_flash_kernel(
value_cache[tgt_key_value_idx] = tgt_value;
} else {
key_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}
@ -258,7 +258,9 @@ __global__ void reshape_and_cache_flash_kernel(
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), key_stride, value_stride, \
num_heads, head_size, block_size, x, k_scale, v_scale);
num_heads, head_size, block_size, x, \
reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));
void reshape_and_cache(
torch::Tensor& key, // [num_tokens, num_heads, head_size]
@ -268,8 +270,8 @@ void reshape_and_cache(
torch::Tensor&
value_cache, // [num_blocks, num_heads, head_size, block_size]
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale) {
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
@ -299,7 +301,9 @@ void reshape_and_cache(
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
value_stride, num_heads, head_size, block_size, k_scale, v_scale);
value_stride, num_heads, head_size, block_size, \
reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));
void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size]
@ -307,10 +311,20 @@ void reshape_and_cache_flash(
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale) {
int num_tokens = key.size(0);
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
// both include padding.
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
// since key includes padding for CUDA graphs, while slot_mapping does not.
// In this case, slot_mapping.size(0) represents the actual number of tokens
// before padding.
// For compatibility with both cases, we use slot_mapping.size(0) as the
// number of tokens.
int num_tokens = slot_mapping.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = key_cache.size(1);

7
csrc/core/math.hpp Normal file
View File

@ -0,0 +1,7 @@
#include <climits>
#include <iostream>
inline uint32_t next_pow_2(uint32_t const num) {
if (num <= 1) return num;
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}
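// e.g. next_pow_2(1) == 1, next_pow_2(5) == 8, next_pow_2(64) == 64: rounds num
// up to the next power of two via the leading-zero count of num - 1.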

View File

@ -32,7 +32,7 @@ class ScalarType {
signed_(signed_),
bias(bias),
finite_values_only(finite_values_only),
nan_repr(nan_repr){};
nan_repr(nan_repr) {};
static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
return ScalarType(0, size_bits - 1, true, bias);

View File

@ -24,12 +24,20 @@ struct KernelVecType<float> {
template <>
struct KernelVecType<c10::Half> {
#ifdef __powerpc64__
// Power architecture-specific vector types
using q_load_vec_type = vec_op::FP32Vec8;
using k_load_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
#else
// Fallback for other architectures, including x86
using q_load_vec_type = vec_op::FP16Vec8;
using q_vec_type = vec_op::FP32Vec16;
using k_load_vec_type = vec_op::FP16Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
#endif
using q_vec_type = vec_op::FP32Vec16;
using k_vec_type = vec_op::FP32Vec16;
using qk_acc_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
};
#ifdef __AVX512BF16__
@ -43,6 +51,10 @@ struct KernelVecType<c10::BFloat16> {
using v_load_vec_type = vec_op::BF16Vec16;
};
#else
#ifdef __aarch64__
#ifndef ARM_BF16_SUPPORT
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
using q_load_vec_type = vec_op::BF16Vec8;
@ -52,6 +64,18 @@ struct KernelVecType<c10::BFloat16> {
using qk_acc_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
#else
template <>
struct KernelVecType<c10::BFloat16> {
using q_load_vec_type = vec_op::BF16Vec8;
using q_vec_type = vec_op::FP32Vec16;
using k_load_vec_type = vec_op::BF16Vec16;
using k_vec_type = vec_op::FP32Vec16;
using qk_acc_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
#endif
template <typename T>
@ -362,7 +386,7 @@ void paged_attention_v1_impl_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes) {
const std::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -435,12 +459,12 @@ void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
TORCH_CHECK(blocksparse_vert_stride <= 1,
"CPU backend does not support blocksparse attention yet.");
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
@ -678,7 +702,7 @@ void paged_attention_v2_impl_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -757,12 +781,12 @@ void paged_attention_v2(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
TORCH_CHECK(blocksparse_vert_stride <= 1,
"CPU backend does not support blocksparse attention yet.");
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
@ -771,4 +795,4 @@ void paged_attention_v2(
CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
});
}
}

View File

@ -107,10 +107,8 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype, double k_scale,
double v_scale) {
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
const std::string& kv_cache_dtype,
torch::Tensor& k_scale, torch::Tensor& v_scale) {
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);


@@ -1,15 +1,17 @@
#ifndef CPU_TYPES_HPP
#define CPU_TYPES_HPP
#if defined(__x86_64__)
- //x86 implementation
+ // x86 implementation
#include "cpu_types_x86.hpp"
#elif defined(__POWER9_VECTOR__)
- //ppc implementation
+ // ppc implementation
#include "cpu_types_vsx.hpp"
+ #elif defined(__aarch64__)
+ // arm implementation
+ #include "cpu_types_arm.hpp"
#else
#warning "unsupported vLLM cpu implementation"
#endif
#endif
#endif
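The new `#elif defined(__aarch64__)` branch routes AArch64 builds to the NEON implementation below. Parts of that implementation are additionally gated on `ARM_BF16_SUPPORT`; where that macro gets defined is not shown in this diff. As an assumption for illustration only, it would correspond to the ACLE BF16 feature test, roughly:

// Assumption, not part of the patch shown here: ARM_BF16_SUPPORT would only be
// set when the compiler/target expose the NEON BF16 extension, which ACLE
// advertises via __ARM_FEATURE_BF16_VECTOR_ARITHMETIC.
#if defined(__aarch64__) && defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
    !defined(ARM_BF16_SUPPORT)
  #define ARM_BF16_SUPPORT
#endif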

csrc/cpu/cpu_types_arm.hpp (new file, 591 added lines)

@@ -0,0 +1,591 @@
#include <arm_neon.h>
#include <torch/all.h>
#include <cmath>
namespace vec_op {
#ifdef ARM_BF16_SUPPORT
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#else
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#endif
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) \
std::cout << #NAME << " exit." << std::endl;
#endif
#define FORCE_INLINE __attribute__((always_inline)) inline
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
};
}; // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
};
struct FP32Vec8;
struct FP32Vec16;
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
float16x8_t reg;
explicit FP16Vec8(const void* ptr)
: reg(vld1q_f16(static_cast<const __fp16*>(ptr))) {};
explicit FP16Vec8(const FP32Vec8&);
void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); }
};
struct FP16Vec16 : public Vec<FP16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
float16x8x2_t reg;
explicit FP16Vec16(const void* ptr) {
reg.val[0] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr));
reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
}
explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const {
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
}
void save(void* ptr, const int elem_num) const {
int full_blocks = elem_num / 8;
int remainder = elem_num % 8;
if (full_blocks > 0) {
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
if (full_blocks > 1) {
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
}
}
// Note: below is the unrolled version of the following code:
//
// for (int i = 0; i < remainder; ++i) {
// reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] =
// vgetq_lane_f16(temp, i);
// }
//
// For macOS build (Clang), the arm/neon intrinsics function
// `vgetq_lane_f16` needs the parameter `i` to be constant at compile
// time.
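// (A standalone, compile-time-unrolled alternative is sketched after this
// struct.)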
if (remainder > 0) {
float16x8_t temp = reg.val[full_blocks];
__fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
switch (remainder) {
case 1:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
break;
case 2:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
break;
case 3:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
break;
case 4:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
break;
case 5:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
break;
case 6:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
break;
case 7:
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
break;
default:
break;
}
}
}
};
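The switch above exists because `vgetq_lane_f16` needs its lane index as an integer constant expression, so a plain runtime loop over `remainder` does not compile with Clang on macOS. A compile-time-unrolled equivalent can be written with a fold expression over `std::integer_sequence`; the sketch below is a hypothetical standalone helper, not code from this patch:

#include <arm_neon.h>
#include <utility>

// Hypothetical sketch: store the first `remainder` (0..7) lanes of a
// float16x8_t. Each lane index Is is a compile-time constant, so
// vgetq_lane_f16(v, Is) is well-formed, while `remainder` stays a runtime
// value checked per lane.
template <int... Is>
inline void store_partial_f16(float16x8_t v, __fp16* dst, int remainder,
                              std::integer_sequence<int, Is...>) {
  ((Is < remainder ? (void)(dst[Is] = vgetq_lane_f16(v, Is)) : (void)0), ...);
}

inline void store_partial_f16(float16x8_t v, __fp16* dst, int remainder) {
  store_partial_f16(v, dst, remainder, std::make_integer_sequence<int, 8>{});
}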
#ifdef ARM_BF16_SUPPORT
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
bfloat16x8_t reg;
explicit BF16Vec8(const void* ptr)
: reg(*reinterpret_cast<const bfloat16x8_t*>(ptr)) {};
explicit BF16Vec8(bfloat16x8_t data) : reg(data) {};
explicit BF16Vec8(const FP32Vec8&);
explicit BF16Vec8(float32x4x2_t v)
: reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {};
void save(void* ptr) const { *reinterpret_cast<bfloat16x8_t*>(ptr) = reg; }
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
bfloat16x8x2_t reg;
explicit BF16Vec16(const void* ptr)
: reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&);
explicit BF16Vec16(float32x4x4_t v)
: reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]),
vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {};
void save(void* ptr) const { *reinterpret_cast<bfloat16x8x2_t*>(ptr) = reg; };
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
bfloat16x8x4_t reg;
explicit BF16Vec32(const void* ptr)
: reg(*reinterpret_cast<const bfloat16x8x4_t*>(ptr)) {};
explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {};
explicit BF16Vec32(const BF16Vec8& vec8_data)
: reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {};
void save(void* ptr) const { *reinterpret_cast<bfloat16x8x4_t*>(ptr) = reg; };
};
#endif
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
union AliasReg {
float32x4_t reg;
float values[VEC_ELEM_NUM];
};
float32x4_t reg;
explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {};
explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {};
explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {};
explicit FP32Vec4(float32x4_t data) : reg(data) {};
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
union AliasReg {
float32x4x2_t reg;
float values[VEC_ELEM_NUM];
};
float32x4x2_t reg;
explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {};
explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {};
explicit FP32Vec8(const float* ptr)
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {};
explicit FP32Vec8(float32x4x2_t data) : reg(data) {};
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v) {
reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg));
reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg));
};
explicit FP32Vec8(float16x8_t v)
: reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {};
#ifdef ARM_BF16_SUPPORT
explicit FP32Vec8(bfloat16x8_t v)
: reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
explicit FP32Vec8(const BF16Vec8& v)
: reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {};
#endif
float reduce_sum() const {
AliasReg ar;
ar.reg = reg;
float answer = 0;
unroll_loop<int, VEC_ELEM_NUM>(
[&answer, &ar](int i) { answer += ar.values[i]; });
return answer;
}
FP32Vec8 exp() const {
AliasReg ar;
ar.reg = reg;
float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])};
float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])};
float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])};
float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])};
float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1);
float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3);
float32x4x2_t result;
result.val[0] = result0;
result.val[1] = result1;
return FP32Vec8(result);
}
FP32Vec8 tanh() const {
AliasReg ar;
ar.reg = reg;
float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])};
float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])};
float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])};
float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])};
float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1);
float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3);
float32x4x2_t result;
result.val[0] = result0;
result.val[1] = result1;
return FP32Vec8(result);
}
FP32Vec8 er() const {
AliasReg ar;
ar.reg = reg;
float32x2_t er_vec0 = {static_cast<float32_t>(erf(ar.values[0])),
static_cast<float32_t>(erf(ar.values[1]))};
float32x2_t er_vec1 = {static_cast<float32_t>(erf(ar.values[2])),
static_cast<float32_t>(erf(ar.values[3]))};
float32x2_t er_vec2 = {static_cast<float32_t>(erf(ar.values[4])),
static_cast<float32_t>(erf(ar.values[5]))};
float32x2_t er_vec3 = {static_cast<float32_t>(erf(ar.values[6])),
static_cast<float32_t>(erf(ar.values[7]))};
float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
float32x4x2_t result;
result.val[0] = result0;
result.val[1] = result1;
return FP32Vec8(result);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]),
vmulq_f32(reg.val[1], b.reg.val[1])}));
}
FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]),
vaddq_f32(reg.val[1], b.reg.val[1])}));
}
FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]),
vsubq_f32(reg.val[1], b.reg.val[1])}));
}
FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]),
vdivq_f32(reg.val[1], b.reg.val[1])}));
}
void save(float* ptr) const {
vst1q_f32(ptr, reg.val[0]);
vst1q_f32(ptr + 4, reg.val[1]);
}
};
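FP32Vec8::reduce_sum above goes through a union and a compile-time-unrolled scalar loop. On AArch64 the same horizontal reduction could also be expressed with the across-vector add intrinsic; a minimal sketch, not the committed code:

#include <arm_neon.h>

// Sketch: horizontal sum of the eight floats held in a float32x4x2_t using
// vaddvq_f32 (AArch64 across-vector add) instead of the union-based loop.
inline float reduce_sum_f32x8(float32x4x2_t v) {
  return vaddvq_f32(v.val[0]) + vaddvq_f32(v.val[1]);
}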
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
union AliasReg {
float32x4x4_t reg;
float values[VEC_ELEM_NUM];
};
float32x4x4_t reg;
explicit FP32Vec16(float v)
: reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {}
explicit FP32Vec16()
: reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0),
vmovq_n_f32(0.0)}) {}
explicit FP32Vec16(const float* ptr)
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
vld1q_f32(ptr + 12)}) {}
explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec8& data) {
reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1];
reg.val[2] = data.reg.val[0];
reg.val[3] = data.reg.val[1];
}
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {}
#ifdef ARM_BF16_SUPPORT
explicit FP32Vec16(bfloat16x8x2_t v)
: reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]),
vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {};
#endif
explicit FP32Vec16(const FP32Vec4& data) {
reg.val[0] = data.reg;
reg.val[1] = data.reg;
reg.val[2] = data.reg;
reg.val[3] = data.reg;
};
#ifdef ARM_BF16_SUPPORT
explicit FP32Vec16(const BF16Vec16& v)
: reg({vcvtq_low_f32_bf16(v.reg.val[0]),
vcvtq_high_f32_bf16(v.reg.val[0]),
vcvtq_low_f32_bf16(v.reg.val[1]),
vcvtq_high_f32_bf16(v.reg.val[1])}) {};
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
#endif
explicit FP32Vec16(const FP16Vec16& v) {
reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
};
FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]),
vaddq_f32(reg.val[1], b.reg.val[1]),
vaddq_f32(reg.val[2], b.reg.val[2]),
vaddq_f32(reg.val[3], b.reg.val[3])}));
};
FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]),
vmulq_f32(reg.val[1], b.reg.val[1]),
vmulq_f32(reg.val[2], b.reg.val[2]),
vmulq_f32(reg.val[3], b.reg.val[3])}));
};
FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]),
vsubq_f32(reg.val[1], b.reg.val[1]),
vsubq_f32(reg.val[2], b.reg.val[2]),
vsubq_f32(reg.val[3], b.reg.val[3])}));
};
FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]),
vdivq_f32(reg.val[1], b.reg.val[1]),
vdivq_f32(reg.val[2], b.reg.val[2]),
vdivq_f32(reg.val[3], b.reg.val[3])}));
};
float reduce_sum() const {
AliasReg ar;
ar.reg = reg;
float answer = 0;
unroll_loop<int, VEC_ELEM_NUM>(
[&answer, &ar](int i) { answer += ar.values[i]; });
return answer;
};
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
AliasReg ar;
ar.reg = reg;
float answer = 0;
const int start = idx * group_size;
unroll_loop<int, group_size>(
[&answer, &start, ar](int i) { answer += ar.values[start + i]; });
return answer;
};
void save(float* ptr) const {
vst1q_f32(ptr, reg.val[0]);
vst1q_f32(ptr + 4, reg.val[1]);
vst1q_f32(ptr + 8, reg.val[2]);
vst1q_f32(ptr + 12, reg.val[3]);
};
};
template <typename T>
struct VecType {
using vec_type = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
};
template <>
struct VecType<c10::Half> {
using vec_type = FP16Vec8;
};
#ifdef ARM_BF16_SUPPORT
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
#endif
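The VecType/vec_t trait is what lets the CPU kernels pick the right 8-element vector wrapper per torch scalar type (FP32Vec8, FP16Vec8, or BF16Vec8 when ARM_BF16_SUPPORT is set) while doing the arithmetic in FP32. The function below is a hypothetical example of that pattern, not a kernel from this patch; it assumes the dispatch header shown earlier is csrc/cpu/cpu_types.hpp:

#include "cpu_types.hpp"  // assumed path of the dispatch header shown earlier

// Hypothetical example of the vec_t<> pattern: load in the storage dtype,
// widen to FP32, compute, narrow back, and handle the tail in scalar code.
template <typename scalar_t>
void add_bias(scalar_t* out, const scalar_t* in, float bias, int n) {
  using load_vec_t = vec_op::vec_t<scalar_t>;      // FP32Vec8 / FP16Vec8 / BF16Vec8
  constexpr int VEC = load_vec_t::get_elem_num();  // 8 for all of the above
  const vec_op::FP32Vec8 bias_vec(bias);
  int i = 0;
  for (; i + VEC <= n; i += VEC) {
    load_vec_t v(in + i);        // load as the storage dtype
    vec_op::FP32Vec8 f(v);       // widen to FP32
    load_vec_t r(f + bias_vec);  // compute and narrow back
    r.save(out + i);
  }
  for (; i < n; ++i) {
    out[i] = static_cast<scalar_t>(static_cast<float>(in[i]) + bias);
  }
}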
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
*reinterpret_cast<__fp16*>(ptr) = v;
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
reg.val[0] = vcombine_f16(low_0, high_0);
reg.val[1] = vcombine_f16(low_1, high_1);
};
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
reg = vcombine_f16(lower_half, upper_half);
};
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
};
#ifdef ARM_BF16_SUPPORT
inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) {
float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1]));
float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0]));
float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0]));
float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1]));
float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1]));
acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low);
acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high);
acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low);
acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high);
};
#endif
#ifdef ARM_BF16_SUPPORT
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
: reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {
};
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
: reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]),
v.reg.val[3])}) {};
#endif
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); };
#ifdef ARM_BF16_SUPPORT
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
*reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v);
};
#endif
}; // namespace vec_op

Some files were not shown because too many files have changed in this diff.