Compare commits

...

503 Commits

Author SHA1 Message Date
e53382cc2e Sage Moore fixes for full cuda graph support for DeepEP+DeepGEMM LL
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-24 11:21:52 -04:00
26d34eb67e refactor example - qwen3_reranker (#19847)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 14:03:20 +00:00
53da4cd397 [Bugfix][CPU] Fix InputBatch for pooling models in the CPU v1 (#20014)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-24 13:20:04 +00:00
9a3b88328f [PERF] Speedup of MRoPE prepare inputs (#19939)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-06-23 23:01:26 -07:00
3014c920da add some examples for other benchmark scripts (#19893)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-24 05:57:46 +00:00
0eed516951 [doc] Fix broken link in the installation for CPU (#19980)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-06-24 12:04:11 +08:00
ee5ad8d2c5 [Misc][Tools][Benchmark] Add profile to autotune script (#19711)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-24 00:59:41 +00:00
a738dbb2a1 Update test case parameter to have the throughput above 8.0 (#19994)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-24 00:18:10 +00:00
33d5e29be9 [TPU] Fix tpu model runner test (#19995)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-23 16:04:28 -07:00
4671ac6e2a [Bugfix][Benchmark] Fix Marlin benchmark (#19929) 2025-06-24 07:25:12 +09:00
dd2ccf8dde Feat Dynamic Quantization for MoE Layers in GPTQ Marlin Backend (#19395) 2025-06-24 07:23:28 +09:00
a3bc76e4b5 [CI/Build] Push latest tag for cpu and neuron docker image (#19897)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-23 14:15:37 -07:00
e6327c9b3e [Feature] Support sequence parallelism for static fp8 quantization (#19181)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-06-23 16:09:02 -04:00
d0132f025d [Misc] Add type alias ReqId and EngineId for better readability (#19880)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-23 12:57:57 -07:00
61f4fc5dc6 [Bugfix][v1] Fix step pooler implementation and step pooling usage in v1 (#19956)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 18:38:06 +00:00
68aaeb3749 [EP+DP] Optimize the little operations in the DeepGEMM + DeepEP low latency case (#19885)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-23 11:07:47 -07:00
c3649e4fee [Docs] Fix syntax highlighting of shell commands (#19870)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-23 17:59:09 +00:00
53243e5c42 [doc] improve readability for long commands (#19920)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 14:27:07 +00:00
a6e6604d32 [Bugfix] Fix CI bitsandbytes failure (#19969)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-23 21:30:55 +08:00
b82e0f82cb [doc] use MkDocs collapsible blocks - supplement (#19973)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 10:54:16 +00:00
5111642a6f [Doc] Update V1 status for decoder-only embedding models (#19952)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-23 09:31:06 +00:00
1bcd15edc7 [BugFix][P/D] Fix for cases where _recving_transfers can be cleaned up when *all* transfer done (#19874)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-22 22:41:53 -07:00
2ebff5b77c [P/D][NixlConnector] Support tp_size > num_kv_heads deployments (#19691)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-22 22:41:50 -07:00
f17aec0d63 [doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 05:24:23 +00:00
493c275352 Fix(models/siglip): Add compatibility for Gemma models quantized by llm-compressor (#19643)
Signed-off-by: Vensenmu <vensenmu@gmail.com>
2025-06-23 03:40:28 +00:00
f39ab2d4bd [Misc] Configurable timeout for execute_model RPC calls via env var (#19544)
Signed-off-by: jinqinn <goodqinjin@163.com>
2025-06-22 20:36:26 -07:00
4a0f7888a3 [Core] feat: Implement Priority Scheduling in V1 Engine (#19057)
Signed-off-by: amit <amit.man@gmail.com>
Co-authored-by: Roger Wang <Rogerw0108@gmail.com>
2025-06-22 20:18:08 -07:00
c4cf260677 [Perf][CLI] Improve overall startup time (#19941) 2025-06-22 23:11:22 +00:00
33d51f599e [BugFix] Add an env to disable moe chunking to work around compile incompatibility (#19642)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-22 15:17:49 -07:00
e91386cde1 [Chore] dedup logs (#19955) 2025-06-22 19:43:07 +00:00
2c11a29f0b [Misc] Simplify vllm bench cli subcommand implementation (#19948) 2025-06-22 12:34:48 -04:00
c76a506bd6 [Misc] Update model-specific PR tagging (#19949)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-06-22 12:16:08 +00:00
ec0db6f51c [doc] use snippets for contact us (#19944)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-22 10:26:13 +00:00
c305a2109d [CI/Build] Auto tag perf benchmarks related PRs (#19943)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-22 08:46:21 +00:00
202c5df935 [Benchmark] fix request loss if "ping" is returned (#19535)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-22 07:21:04 +00:00
2bb246b8f7 [MISC] add cpu_kvcache_space_bytes to CacheConfig (#19812)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-22 13:39:09 +08:00
4c409cabc2 [Misc] add vllm_config in __init__ (#19866)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-21 23:10:46 -04:00
3b1e4c6a23 [Docs] Add GPT2ForSequenceClassification to supported models in docs (#19932)
Signed-off-by: nie3e <adrcwiek@gmail.com>
2025-06-21 20:57:19 +00:00
2c5302fadd [Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-06-21 20:01:07 +00:00
caa680fd2e [doc] add contact us in community (#19922)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-21 17:29:06 +00:00
c3bf9bad11 [New model support]Support Tarsier2 (#19887)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-21 04:01:51 +00:00
6f170f11dd [Bugfix] Fix bnb 8bit model weights loading (#19917)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-21 03:29:09 +00:00
8ca81bb069 Fix: Check the type of params to be a Sequence not list. (#19910)
Signed-off-by: Rabin Adhikari <rabin.adk1@gmail.com>
2025-06-20 23:03:17 +00:00
e773a9e1c2 [Misc] Clean up useless code (#19889)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-20 21:09:09 +00:00
71baf85ae1 [Kernel] mark TorchSDPABackend swap_blocks NotImplementedError (#19749) 2025-06-20 18:18:11 +00:00
79f2f1c2a1 [CPU][CI] Fallback sliding window to v0 and fix CPU pooling model tests (#19901)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-20 15:30:36 +00:00
2e3e3c86dc Export NaNs in logits to scheduler_stats if output is corrupted (#18777)
Signed-off-by: Vlad Mihailescu <vtmihailescu@gmail.com>
2025-06-20 22:47:16 +08:00
7e8977fcd4 [custom_op][vllm-plugin] update custom_op class to use op_registry (#19164)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-06-20 07:44:56 -07:00
f1e840e842 [Model] GPT2ForSequenceClassification model (#19663)
Signed-off-by: nie3e <adrcwiek@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:07:41 +00:00
7771d1de88 [Fix] import regex instead of re (#19875)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-06-20 11:16:48 +00:00
71d1219545 [Kernel] correct cpu worker function parameter type (#19745)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-20 10:50:13 +00:00
e384f2f108 [Misc] refactor example - openai_transcription_client (#19851)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 08:02:21 +00:00
089a306f19 [Misc] update cuda version (#19526)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-20 07:25:15 +00:00
5e666f72cd [Bugfix][Ray] Set the cuda context eagerly in the ray worker (#19583) 2025-06-19 22:01:16 -07:00
e3a3e4db46 [Bugfix] Enable PP with AITER+V1 (#19822)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
2025-06-20 12:43:20 +08:00
e41bf15cd0 [Chore]: qwen3-moe-type-hints-mistake (#19860)
Co-authored-by: xinnan.hou <hxn02029096@alibaba-inc.com>
2025-06-19 21:43:07 -07:00
5aa4a015ce [Benchmark] Fix Value of type "SampleRequest" is not indexable (#18032)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-19 21:28:55 -07:00
b6bad3d186 [CI][Neuron] Fail and exit on first error (#19622)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-20 12:27:51 +08:00
ee9a1531aa [CI/Build][Bugfix] Fix deadlock on v1 engine test CI (#19872)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-20 09:51:07 +08:00
10d82f9ac5 [Benchmark][Bugfix] Fix Dataset Length Calculation (#19868)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-06-19 18:30:41 -07:00
ea10dd9d9e [Frontend] early return chat format resolution when specified (#19735) 2025-06-19 18:49:59 +00:00
ead2110297 [Core][Bugfix] Fix Online MM Beam Search (#19688)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-06-19 17:18:07 +00:00
01220ce89a [CI][CPU] Improve dummy Triton interfaces and fix the CPU CI (#19838)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-19 15:46:09 +00:00
6f68c49220 [Doc] Update V1 user guide for embedding models (#19842)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 09:43:27 +00:00
4719460644 Fixing Chunked Prefill Test. (#19762)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-19 01:36:16 -07:00
466166dcfd [Frontend] Add optional token-level progress bar to LLM.beam_search (#19301)
Signed-off-by: Ruosen Li <rxl190028@utdallas.edu>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Ubuntu <ubuntu@ip-172-31-71-179.ec2.internal>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-19 03:21:41 -04:00
1d0ae26c85 Add xLAM tool parser support (#17148) 2025-06-19 14:26:41 +08:00
6021999573 [Minor] Allow redirecting model path for HfRunner in test (#19795)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-18 23:04:10 -07:00
c7b370c603 raise exception for pin_lora (#19809)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-18 22:57:35 -07:00
aa20d10a91 [Misc] [ROCm] Prevent surplus tensor reshape (#19803)
Signed-off-by: Zsolt Borbely <zsolt.borbely@htecgroup.com>
2025-06-19 13:57:16 +08:00
2de12be428 [ROCm] [AITER] [Bugfix] Patch for AITER commit 648764942e552a8bb5fe16026703716a81f05374 (#18990)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-06-18 22:56:31 -07:00
83ca9ae47b Mark invariant normalizer in Gemma as non-persistent (#19788)
Signed-off-by: Yu-Hang Tang <Tang.Maxin@gmail.com>
2025-06-18 22:56:03 -07:00
e2148dc5ea [Bugfix] Add check_health to v1 async client. (#19821)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2025-06-18 21:47:01 -07:00
b1098b4072 [Bugfix] Fix the linter (#19826)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 21:44:41 -07:00
799397ee4f Support embedding models in V1 (#16188)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-18 21:36:33 -07:00
4959915089 [Quantization] Modify the logic of BNB double quantization (#19742)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-19 03:52:09 +00:00
8d1e89d946 [Misc][ROCm] Enforce no unused variable in ROCm C++ files (#19796)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 20:25:15 -07:00
36239f79dd Fix FA2 fallback for Blackwell V1 (#19781)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-19 09:53:55 +08:00
dfada85eee [Frontend] Expose custom args in OpenAI APIs (#16862)
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-18 17:41:11 -07:00
ed33349738 [BugFix] Fix use_cudagraph=False (#19612)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-19 08:23:12 +08:00
d49adea1f9 [Multimodal] Use fast processor for Qwen2/2.5-VL (#19789) 2025-06-18 15:49:40 -07:00
14fdd21d39 [Core] More fixes to MultiModalEmbeddings type handling (#19715)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 22:48:29 +00:00
04fefe7c9a [TPU] Update torch-xla version to include paged attention tuned block change (#19813)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-18 22:41:13 +00:00
3b523e38d9 [Core] Do not copy array during hashing (#19484)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-18 15:36:55 -07:00
16c16301c8 Disable "Forbid direct 'import triton'" check for vllm/triton_utils/importing.py in an extensible way (#19783)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 15:08:00 -07:00
9206d0ff01 docs: fix Slack bulletpoint in README (#19811)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-06-18 20:47:08 +00:00
a89209b78d [v1] Support mamba2 (#19327)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-18 20:34:15 +00:00
ffacb222cb [Docs] Add Huzaifa Sidhpurwala to vuln mgmt team doc (#19808)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 20:22:28 +00:00
12575cfa7a [Bugfix] fix RAY_CGRAPH_get_timeout is not set successfully (#19725)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-18 10:26:16 -07:00
8b6e1d639c [Hardware][AMD] integrate aiter chunked prefill into vllm (#18596)
Signed-off-by: fsx950223 <fsx950223@outlook.com>
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: fsx950223 <fsx950223@outlook.com>
Co-authored-by: charlifu <charlifu@amd.com>
2025-06-18 08:46:51 -07:00
735a9de71f [Qwen] Add tagging rule for Qwen related PRs (#19799)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 14:26:43 +00:00
257ab95439 [Platform] Allow platform use V1 Engine by default (#19792)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-18 13:03:36 +00:00
cca91a7a10 [doc] fix the incorrect label (#19787)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-18 10:30:58 +00:00
f04d604567 [Minor] Zero-initialize attn output buffer (#19784)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-18 06:59:27 +00:00
19a53b2783 [V1] Decouple GPU and TPU InputBatch (#19778)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 06:38:13 +00:00
eccdc8318c [V1][P/D] An native implementation of xPyD based on P2P NCCL (#18242)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-06-18 06:32:36 +00:00
5f52a84685 [V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 13:37:01 +08:00
d4629dc43f [Misc] Add __str__ for RequestStatus (#19780)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-18 03:03:01 +00:00
6e9cc73f67 [MISC] correct DeviceConfig device field static type analysis (#19699)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:50 -07:00
c53711bd63 [MISC] correct copy_blocks src_to_dists param type (#19696)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:06 -07:00
dac8cc49f4 [TPU] Update torch version to include paged attention kernel change (#19706)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-17 22:24:49 +00:00
a44b1c951d [Feature][ROCm] Add full graph capture support for TritonAttentionBackend (#19158)
Signed-off-by: charlifu <charlifu@amd.com>
2025-06-17 17:03:06 -04:00
b447624ee3 [Bugfix] Fix faulty triton importing logic when using Ray for DP (#19734)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 20:59:29 +00:00
cda92307c1 [Misc] Update lmcache connector with the latest connector apis (#19441)
Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
2025-06-17 19:57:54 +00:00
bf57ccc5c2 Remove sm120 arch from sm100 cutlass kernel arch list (#19716)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:39 -07:00
ffb2cd6b54 [Perf] Optimize moe_align_block_size CUDA kernel (#19572)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:26 -07:00
ca94d7fa00 [Bugfix] Update multimodel models mapping to fit new checkpoint after Transformers v4.52 (#19151)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 15:58:38 +00:00
5a1c2e15d8 [Mis] remove duplicate engine status checks (#19647)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-17 08:17:38 -07:00
4c8f64faa7 [V1][Kernel] Flashinfer HND KV cache layout (#19280)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-17 09:09:22 -04:00
93aee29fdb [doc] split "Other AI Accelerators" tabs (#19708) 2025-06-17 22:05:29 +09:00
154d063b9f [doc][mkdocs] Add edit button to documentation (#19637)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-17 11:10:31 +00:00
ccd7c05089 [Kernel] Add Split-KV Support to Unified Triton Attention Kernel (#19152)
Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-06-17 10:45:07 +00:00
c48c6c4008 Add a doc on how to update PyTorch version (#19705) 2025-06-17 18:10:37 +08:00
aed8468642 [Doc] Add missing llava family multi-image examples (#19698)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 07:05:21 +00:00
5c76b9cdaf [Core] add remove_seq_from_computed_blocks_tracker to BlockSpaceManager (#19686)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-17 04:40:58 +00:00
ddfed314f9 Fixes IMA for TP w/ flex-attention (#19712)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-17 04:01:50 +00:00
5b3ad5ecf2 [DOC] fix doc typos (#19600)
Signed-off-by: Di Liu <liu-di@sjtu.edu.cn>
2025-06-17 11:34:53 +08:00
ede5c4ebdf [Frontend] add chunking audio for > 30s audio (#19597)
Signed-off-by: nguyenhoangthuan99 <thuanhppro12@gmail.com>
2025-06-17 11:34:00 +08:00
07334959d8 [Wheel Size] Only build FA2 8.0+PTX (#19336) 2025-06-17 12:32:49 +09:00
119f683949 [doc] add project flag to gcloud TPU command (#19664)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-17 01:00:09 +00:00
0860087aff [Fix] Fall back to Gloo when NCCL backend is unavailable (#19641)
Signed-off-by: conroy-cheers <conroy@corncheese.org>
2025-06-17 08:42:14 +08:00
6bc7b57315 [Quantization] Remove FP4 emulation; Fall-back to marlin for device < 100 (#19563) 2025-06-16 17:33:51 -04:00
90f9c2eb5c [V1] Change return type on get_multimodal_embeddings() (#19446)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 13:32:15 -04:00
387bdf0ab9 [Model] Add support for MiniMaxM1ForCausalLM (shares architecture with MiniMaxText01ForCausalLM) (#19677)
Signed-off-by: QscQ <qscqesze@gmail.com>
2025-06-16 09:47:14 -07:00
5e5baa91aa [Kernels] Use empty for modular MoE workspaces (#19667)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-16 14:58:01 +00:00
836d4ce140 [Bugfix] fix missing 'finish_reason': null in streaming chat (#19662)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-16 14:10:39 +00:00
c3fec47bb7 [MISC] bump huggingface_hub pkg to 0.33.0 (#19547)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 05:22:28 -07:00
1173804dca [Bugfix] Fix TP inference for Flex attention backend (#19657)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 11:21:37 +00:00
4d5424029b [Feature]:Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)
Signed-off-by: Shawn Tan <shawntan@ibm.com>
2025-06-16 11:14:18 +00:00
3e7506975c [DOC] Add reasoning capability to vLLM streamlit code (#19557) 2025-06-16 07:09:12 -04:00
ee35e96ac3 [BugFix] Don't catch BaseException when dumping execute_model errors (#19626)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-16 11:01:08 +00:00
dec66d253b [Kernel] GGUF MMVQ kernel for multiple input vectors (#18754)
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-06-16 17:33:26 +08:00
8d120701fd [Docs] Move multiproc doc to v1 dir (#19651)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 09:10:12 +00:00
f40f763f12 [CI] Add mteb testing for rerank models (#19344) 2025-06-16 01:36:43 -07:00
26bc46ef89 [MISC] typo fix (#19672)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 07:18:49 +00:00
a77aea59fd [TPU] support attention head dim smaller than 128 (#19620)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-16 06:40:53 +00:00
b692e9cd07 [Misc] Fix skipped max-model-len validation when deriving max model length from tokenizer config (#19660)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-16 06:30:29 +00:00
367871a469 [Misc][Frontend] passthrough bad_words (#19564)
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-16 05:05:13 +00:00
92183b41f3 [Bugfix][Core] Prefix caching causes incorrect outputs due to outdated ComputedBlocksTracker (#18957)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-15 21:56:37 -07:00
c6703d1e0d [MISC] Remove unused variableds in C++ (#19609)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-15 20:05:28 -07:00
a5e7242d5f [Misc] Remove duplicate multiproc method setting for CPU platform (#19649)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 02:26:58 +00:00
91b2c17a55 [CI/Build] Fix torch nightly CI dependencies part 2 (#19589) 2025-06-15 20:01:10 +08:00
055915e6ce Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-15 01:05:05 -07:00
3d330c4c09 [Benchmark] Refactor benchmark script for fp8 & int8 (#19627)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-15 15:15:37 +08:00
0b73736a0d [Kernel] Raise verbose error and consolidate num_heads/num_kv_heads divisibility check (#19339)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-15 13:43:48 +08:00
ee1531bc38 [Bugfix][2/n] Fix speculative decoding CI - Fix test_ngram_e2e_greedy_correctness (#19644) 2025-06-14 21:15:41 -07:00
e13945f9dd [Perf] Further tunings for SM100 FP8 CUTLASS kernel (#19566) 2025-06-14 17:25:10 -07:00
08500011d3 [Fix] Convert kv_transfer_config from dict to KVTransferConfig (#19262) 2025-06-14 12:32:07 -07:00
861a0a0a39 [Bugfix] Don't attempt to use triton if no driver is active (#19561) 2025-06-14 12:30:54 -07:00
bc956b38d0 Only build CUTLASS MoE kernels on Hopper (#19648) 2025-06-14 11:44:15 -07:00
294fc1e2c9 [Hardware][NVIDIA][kernel] Fp4 MOE quant kernel optimization (#19500) 2025-06-14 09:34:28 -07:00
2db9044ab6 [Bugfix] Fix auto dtype casting for BatchFeature (#19316)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-14 15:13:08 +00:00
6fa718a460 [Misc] Modularize CLI Argument Parsing in Benchmark Scripts (#19593)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-14 16:54:52 +08:00
06be858828 [Bugfix] Fix the speculative decoding test by setting the target dtype (#19633) 2025-06-13 20:57:32 -07:00
d1e34cc9ac [V1][Metrics] Deprecate metrics with gpu_ prefix for non GPU specific metrics. (#18354)
Signed-off-by: Saheli Bhattacharjee <saheli@krai.ai>
2025-06-14 11:07:36 +08:00
bd517eb9fe [BugFix] Fix DP Coordinator incorrect debug log message (#19624)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-14 00:18:03 +00:00
d65668b4e8 Adding "AMD: Multi-step Tests" to amdproduction. (#19508)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-13 17:08:51 -07:00
aafbbd981f [torch.compile] Use custom ops when use_inductor=False (#19618) 2025-06-13 15:05:54 -07:00
0f0874515a [Doc] Add troubleshooting section to k8s deployment (#19377)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-13 21:47:51 +00:00
3597b06a4f [CUDA] Enable full cudagraph for FlashMLA (#18581)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-13 18:12:26 +00:00
1015296b79 [doc][mkdocs] fix the duplicate Supported features sections in GPU docs (#19606)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 16:25:08 +00:00
ce9dc02c93 [Refactor] Remove unused variables in moe_permute_unpermute_kernel.inl (#19573)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-13 06:12:15 -07:00
a24cb91600 [Model] Fix minimax model cache & lm_head precision (#19592)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-06-13 12:08:20 +00:00
7e8d97dd3f [BugFix] Honor enable_caching in connector-delayed kvcache load case (#19435)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-13 09:46:32 +00:00
d70bc7c029 [torch.compile] reorganize the cache directory to support compiling multiple models (#19064)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-13 15:23:25 +08:00
ce688ad46e use base version for version comparison (#19587)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-06-13 15:09:34 +08:00
cefdb9962d [Fix] The zip function in Python 3.9 does not have the strict argument (#19549)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-13 14:57:48 +08:00
ace5cdaff0 [Fix] bump mistral common to support magistral (#19533)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-12 22:28:12 -07:00
6458721108 [CPU] Refine default config for the CPU backend (#19539)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-13 13:27:39 +08:00
bb4a0decef [Misc] Correct broken docs link (#19553)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-12 22:27:13 -07:00
c707cfc12e [doc] fix incorrect link (#19586)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 04:26:09 +00:00
7b3c9ff91d [Doc] uses absolute links for structured outputs (#19582)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-13 03:35:17 +00:00
c68698b326 [Bugfix] Fix EAGLE vocab embedding for multimodal target model (#19570)
Signed-off-by: qizixi <qizixi@meta.com>
2025-06-12 23:09:19 -04:00
e3b12667d4 [BugFix] : Fix Batched DeepGemm Experts (#19515)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 20:43:02 -06:00
e6aab5de29 Revert "[Build/CI] Add tracing deps to vllm container image (#15224)" (#19378) 2025-06-12 17:26:40 -07:00
c57bb199b3 [V1] Resolve failed concurrent structured output requests (#19565)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-12 23:30:09 +00:00
dba68f9159 [Doc] Unify structured outputs examples (#18196)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-12 22:50:31 +00:00
a3319f4f04 [Bugfix] Enforce contiguous input for dynamic_per_token FP8/INT8 quant (#19452)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 15:39:15 -04:00
9d880f594d [Misc] Turn MOE_DP_CHUNK_SIZE into an env var (#19506) 2025-06-12 18:01:16 +00:00
017ef648e9 [Spec Decode][Benchmark] Generalize spec decode offline benchmark to more methods and datasets (#18847) 2025-06-12 10:30:56 -07:00
4b25ab14e2 [doc] Make top navigation sticky (#19540)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-12 15:48:11 +00:00
f98548b9da [torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
2025-06-12 08:31:04 -07:00
96846bb360 Fix TorchAOConfig skip layers (#19265)
Signed-off-by: mobicham <hicham@mobiuslabs.com>
2025-06-12 22:22:53 +08:00
b6efafd9e4 [Perf] Vectorize static / dynamic INT8 quant kernels (#19233)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-12 06:51:41 -07:00
1129e2b1ab [V1][NixlConnector] Drop num_blocks check (#19532)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-12 12:36:14 +00:00
c742438f8b [Doc] Add V1 column to supported models list (#19523)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-12 19:16:44 +08:00
73e2e0118f [Quantization] Improve AWQ logic (#19431)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 11:02:11 +00:00
c9280e6346 [Bugfix] Respect num-gpu-blocks-override in v1 (#19503)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-12 11:00:23 +00:00
af09b3f0a0 [Bugfix][V1] Allow manual FlashAttention for Blackwell (#19492)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 10:40:24 +00:00
4f6c42fa0a [Security] Prevent new imports of (cloud)pickle (#18018)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-12 10:30:17 +00:00
dff680001d Fix typo (#19525)
Signed-off-by: 2niuhe <carlton2tang@gmail.com>
2025-06-12 09:24:45 +00:00
2e090bd5df [AMD][Kernel][BugFix] fix test_rocm_compressed_tensors_w8a8 for rocm (#19509)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-12 07:14:24 +00:00
1b0b065eb5 [BugFix] Handle missing sep_token for Qwen3-Reranker in Score API (#19522)
Signed-off-by: strutive07 <strutive07@gmail.com>
2025-06-12 07:00:47 +00:00
d5bdf899e4 [BugFix] Work-around incremental detokenization edge case error (#19449)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-12 06:43:20 +00:00
7e3e74c97c [Frontend] Improve error message in tool_choice validation (#19239)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-12 01:13:00 -04:00
3f6341bf7f Add Triton Fused MoE kernel config for E=16 on B200 (#19518)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-12 04:31:51 +00:00
e5d35d62f5 [BugFix] Force registration of w8a8_block_fp8_matmul_deepgemm via lazy import (#19514)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 04:28:12 +00:00
2f1c19b245 [CI] change spell checker from codespell to typos (#18711)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-11 19:57:10 -07:00
42f52cc95b [CI/Build] Fix torch nightly CI dependencies (#19505)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-11 14:40:42 -07:00
97a9465bbc [UX] Add Feedback During CUDAGraph Capture (#19501)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-11 21:09:05 +00:00
c7ea0b56cd [AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-11 15:53:28 -04:00
29fa5cac1c [Kernels] Add activation chunking logic to FusedMoEModularKernel (#19168)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-11 12:53:10 -04:00
b2d9be6f7d [Docs] Remove WIP features in V1 guide (#19498)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-11 09:15:03 -07:00
04a55612dd [Misc] Fix misleading ROCm warning (#19486)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 00:12:10 +08:00
89b0f84e17 [doc] fix "Other AI accelerators" getting started page (#19457)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-11 16:11:17 +00:00
497a91e9f7 [CI] Update FlashInfer to 0.2.6.post1 (#19297)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 22:57:28 +08:00
943ffa5703 [Bugfix] Update the example code, make it work with the latest lmcache (#19453)
Signed-off-by: Runzhen Wang <wangrunzhen@gmail.com>
2025-06-11 12:42:20 +00:00
5c8d34a42c Support no privileged mode on CPU for docker and kubernetes deployments (#19241)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-06-11 04:11:47 -07:00
3c8694eabe Fix some typo (#19475)
Signed-off-by: ximing.wxm <ximing.wxm@antgroup.com>
Co-authored-by: ximing.wxm <ximing.wxm@antgroup.com>
2025-06-11 10:36:04 +00:00
7484e1fce2 Add cache to cuda get_device_capability (#19436)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 17:37:05 +08:00
a2142f0196 Support non-string values in JSON keys from CLI (#19471)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 09:34:04 +00:00
871d6b7c74 [Misc] Reduce warning message introduced in env_override (#19476)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 17:29:54 +08:00
29a38f0352 [Doc] Support "important" and "announcement" admonitions (#19479)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:39:58 -07:00
a5115f4ff5 [Doc] Fix quantization link titles (#19478)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:27:22 -07:00
68b4a26149 [Doc] Update V1 User Guide for Hardware and Models (#19474)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 00:49:06 -07:00
b8e809a057 [Kernel] Support deep_gemm for linear methods (#19085)
Signed-off-by: artetaout <lulala341@gmail.com>
2025-06-11 15:14:45 +08:00
5039ec2336 [ROCm] Add rules to automatically label ROCm related PRs (#19405)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 15:09:18 +08:00
7c644ab6d5 Fix Typo in Documentation and Function Name (#19442) 2025-06-10 22:44:11 -07:00
2d40665fe8 Add fused MOE config for Qwen3 30B A3B on B200 (#19455)
Signed-off-by: Junhao Li <junhao@ubicloud.com>
2025-06-11 13:43:46 +08:00
96ada386b7 [Misc] Remove unused MultiModalHasher.hash_prompt_mm_data (#19422)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-11 05:18:57 +00:00
1e473b3010 [CI] Disable failing GGUF model test (#19454)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 05:12:38 +00:00
2b1e2111b0 Fix test_max_model_len in tests/entrypoints/llm/test_generate.py (#19451)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 12:54:59 +08:00
a45b979d9f [BugFix] Fix docker build cpu-dev image error (#19394)
Signed-off-by: niu_he <carlton2tang@gmail.com>
2025-06-10 20:56:40 -07:00
3952731e8f [New Model]: Support Qwen3 Embedding & Reranker (#19260) 2025-06-10 20:07:30 -07:00
77f0d465d0 [BugFix] Allow use_cudagraph to work with dynamic VLLM_USE_V1 (#19390)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-11 07:54:41 +08:00
22c3c0aa4a Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B-FP8 (#19401)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-11 07:23:57 +08:00
33f8dba7c6 [Model] use AutoWeightsLoader for commandr (#19399)
Signed-off-by: py-andy-c <pychen1017@gmail.com>
2025-06-10 22:42:21 +00:00
5241ca50d6 [ROCm][V1] Adding ROCm to the list of plaforms using V1 by default (#19440)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-10 22:06:15 +00:00
da9b523ce1 [Docs] Note that alternative structured output backends are supported (#19426)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-10 16:20:00 +00:00
b6553be1bc [Misc] Slight improvement of the BNB (#19418)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-10 13:51:49 +00:00
64a9af5afa Simplify ep kernels installation (#19412)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-10 20:06:08 +08:00
e4248849ec [BugFix][CPU] Fix CPU CI by ignore collecting test_pixtral (#19411)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-10 12:02:40 +00:00
467bef18a3 [BugFix][FlashInfer] Fix attention backend interface mismatch with unexpected keyword use_irope (#19134)
Signed-off-by: Yunqiu Guo <guorachel@meta.com>
2025-06-10 16:48:51 +08:00
5f1ac1e1d1 Revert "[v1] Add fp32 support to v1 engine through flex attn" (#19404) 2025-06-10 01:30:20 -07:00
9368cc90b2 Automatically bind CPU OMP Threads of a rank to CPU ids of a NUMA node. (#17930)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-06-10 06:22:05 +00:00
32b3946bb4 Add clear documentation around the impact of debugging flag (#19369)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-10 06:16:09 +00:00
6b1391ca7e [Misc] refactor neuron_multimodal and profiling (#19397)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 06:12:42 +00:00
a3f66e75d1 Add security warning to bug report template (#19365)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-06-10 06:06:36 +00:00
319cb1e351 [Core] Batch multi modal input using pinned memory (#19169)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-10 13:44:59 +08:00
1efef71645 [Bugfix] Fix modelscope token passed in (#19389)
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-10 13:39:37 +08:00
646d62f636 [Core] Use tuple for kv cache group block ids (#19175)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-10 07:01:17 +02:00
6cd4ae8acd [Frontend] Add tqdm_leave_pbar to control progress bar visibility (#19357)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 04:55:09 +00:00
c016047ed7 Fix docs/mkdocs/hooks/remove_announcement.py (#19382) 2025-06-09 21:36:54 -07:00
9af6d22e4c Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-06-10 01:28:45 +00:00
4589b94032 [Bugfix] Fix benchmark_moe.py (#19016)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
2025-06-09 18:04:36 -07:00
cc867be19c [V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-10 08:40:01 +08:00
3a7cd627a8 [Misc] Fix a config typo in disable_hybrid_kv_cache_manager configuration (#19383)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-09 16:41:51 -07:00
8058c91108 [HOT-FIX] Add kv_sharing_target_layer_name argument to cutlass_mla backend (#19374)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-06-09 19:00:07 -04:00
7d44c469fe [TPU]Fix KV cache sharing tests (#19371) 2025-06-09 18:38:15 -04:00
31f58be96a [Frontend] Make TIMEOUT_KEEP_ALIVE configurable through env var (#18472)
Signed-off-by: liusiqian <liusiqian@tal.com>
2025-06-09 21:41:21 +00:00
ebb2f383b8 [Quantization] Bump compressed-tensors version (#19295)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-09 14:33:15 -07:00
c1c7dbbeeb [Bugfix][Core] Prevent token lengths exceeding max_model_len in V0 (#19348)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-09 23:01:29 +08:00
5cf2daea9a [Misc] Fixes and Optimizations for DeepEP + DeepGEMM combination. (#19298)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-09 10:50:39 -04:00
b8089195b4 [v1] Add fp32 support to v1 engine through flex attn (#19319)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-09 22:10:44 +08:00
770e5dcdb8 [full_graph] Fix query_start_loc padding (#19321)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-06-09 21:32:56 +08:00
c57c9415b1 [Docs] Fix a bullet list in usage/security.md (#19358)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-09 13:28:51 +00:00
01810f9236 [CI] Introduce rules for llama auto-label (#19323)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-09 20:05:42 +08:00
59abbd84f9 [Fix] Allow kernel compilation for CUDA capability 8.7 (#19328)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
2025-06-09 02:57:23 -07:00
95a6568b5c [CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-09 09:52:10 +00:00
0eca5eacd0 [Doc] Fix description in the Automatic Prefix Caching design doc (#19333)
Signed-off-by: cr7258 <chengzw258@163.com>
2025-06-09 17:30:02 +08:00
12e5829221 [doc] improve ci doc (#19307)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-09 07:26:12 +00:00
3a4d417707 [Misc] Cleanup compilation tests (#19343)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-09 15:05:44 +08:00
8335667c22 [Frontend] Remove unreachable code from llm.py (#19288)
Signed-off-by: KsuParkhamchuk <k.parkhamchuk@gmail.com>
2025-06-09 10:22:10 +08:00
e1c4380d4c [Misc] Add documentation update reminder to PR template (#19289)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-09 10:20:53 +08:00
e31ae3de36 [Deprecation] Remove inputs arg fallback in Engine classes (#18799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-09 10:19:56 +08:00
2ffb9b6e07 [Bugfix] model_max_length should consider max_model_len in tokenizer_config (#19201) 2025-06-08 07:17:53 -07:00
cda10fa3e2 [Multi Modal] Add an env var for message queue max chunk bytes (#19242)
Signed-off-by: yZhen <yZhen@fb.com>
Co-authored-by: yZhen <yZhen@fb.com>
2025-06-08 21:39:12 +08:00
c123bc33f9 [Quantization] Add compressed-tensors NVFP4 support (#18312) 2025-06-08 09:05:55 -04:00
b9a1791e2c [Hardware][POWER] Add IBM POWER11 Support to CPU Extension Detection (#19082)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-06-08 09:17:14 +00:00
989dcee981 Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B (#19315)
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-06-08 16:07:02 +08:00
3d64d366e0 [Misc] Change tests/compile to use VLLM_V1 by default (#19302)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-08 16:06:48 +08:00
eaa2e51088 [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-08 08:56:12 +08:00
d77f7fb871 [Bugfix]: Fix TypeError: 'float' object cannot be interpreted as an integer (#19283)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-08 08:16:31 +08:00
2d8476e465 [BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-07 10:34:51 -07:00
88be823d57 [AMD] Update compatible packaging version (#19309)
Signed-off-by: pramkuma <Pramendra.Kumar@amd.com>
2025-06-07 20:55:09 +08:00
4e4f63ad45 [Nit][Benchmark]Fix example in benchmark_serving_structured_output.py (#19311)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-06-07 18:25:38 +08:00
d2f0e7e615 [CI/Build] Improve Llama GGUF test robustness (#19287)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-07 17:23:28 +08:00
122cdca5f6 [Misc] refactor context extension (#19246)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-07 05:13:21 +00:00
cf02f9b283 Add FlexAttention to V1 (#16078)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-06 21:58:55 -07:00
c4296b1a27 [CI][PowerPC] Use a more appropriate way to select testcase in tests/models/language/pooling/test_embedding.py (#19253)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-06-07 11:52:52 +08:00
66c508b137 [TPU][Test] Add script to run benchmark on TPU for buildkite (#19039)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-06 20:10:24 -07:00
84166fee97 [Kernel] Integrate CUTLASS MoE kernel with PPLX (#18762)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-06 18:26:11 -07:00
6e0cd10f72 [Easy][Test] Simplify test_function_tool_use with multiple parametrizes (#19269)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-07 09:19:09 +08:00
e010688f50 [Build][ROCm] Update Dockerfile.rocm (#19296)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-06 19:35:16 -04:00
441b65d8c7 [Misc][Tools][Benchmark] Fix and improve auto tune script (#19163)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-06 23:31:19 +00:00
46ecc57973 [BugFix] Fix tpu_model_runner block_id concatenation (#19228)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:28:17 -07:00
b6a3a9f76d [Core] Fix abrupt request abort (#18485)
Signed-off-by: nicklucche <nlucches@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>

Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:27:59 -07:00
ca27f0f9c1 [Bugfix][Core] Update cancellation logic in generate() to handle Generator exits (#19225)
Co-authored-by: Adolfo Victoria <adovi@meta.com>
2025-06-06 20:17:54 +00:00
aad30bd306 [BugFix] Fix MultiConnector test after HMA changes (#19291)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 20:16:24 +00:00
94ecee6282 Fixed ppc build when it runs on non-RHEL based linux distros (#18422)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-06-06 11:54:26 -07:00
8267f9916f improve logits bias (#19041) 2025-06-06 19:59:25 +08:00
7353492a47 [Core] Raise when non-multi-instance DP clients target a DP rank (#19227)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-06 19:03:01 +08:00
7661e92ef8 [Model] Optimize nemotron_h implementation (#19249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-06 10:05:14 +00:00
f168b85725 Unit Test for run_dp_sharded_vision_model (#19103)
Signed-off-by: Siqi Yan <siqi@meta.com>
Co-authored-by: Siqi Yan <siqi@meta.com>
2025-06-06 16:24:02 +08:00
da511d54d8 Fix CompilationConfig repr (#19091)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-06 16:23:35 +08:00
65c69444b1 [Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:22:45 +08:00
94870359cd [Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-06-06 01:21:54 -07:00
0d49483ea9 [TPU] fix kv cache dtype in model runner (#19244)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 16:20:16 +08:00
90b78ec5f9 [v1][P/D] Fix a edge case in kv cache schedule (#19182)
Co-authored-by: jinghui <jinghui@fb.com>
2025-06-05 23:32:55 -07:00
91a2ef98ea [Chore] update CODEOWNERS (#19247)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-06 06:09:43 +00:00
3da2313d78 Support allowed_token_ids in ChatCompletionRequest (#19143)
Signed-off-by: Xu Song <xusong.vip@gmail.com>
2025-06-06 05:06:48 +00:00
b61dc5f972 [TPU] update torch_xla pin (#19231)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 04:27:38 +00:00
f8a1a2d108 [v1] Hybrid Memory Allocator (#17996)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-05 20:47:09 -07:00
3465b87ef8 [Bugfix] Fix EAGLE vocab embedding construction for Llama 70B (#19033)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-06-05 19:10:08 -07:00
c8134bea15 Fix AOPerModuleConfig name changes (#18869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-06-05 18:51:32 -07:00
cb6d572e85 [Model] NemotronH support (#18863)
Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
2025-06-05 21:29:28 +00:00
87360308b7 [V1] Use FlashInfer by default on Blackwell GPUs (#19118) 2025-06-05 15:40:39 -04:00
aa49f14832 [Quantization] Skip Fp4 Test for compressed-tensors (#19217) 2025-06-05 18:21:53 +00:00
9ef9173cfa [P/D][NixlConnector] Enable FlashInfer backend (#19090) 2025-06-05 17:10:15 +00:00
85e2b7bb13 [MISC][Bugfix] Use less CPU when message queue has been empty for some time (#16226)
Signed-off-by: Povilas Kanapickas <povilas@radix.lt>
2025-06-05 16:53:08 +00:00
61059bee40 [Hardware][NVIDIA] FP4 MoE kernel optimization (#19110)
Signed-off-by: Chiyue Wei <chiyuew@nvidia.com>
Co-authored-by: Chiyue Wei <chiyuew@nvidia.com>
2025-06-05 09:48:26 -07:00
ec89524f50 Add H20-3e fused MoE kernel tuning configs for DeepSeek-R1/V3 (#19205) 2025-06-05 16:38:54 +00:00
f20f9f063b [mistral_common] Add v11 tokenizer (#19193)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-06-05 08:27:41 -07:00
9bc8bb07cf [Bugfix] properly catch PIL-related errors for vision models when incorrect data urls are provided (#19202)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-06-05 12:59:28 +00:00
1aeb925f34 [Frontend] improve vllm run-batch --help display (#19187)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 11:16:25 +00:00
188a4590d8 [Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-05 11:14:32 +00:00
18093084be [Misc] Remove unnecessary fallback to prefill-decode attention (#19138)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-06-05 16:08:26 +08:00
da40380214 [Build] Annotate wheel and container path for release workflow (#19162)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-04 23:24:56 -07:00
8fc57501d3 [Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-05 06:24:24 +00:00
af7fc84fd2 [BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-05 13:41:25 +08:00
0678b52251 Handle non-serializable objects when dumping benchmark results (#19114) 2025-06-04 22:40:04 -07:00
25b918eee6 [Torch Nightly]add missing dependency (#18770)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-06-04 21:56:12 -07:00
a408820f2f [Bugfix] Fix port handling in make_zmq_path (#19117) 2025-06-04 21:00:59 -06:00
c56ed8bb0e [Bugfix][Nixl] Fix full prefix cache hit bug (#18632)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-05 02:07:32 +00:00
78dcf56cb3 [doc] small fix (#19167)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 09:13:50 +08:00
b2fac67130 [P/D] Heterogeneous TP (#18833)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-04 23:25:34 +00:00
23027e2daf [Misc] refactor: simplify EngineCoreClient.make_async_mp_client in AsyncLLM (#18817)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-04 15:37:25 -07:00
c3fd4d669a [Kernel] Integrate batched/masked deepgemm kernel (#19111)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-04 21:59:18 +00:00
ef3f98b59f [Bugfix] fix v1 cpu worker fails on macOS (#19121) 2025-06-04 20:17:38 +00:00
7ee2590478 [TPU] Update dynamo dump file name in compilation test (#19108)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 16:13:43 -04:00
53a5a0ce30 [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-04 10:46:28 -07:00
d459fae0a2 [Bugfix][EP+DP] Fix internode check (#19112)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-04 23:39:23 +08:00
c8dcc15921 Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-04 08:26:47 -07:00
8f4ffbd373 [Doc] Update V1 Guide for embedding models (#19141)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 22:57:55 +08:00
5f2cd251d2 Sm100 blockwise fp8 swap ab (#18564) 2025-06-04 07:48:45 -07:00
02658c2dfe Add DeepSeek-R1-0528 function call chat template (#18874)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-04 13:24:18 +00:00
01dc9a76db [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 04:49:20 -07:00
35cf32df30 Improve the output precision of embedding models (#19092) 2025-06-04 11:48:57 +00:00
8711bc5e68 [Misc] Add packages for benchmark as extra dependency (#19089)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-04 04:18:48 -07:00
2669a0d7b5 Fix ValueError: Missing value for tag key(s): model_name,engine. (#19113)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-06-04 17:10:45 +08:00
8e972d9c44 [TPU] Skip hanging tests (#19115)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 01:43:00 -07:00
3336c8cfbe Fix #19130 (#19132)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-04 01:42:06 -07:00
b124e1085b [Bugfix] Fix FA3 full cuda graph correctness (#19106)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-03 23:10:15 -07:00
41aa578428 [NVIDIA] Add Cutlass MLA backend (#17625) 2025-06-03 21:40:26 -07:00
8d646c2e53 [Cleanup][v1]:remote guided-decoding-backend for example (#19059)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-04 04:23:26 +00:00
5d6d1adf15 [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) 2025-06-03 21:13:01 -07:00
1409ef9134 [Core] Cast multimodal input in hf processor (#18862)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-03 20:24:56 -07:00
4555143ea7 [CPU] V1 support for the CPU backend (#16441) 2025-06-03 18:43:01 -07:00
52dceb172d [Docs] Add developer doc about CI failures (#18782)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-04 01:09:13 +00:00
abd7df2fca [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) 2025-06-03 17:15:18 -07:00
b712be98c7 feat: add data parallel rank to KVEventBatch (#18925) 2025-06-03 17:14:20 -07:00
a8da78eac9 [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-04 00:14:06 +00:00
5d96533e22 [Bugfix][P/D] Fix Prefix Cache Bug (#18411)
Signed-off-by: nicklucche <nlucches@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-06-03 23:53:16 +00:00
4de790fcad [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-03 23:27:24 +00:00
b5fd9506c1 [Bugfix] get_num_blocks_to_allocate with null_block (#19031)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 15:30:55 -07:00
135cf55cd1 [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) 2025-06-03 15:26:33 -07:00
6cac54f4d1 [v1] Re-init input batch for multiple kv cache groups (#18654)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 21:41:36 +00:00
6865fe0074 Fix interaction between Optional and Annotated in CLI typing (#19093)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikun@apache.org>
2025-06-03 21:07:19 +00:00
e31446b6c8 [Perf] Tune scaled_fp8_quant by increasing vectorization (#18844)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 13:48:25 -07:00
bdf13965ab [V1] Support cross-layer KV sharing (#18212)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-06-03 20:33:07 +00:00
fa98d77773 [Kernel] DeepEP dispatch-combine kernel integration (#18434)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-03 12:30:02 -07:00
01eee40536 [doc] update docker version (#19074)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 19:08:21 +00:00
19bdaf32b1 [Doc] Readme standardization (#18695)
Co-authored-by: Soren Dreano <soren@numind.ai>
2025-06-03 11:50:55 -07:00
02f0c7b220 [Misc] Add SPDX-FileCopyrightText (#19100)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-06-03 11:20:17 -07:00
d054da1992 [Misc] fix: add miss best_of param validation (#18555)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-03 11:02:07 -07:00
4b7817c119 [Misc] Add missing _Backend enums (#19081)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-03 16:15:16 +00:00
d00dd65cd4 [Doc] Improve the Pull Request template with key components (#19086)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 23:44:34 +08:00
d81edded69 [Bugfix] disable processor cache (#19068)
Signed-off-by: raushan <raushan@huggingface.co>
2025-06-03 15:06:04 +00:00
476844d44c Fix underscores in dict keys passed via CLI (#19030)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-06-03 14:39:24 +00:00
4e68ae5e59 [CI/Build] Remove V0 LoRA test (#19066)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 14:30:18 +00:00
4e88723f32 [doc] clarify windows support (#19088)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-03 21:42:17 +08:00
118ff92111 [Doc] Update V1 user guide for embedding and enc-dec models (#19060)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-03 02:29:41 -07:00
ec2dcd80bc [Misc] Update WeightsMapper for qwen2-vl/qwen2.5-vl (#19054)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-03 09:08:20 +00:00
42243fbda0 [Doc] Add InternVL LoRA support (#19055)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 09:08:03 +00:00
6d18ed2a2e Update docker docs with ARM CUDA cross-compile (#19037)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-06-03 08:21:53 +00:00
f32fcd9444 [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 08:01:48 +00:00
d32aa2e670 [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 00:16:17 -07:00
cc977286e7 Reduce logs in CLI scripts and plugin loader (#18970)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 06:00:45 +00:00
17430e3653 [bugfix] small fix logic issue (#18999)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 05:35:12 +00:00
1282bd812e Add tarsier model support (#18985)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-03 13:13:13 +08:00
bdce64f236 [V1] Support DP with Ray (#18779) 2025-06-02 21:15:13 -07:00
9e6f61e8c3 [ROCm][Build] Clean up the ROCm build (#19040)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 20:47:47 -07:00
8655f47f37 [CPU][CI] Re-enable the CPU CI tests (#19046)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-02 20:46:47 -07:00
4ce42f9204 Adding "LoRA Test %N" to AMD production tests (#18929)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-06-02 20:46:44 -07:00
8a57872b2a [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-03 11:36:51 +08:00
5bc1ad6cee [Doc] Remove duplicate TOCs during MkDocs migration (#19021)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-02 19:49:48 -07:00
9112b443a0 [Hardware][TPU] Initial support of model parallelism with single worker using SPMD (#18011)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Hossein Sarshar <hossein.sarshar@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-06-03 00:06:20 +00:00
c57d577e8d add an absolute path for run.sh (#18258)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-02 19:38:23 +00:00
ca2f6b9c30 [Bugfix][Model] Attempt to fix eagle in V0. (#18978)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 08:15:53 -07:00
20133cfee2 [Frontend] enable custom logging for the uvicorn server (OpenAI API server) (#18403)
Signed-off-by: François Paupier <francois.paupier@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-02 15:04:23 +00:00
ebb1ec9318 [Model] enable data parallel for Llama4 vision encoder (#18368)
Signed-off-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
Co-authored-by: yZhen <yZhen@fb.com>
Co-authored-by: yzhen <yzhen@devgpu093.cco2.facebook.com>
2025-06-02 19:22:54 +08:00
5b168b6d7a [doc] add pytest tips (#19010)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-02 11:07:26 +00:00
9760fd8f6a [Core] Support inplace model weights loading (#18745)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-02 17:38:50 +08:00
b9f61e1387 [Bugfix][Nixl] Fix DP Metadata Handshake (#19008)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-02 03:30:41 +00:00
d6fd3a33b8 [Misc] reuse num_tokens_across_dp of get_dp_padding to avoid unnecessary dp all reduce in set_forward_context (#18935)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-01 19:41:18 +00:00
432ec9926e [doc] wrong output (#19000)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-01 11:26:14 +00:00
2b102d51ad [BugFix] Fix incorrect metrics shutdown error log message (#18992)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-01 11:42:23 +08:00
aa54a7bf7b [BugFix] fix data parallel construct ipv6 url addres (#18991)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-06-01 11:42:10 +08:00
2ad6194a02 Let max_num_batched_tokens use human_readable_int for large numbers (#18968)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-01 11:41:29 +08:00
c594cbf565 [doc] small fix - mkdocs (#18996)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 20:23:43 -07:00
a35ca765a5 [LoRA] Support dynamically initialize packed_modules_mapping for VLM with arbitrary components (#18987)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-01 11:06:57 +08:00
6aa8f9a4e7 [Core] Rework dtype resolution (#18751)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-01 11:04:23 +08:00
1bc86a3da1 [Bugfix] Fix EAGLE3 broken logits (#18909)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-05-31 19:58:07 -07:00
bbfa0c61d1 [Misc][Benchmark] Add support for CustomDataset (#18511) 2025-05-31 19:07:38 +00:00
20079c6e36 [Misc] add return token strs for tokenize (#18941)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 18:00:11 +00:00
9a1b9b99d7 [BugFix] Fix multi-node offline data-parallel (#18981)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-31 08:34:52 -07:00
8bf507d766 [P/D] NixlConnector use cache device index for memory registration (#18969)
Signed-off-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
2025-05-31 11:19:18 -04:00
306d60401d [ROCm][Kernel] Add gfx950 support for skinny gemms (#18010)
Signed-off-by: charlifu <charlifu@amd.com>
2025-05-31 07:40:05 -07:00
f2c3f66d59 [Bugfix] Fix for issue 17396 (#18773)
Signed-off-by: Fred Reiss <frreiss@us.ibm.com>
2025-05-31 11:58:17 +00:00
0f5e0d567e [FEAT][ROCm] Add AITER grouped topk for DeepSeekV2 (#18825)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-31 03:39:31 -07:00
c55d804672 [BugFix] Pydantic part 2 (#18911)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-31 03:39:28 -07:00
749f5bdd38 [doc] fix the list rendering issue - security.md (#18982)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-31 10:39:21 +00:00
2a50ef5760 [Neuron] Add Multi-Modal model support for Neuron (#18921)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Ashraf Mahgoub <ashymahg@amazon.com>
Co-authored-by: Rohith Nallamaddi <nalrohit@amazon.com>
Co-authored-by: FeliciaLuo <luof@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-31 10:39:11 +00:00
b8b904795d fix security issue of logging llm output (#18980)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-05-31 10:38:56 +00:00
ba5111f237 [Bugfix]: Fix the incompatibility issue with Structured Outputs when Thinking is disabled (#18879)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-31 09:20:54 +00:00
1e123529d7 [Misc] Fix estimated max model len msg (#18966)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-05-31 16:43:44 +08:00
dff80b0e42 [Frontend] Add rerank support to run_batch endpoint (#16278)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-05-31 07:40:01 +00:00
7782464a17 create util function for batched arange (#18937) 2025-05-31 13:50:38 +08:00
0f71e24034 [Docs] Correct multiprocessing design doc (#18964)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-31 01:30:15 +00:00
1dab4d5718 Tool parser regex timeout handling (#18960)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-30 21:02:54 +00:00
7f21e8052b [Misc] add group_size is -1 in awq quantization (#18910)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-30 17:34:22 +00:00
5a8641638a [VLM] Add PP support and fix GPTQ inference for Ovis models (#18958)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-30 17:11:44 +00:00
f49239cb45 Benchmark script for fp8 vs bf16 gemm (#17126)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:56:11 -06:00
2dbe8c0774 [Perf] API-server scaleout with many-to-many server-engine comms (#17546) 2025-05-30 08:17:00 -07:00
84ec470fca Improve "failed to get the hash of the compiled graph" error (#18956)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 15:00:54 +00:00
b29ca5c4d5 [Docs] Update SECURITY.md with link to our security guide (#18961)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-05-30 07:37:27 -07:00
ec6833c5e9 [doc] show the count for fork and watch (#18950)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 06:45:59 -07:00
e1fadf1197 [Feature] minicpm eagle support (#18943)
Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com>
Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com>
2025-05-30 06:45:56 -07:00
43ff405b90 [CI/Build] remove regex from build dependencies (#18945)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-30 04:02:50 -07:00
fba02e3bd1 [Bugfix][TPU] Fix tpu model runner testcase failure (#18810)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 18:04:03 +08:00
4577fc9abb [Misc]Fix typo (#18947) 2025-05-30 02:21:35 -07:00
5f1d0c8118 [Bugfix][Failing Test] Fix test_vllm_port.py (#18618)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 17:13:47 +08:00
c3bb9f2331 [Model] Use in-place adds in SigLIP (#18922)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-30 17:12:59 +08:00
8f8900cee9 [doc] add mkdocs doc (#18930)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-30 07:58:44 +00:00
6acb7a6285 [Misc]Fix benchmarks/README.md for speculative decoding (#18897)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-30 07:58:04 +00:00
4f4a6b844a [Deprecation] Remove mean pooling default for Qwen2EmbeddingModel (#18913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 06:53:37 +00:00
4d0a1541be [Bugfix] Remove NVFP4 scales assertions to fix load_format=dummy (#18861)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 13:37:36 +08:00
77b6e74fe2 [ROCm] Remove unnecessary assertion of max_model_len in ROCM_AITER_MLA attention backend. (#18938)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-05-29 22:33:17 -07:00
H
5acf828d99 [docs] fix: fix markdown syntax (#18927) 2025-05-30 05:20:48 +00:00
3987e2ae96 [Model] Use AutoWeightsLoader for mamba2 (#18918)
Signed-off-by: iLeGend <824040212@qq.com>
2025-05-30 04:50:10 +00:00
77164dad5e [Bugfix] Consistent ascii handling in tool parsers (#18883)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-05-30 04:44:43 +00:00
3de3eadf5b improve the robustness of parsing vlms config in AutoRound (#18894)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
2025-05-29 19:24:47 -07:00
3132290a14 [TPU][CI/CD] Clean up docker for TPU tests. (#18926)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-30 10:24:19 +08:00
1aa2f81b43 [Misc] Update type annotation for rotary embedding base (#18914)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-30 10:17:01 +08:00
d54af615d5 [Bugfix] Fix PP default fallback behavior for V1 (#18915)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-05-30 10:13:17 +08:00
a1cc9f33a3 [TPU] remove transpose ops in moe kernel (#18923)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-29 23:00:11 +00:00
a521ef06e5 Use standalone_compile by default in torch >= 2.8.0 (#18846)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-30 06:41:58 +08:00
64eaf5fe05 [P/D] NixlConnector DP fixes (#18903)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:08:40 +00:00
d1d61f3351 [BugFix] Make DP work with connector-delayed new requests (#18559)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Will Eaton <weaton@redhat.com>
2025-05-29 18:04:18 +00:00
32ce3cf7c9 [V1] Allocate kv_cache with stride order for V1 (#18775)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-05-29 17:54:16 +00:00
d58f9c7f7a [Misc] Remove duplicate init for self.vllm_config (#18896)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-05-29 17:26:07 +00:00
c29034037d [Deprecation] Disallow pos-args other than model when initializing LLM (#18802)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-29 09:36:58 -07:00
1b7cfd5a36 [ROCm][V0][Attention] Revert to the previous FA triton kernel (#18226)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 12:13:18 -04:00
da4b69d0b4 [Attention][V1] Toggle for v1 attention backend (#18275)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-05-29 10:48:24 -04:00
c9479b2920 [Bugfix] Fix the failing gte embedding test (#18720)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-29 07:39:25 -07:00
6f2909405e [Doc] Fix codeblocks formatting in LoRA adapters documentation (#18907)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-05-29 07:38:55 -07:00
b169d5f7b6 [Misc][Tools][Benchmark] Add benchmark_serving supports for llama.cpp. (#18692)
Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
2025-05-29 20:02:08 +08:00
f8977c233f Fix an error in dummy weight loading for quantization models (#18855)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-29 03:07:20 -07:00
f274581f44 [BugFix] Update pydantic to fix error on python 3.10 (#18852)
Signed-off-by: luka <luka@neuralmagic.com>
2025-05-29 03:05:46 -07:00
0b1447f890 [Bugfix] Ensure tensors are contiguous during serialisation (#18860)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-29 03:05:20 -07:00
24d0ef8970 [Misc] Replace TODO in serving transcription (#18895)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-05-29 02:58:14 -07:00
7fcfd954ff [Bugfix] Fix misleading information in the documentation (#18845)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 02:54:14 -07:00
e740d07f07 [doc] add CLI doc (#18871)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-29 09:51:36 +00:00
a652e71dd0 [Doc] Remove redundant spaces from compatibility_matrix.md (#18891)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-05-29 02:51:20 -07:00
34d6c447c4 [LoRA] Add LoRA support for InternVL (#18842)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-05-29 08:46:24 +00:00
972eddf7c9 [Neuron] Add multi-LoRA support for Neuron. (#18284)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-29 16:41:22 +08:00
fd7bb88d72 Fixes a dead link in nightly benchmark readme (#18856)
Signed-off-by: Brent Salisbury <bsalisbu@redhat.com>
2025-05-29 04:41:39 +00:00
3c49dbdd03 Skip device and quant Pydantic validation to make plugin device work (#18843)
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-28 20:12:30 -07:00
1661a9c28f [Doc][Neuron] Update documentation for Neuron (#18868)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
2025-05-28 19:44:01 -07:00
8e882ffdc0 [Bugfix][TPU] fix moe custom kernel import (#18853)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-05-28 19:34:19 -07:00
26b4fa45be Add ability to use CUDAGraphs with use_inductor=False (#17345)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-29 10:16:52 +08:00
515b413ebf Prevent the cross-encoder logic from being applied to classification tasks (#18838)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-05-28 19:16:17 -07:00
269d901734 [Bugfix][ROCm] fix the power of 2 exception from triton_unified_attention.py when running llama4 models and unit test fix (#18100)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-05-29 07:21:46 +08:00
7951d78738 [Core] Enable CUDA graphs for DP + All2All kernels (#18724)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-05-28 22:55:30 +00:00
6dbe5b5c93 Remove checks for None for fields which should never be None (#17985)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 21:32:19 +00:00
643622ba46 [Hardware][TPU][V1] Multi-LoRA Optimisations for the V1 TPU backend (#15655)
Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Signed-off-by: xihajun <junfan@krai.ai>
Signed-off-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Signed-off-by: Jorge de Freitas <jorge@krai.ai>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: xihajun <junfan@krai.ai>
Co-authored-by: Jorge de Freitas <jorge.de-freitas22@imperial.ac.uk>
Co-authored-by: Jorge de Freitas <jorge@krai.ai>
2025-05-28 19:59:09 +00:00
a09c7ca9f2 [Chore][Spec Decode] Update check NoneType instead of assigning variables (#18836)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 18:57:19 +00:00
0e98964e94 [V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-05-28 18:54:12 +00:00
c68b5c63eb [Misc] fix olmoe model layer can't laod in tp gt 1 (#18828)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-05-28 17:36:21 +00:00
fced756923 [Chore] update ty configuration (#18839)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-05-28 08:59:11 -07:00
321331b8ae [Core] Add Lora Support to Beam Search (#18346)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-05-28 08:58:24 -07:00
6e4cea1cc5 decrement server_load on listen for disconnect (#18784)
Signed-off-by: Daniel Salib <danielsalib@meta.com>
2025-05-28 22:15:12 +08:00
435fa95444 [Frontend] add run batch to CLI (#18804)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-05-28 07:08:57 -07:00
4c2b38ce9e Enable Pydantic mypy checks and convert configs to Pydantic dataclasses (#17599)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-28 12:46:04 +00:00
d781930f90 [Platform][Dist] Make torch distributed process group extendable (#18763)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-28 10:52:34 +00:00
ce75efeecb [BugFix] FA2 MLA Accuracy Issue (#18807)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-05-28 08:59:39 +00:00
aa42561e40 Fix PiecewiseCompileInterpreter (#17338)
Signed-off-by: rzou <zou3519@gmail.com>
2025-05-28 08:40:53 +00:00
de65fc8e1e [CI] improve embed testing (#18747) 2025-05-28 00:16:35 -07:00
0c492b7824 [Deprecation] Remove fallbacks for Embeddings API (#18795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:09:04 +08:00
0f0926b43f [Deprecation] Remove unused sync methods in async_timeout (#18792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:48 +08:00
7f2c1a87e9 [Deprecation] Require overriding get_dummy_text and get_dummy_mm_data (#18796)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-05-28 15:08:35 +08:00
b78f844a67 [Bugfix][FailingTest]Fix test_model_load_with_params.py (#18758)
Signed-off-by: rabi <ramishra@redhat.com>
2025-05-28 05:42:54 +00:00
5e13c07d00 [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (2) (#18781)
Signed-off-by: Ronald Xu <ronaldxu@amazon.com>
2025-05-28 05:09:14 +00:00
774c5fde30 [V1] fix torch profiling for V1 offline scenarios (#18445)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-05-28 04:16:30 +00:00
9a21e331ff [Bugfix]: correctly propagate errors message caught at the chat_templating step to the client (#18769)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-05-28 03:35:43 +00:00
3e9ce609bd [Bugfix] Fix nomic max_model_len (#18755) 2025-05-27 20:29:53 -07:00
794ae1f551 [rocm] Fix wrong attention log (#18764)
Signed-off-by: Felix Marty <felmarty@amd.com>
2025-05-27 19:45:41 -07:00
d73a9457a5 [Core] Improve Tensor serialisation (#18774)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-05-28 09:46:21 +08:00
a3896c7f02 [Build] Fixes for CMake install (#18570) 2025-05-27 20:49:24 -04:00
51e98e4ffd [Bugfix] Disable prefix caching by default for benchmark (#18771)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-05-28 08:18:09 +08:00
e56f44d9ec Support datasets in vllm bench serve and sync with benchmark_[serving,datasets].py (#18566) 2025-05-27 19:59:48 -04:00
e0cbad4e30 [Neuron] Support quantization on neuron (#18283)
Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
2025-05-27 22:10:33 +00:00
b48d5cca16 [CI/Build] [TPU] Fix TPU CI exit code (#18282)
Signed-off-by: Carol Zheng <cazheng@google.com>
2025-05-27 14:54:59 -07:00
1712 changed files with 47287 additions and 15144 deletions

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

View File

@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.

View File

@ -16,7 +16,7 @@ Please download the visualization scripts in the post
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```console
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json

View File

@ -1,5 +1,6 @@
steps:
- label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
@ -11,6 +12,7 @@ steps:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
@ -28,6 +30,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents:
queue: cpu_queue_postmerge
commands:
@ -44,6 +47,7 @@ steps:
- label: "Build release image"
depends_on: block-release-image-build
id: build-release-image
agents:
queue: cpu_queue_postmerge
commands:
@ -51,6 +55,18 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
@ -70,9 +86,10 @@ steps:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: "release-version"
key: release-version
- block: "Build CPU release image"
key: block-cpu-release-image-build
@ -85,6 +102,7 @@ steps:
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
@ -100,6 +118,7 @@ steps:
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -0,0 +1,31 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

View File

@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \

View File

@ -7,6 +7,7 @@ set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true
fi
podman system prune -f
@ -37,7 +38,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@ -6,75 +6,82 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
@ -83,7 +90,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
@ -91,4 +98,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"

View File

@ -2,102 +2,184 @@
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_1: Running test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_5: Running examples/offline_inference/tpu.py; \
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
echo TEST_5_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_7: Running test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_9: Running test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_10: Running test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_12: Running test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
"
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -0,0 +1,102 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg USE_SCCACHE=1 \
--build-arg GIT_REPO_CHECK=0 \
--tag vllm/vllm-tpu-bm \
--progress plain -f docker/Dockerfile.tpu .
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -0,0 +1,94 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@ -89,7 +89,7 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
@ -145,6 +145,7 @@ steps:
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
- tests/v1/engine/test_engine_core_client.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -154,6 +155,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -175,6 +177,11 @@ steps:
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing
##### fast check tests #####
@ -199,8 +206,9 @@ steps:
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
@ -263,6 +271,15 @@ steps:
commands:
- pytest -v -s prefix_caching
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 36min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
@ -274,17 +291,6 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
@ -297,7 +303,7 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/lora
- tests/lora
@ -313,6 +319,7 @@ steps:
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
@ -328,6 +335,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
@ -397,6 +405,17 @@ steps:
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/.buildkite"
@ -420,6 +439,9 @@ steps:
- vllm/model_executor/layers/quantization
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
@ -617,9 +639,11 @@ steps:
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
@ -660,7 +684,7 @@ steps:
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:

16
.github/CODEOWNERS vendored
View File

@ -10,15 +10,17 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb
/vllm/v1/structured_output @mgoin @russellb @aarnphm
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor
mkdocs.yaml @hmellor

View File

@ -8,6 +8,16 @@ body:
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
attributes:
label: Your current environment

View File

@ -1,6 +1,18 @@
FILL IN THE PR DESCRIPTION HERE
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
FIX #xxxx (*link existing issues this PR will resolve*)
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
## Purpose
## Test Plan
## Test Result
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

62
.github/mergify.yml vendored
View File

@ -36,6 +36,21 @@ pull_request_rules:
add:
- frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions:
label:
add:
- llama
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
@ -51,6 +66,53 @@ pull_request_rules:
add:
- multi-modality
- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm
- name: label-structured-output
description: Automatically apply structured-output label
conditions:

2
.gitignore vendored
View File

@ -200,5 +200,5 @@ benchmarks/**/*.json
actionlint
shellcheck*/
# Ingore moe/marlin_moe gen code
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -11,6 +11,8 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
@ -18,12 +20,10 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
@ -58,7 +58,7 @@ repos:
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
@ -115,6 +115,11 @@ repos:
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
@ -143,6 +148,13 @@ repos:
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@ -179,9 +182,6 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
#
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -243,6 +242,7 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -542,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
@ -785,5 +785,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()

View File

@ -58,8 +58,8 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
- Speculative decoding
- Chunked prefill
@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
- Prefix caching support
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g. E5-Mistral)
- Embedding Models (e.g., E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -154,12 +154,14 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
<!-- --8<-- [end:contact-us] -->
## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)

View File

@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@ -64,6 +64,12 @@ become available.
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody>
</table>
@ -124,6 +130,38 @@ P99 ITL (ms): 8.39
==================================================
```
### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models
```bash
@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed 42
```
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```
@ -339,3 +387,178 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
```
---
## Example - Structured Output Benchmark
Benchmark the performance of structured output generation (JSON, grammar, regex).
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
---
## Example - Long Document QA Throughput Benchmark
Benchmark the performance of long document question-answering with prefix caching.
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
---
## Example - Prefix Caching Benchmark
Benchmark the efficiency of automatic prefix caching.
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
---
## Example - Request Prioritization Benchmark
Benchmark the performance of request prioritization in vLLM.
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```

View File

@ -10,11 +10,16 @@
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file.
@ -30,31 +35,31 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT_PCT=0
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "result file$ $RESULT"
echo "result file: $RESULT"
echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install datasets
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
@ -64,53 +69,88 @@ best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
# start the server
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization 0.98 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size 1 \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
echo
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -118,30 +158,32 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start from request-rate as int(through_put) + 1
request_rate=$((${through_put%.*} + 1))
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
@ -149,19 +191,18 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -173,13 +214,19 @@ run_benchmark() {
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$through_put
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -188,25 +235,42 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20)
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
for num_batched_tokens in $num_batched_tokens_list; do
run_benchmark $num_seqs $num_batched_tokens
exit 0
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
@ -324,7 +325,7 @@ async def async_request_openai_completions(
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
if usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
@ -403,8 +404,14 @@ async def async_request_openai_chat_completions(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payload and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
@ -611,6 +618,7 @@ ASYNC_REQUEST_FUNCS = {
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -9,9 +10,6 @@ generation. Supported dataset types include:
- BurstGPT
- HuggingFace
- VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
"""
import base64
@ -355,7 +353,7 @@ class RandomDataset(BenchmarkDataset):
: input_lens[i]
]
prompt = tokenizer.decode(re_encoded_sequence)
total_input_len = prefix_len + int(input_lens[i])
total_input_len = len(re_encoded_sequence)
requests.append(
SampleRequest(
prompt=prompt,
@ -442,6 +440,97 @@ class ShareGPTDataset(BenchmarkDataset):
return samples
# -----------------------------------------------------------------------------
# Custom Dataset Implementation
# -----------------------------------------------------------------------------
class CustomDataset(BenchmarkDataset):
"""
Implements the Custom dataset. Loads data from a JSONL file and generates
sample requests based on conversation turns. E.g.,
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
# self.data will be a list of dictionaries
# e.g., [{"prompt": "What is the capital of India?"}, ...]
# This will be the standardized format which load_data()
# has to convert into depending on the filetype of dataset_path.
# sample() will assume this standardized format of self.data
self.data = []
# Load the JSONL file
if self.dataset_path.endswith(".jsonl"):
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
# check if the JSONL file has a 'prompt' column
if "prompt" not in jsonl_data.columns:
raise ValueError("JSONL file must contain a 'prompt' column.")
# Convert each row to a dictionary and append to self.data
# This will convert the DataFrame to a list of dictionaries
# where each dictionary corresponds to a row in the DataFrame.
# This is the standardized format we want for self.data
for _, row in jsonl_data.iterrows():
self.data.append(row.to_dict())
else:
raise NotImplementedError(
"Only JSONL format is supported for CustomDataset."
)
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
**kwargs,
) -> list:
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["prompt"]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
@ -776,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""
import argparse
@ -6,13 +7,12 @@ import dataclasses
import json
import os
import time
from pathlib import Path
from typing import Any, Optional
import numpy as np
import torch
from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
@ -80,17 +80,9 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)
),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
llm.start_profile()
llm_generate()
llm.stop_profile()
else:
start_time = time.perf_counter()
llm_generate()
@ -103,11 +95,7 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None)
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = (
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
)
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
@ -135,7 +123,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
@ -164,15 +152,6 @@ if __name__ == "__main__":
action="store_true",
help="profile the generation process of a single batch",
)
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser.add_argument(
"--output-json",
type=str,
@ -192,5 +171,16 @@ if __name__ == "__main__":
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Offline benchmark to test the long document QA throughput.
@ -141,7 +142,7 @@ def main(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or "
"without automatic prefix caching."
@ -191,5 +192,11 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the efficiency of prefix caching.
@ -217,7 +218,7 @@ def main(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or without "
"automatic prefix caching."
@ -267,5 +268,11 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization."""
import argparse
@ -160,7 +161,7 @@ def main(args: argparse.Namespace):
json.dump(results, f, indent=4)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -203,6 +204,12 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
@ -60,6 +61,7 @@ from benchmark_dataset import (
ASRDataset,
BurstGPTDataset,
ConversationDataset,
CustomDataset,
HuggingFaceDataset,
InstructCoderDataset,
MTBenchDataset,
@ -627,7 +629,16 @@ def main(args: argparse.Namespace):
"'--dataset-path' if required."
)
if args.dataset_name == "sonnet":
if args.dataset_name == "custom":
dataset = CustomDataset(dataset_path=args.dataset_path)
input_requests = dataset.sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
)
elif args.dataset_name == "sonnet":
dataset = SonnetDataset(dataset_path=args.dataset_path)
# For the "sonnet" dataset, formatting depends on the backend.
if args.backend == "openai-chat":
@ -762,6 +773,10 @@ def main(args: argparse.Namespace):
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
if args.backend == "llama.cpp":
# Disable prompt caching in llama.cpp backend
sampling_params["cache_prompt"] = False
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
@ -834,6 +849,8 @@ def main(args: argparse.Namespace):
]:
if field in result_json:
del result_json[field]
if field in benchmark_result:
del benchmark_result[field]
# Save to file
base_model_id = model_id.split("/")[-1]
@ -846,6 +863,7 @@ def main(args: argparse.Namespace):
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name)
with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@ -857,7 +875,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -886,7 +904,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument(
@ -1056,6 +1074,19 @@ if __name__ == "__main__":
)
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
custom_group.add_argument(
"--custom-output-len",
type=int,
default=256,
help="Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help="Skip applying chat template to prompt, used only for custom dataset.",
)
sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument(
"--sonnet-input-len",
@ -1194,6 +1225,10 @@ if __name__ == "__main__":
"script chooses a LoRA module at random.",
)
args = parser.parse_args()
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
@ -11,7 +12,6 @@ On the client side, run:
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \
--num-prompts 1000
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
json.dump(results, outfile, indent=4)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -1034,5 +1034,10 @@ if __name__ == "__main__":
help="Ratio of Structured Outputs requests",
)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
@ -96,7 +97,7 @@ def run_vllm(
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0][2]
output_len = requests[0].expected_output_len
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
@ -594,7 +595,7 @@ def validate_args(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend",
@ -716,6 +717,12 @@ if __name__ == "__main__":
)
parser = AsyncEngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import itertools

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl
import time

View File

@ -0,0 +1,158 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"fp8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"fp8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"fp8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"fp8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"fp8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"fp8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"fp8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"fp8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
else:
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
return b_fp8.t(), scale_b_fp8
def build_fp8_runner(cfg, a, b, dtype, device):
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
scale_a_const = (
torch.ones(1, device=device, dtype=torch.float32)
if cfg["a"] == "tensor"
else None
)
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
else:
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
def run():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
if cfg["a"] == "tensor":
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_fp8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_fp8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -0,0 +1,169 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"int8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"int8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"int8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"int8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"int8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"int8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"int8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"int8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
def _quant_weight(b, w_type, device):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
assert scale_b_int8.numel() == 1
else: # channel
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
assert scale_b_int8.numel() == b.shape[0]
return b_int8.t(), scale_b_int8
def build_int8_runner(cfg, a, b, dtype, device):
# quant before running the kernel
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
scale_a_const = None
if cfg["a"] == "tensor":
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
# no quant, create activation ahead
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
else: # token
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
def run_quant():
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
# dynamic quant, create activation inside
if cfg["a"] == "tensor":
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
else: # token
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=[k for k in _enabled],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs INT8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_int8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_int8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -90,7 +91,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16
w1_blockscale = torch.empty(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark
@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts,
fused_topk,
)
@ -69,18 +70,9 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@ -121,10 +113,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int,
):
for _ in range(num_repeats):
@ -132,14 +120,10 @@ def bench_run(
a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -152,10 +136,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -164,14 +144,10 @@ def bench_run(
a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -217,10 +193,6 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
)
torch.cuda.synchronize()
@ -229,8 +201,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -249,18 +221,12 @@ def bench_run(
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@ -278,8 +244,8 @@ def bench_run(
# Warmup
run_triton_moe(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -290,7 +256,7 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -321,16 +287,12 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup,
)
results.append(
benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark
@ -21,8 +22,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MARLIN_SUPPORTED_GROUP_SIZES,
query_marlin_supported_quant_types,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
rand_marlin_weight_fp4_like,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
marlin_quant_fp8_torch,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace,
awq_marlin_quantize,
marlin_quantize,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@ -34,7 +43,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights,
sort_weights,
)
from vllm.scalar_type import ScalarType
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -56,80 +65,144 @@ def bench_run(
size_n: int,
):
label = "Quant Matmul"
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
)
print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
# Marlin quant
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, quant_type, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
# GPTQ quant
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
def gen_marlin_params():
# Marlin quant
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
if quant_type == scalar_types.float4_e2m1f:
if group_size != 16 or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
b.T, group_size
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
if repack_supported:
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
return q_w_gptq, repack_sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params()
)
# Prepare
marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
)
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = {
# Gen params
@ -139,15 +212,14 @@ def bench_run(
"size_n": size_n,
"size_k": size_k,
"a": a,
"a_tmp": a_tmp,
# Marlin params
"marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w,
"marlin_s": marlin_s,
"marlin_s2": marlin_s2,
"marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
"marlin_workspace": marlin_workspace,
"is_k_full": is_k_full,
# Marlin_24 params
@ -160,12 +232,12 @@ def bench_run(
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params
"qw_reorder": qw_reorder if as_supported_case else None,
"s_reorder": s_reorder if as_supported_case else None,
"zp_reorder": zp_reorder if as_supported_case else None,
"sm_count": sm_count if as_supported_case else None,
"sm_version": sm_version if as_supported_case else None,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
"qw_reorder": qw_reorder,
"s_reorder": s_reorder,
"zp_reorder": zp_reorder,
"sm_count": sm_count,
"sm_version": sm_version,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@ -176,7 +248,7 @@ def bench_run(
min_run_time = 1
# Warmup pytorch
for i in range(5):
for _ in range(5):
torch.matmul(a, marlin_w_ref)
results.append(
@ -191,17 +263,17 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp16",
description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time)
)
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -209,10 +281,7 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time)
)
if (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
):
if marlin_24_supported:
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
@ -223,17 +292,18 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time)
)
results.append(
benchmark.Timer(
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time)
)
if repack_supported:
results.append(
benchmark.Timer(
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time)
)
if as_supported_case:
if allspark_supported:
results.append(
benchmark.Timer(
stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
@ -249,7 +319,6 @@ def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
for model in args.models:
@ -277,14 +346,17 @@ def main(args):
):
continue
for quant_type in query_marlin_supported_quant_types(False):
for quant_type in query_marlin_supported_quant_types():
if (
len(args.limit_num_bits) > 0
and quant_type.size_bits not in args.limit_num_bits
):
continue
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
for group_size in (
MARLIN_SUPPORTED_GROUP_SIZES
+ FP4_MARLIN_SUPPORTED_GROUP_SIZES
):
if (
len(args.limit_group_size) > 0
and group_size not in args.limit_group_size

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -6,7 +7,6 @@ import time
from contextlib import nullcontext
from datetime import datetime
from itertools import product
from types import SimpleNamespace
from typing import Any, TypedDict
import ray
@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int],
block_quant_shape: list[int],
) -> None:
dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix:
config = getattr(config, args.model_prefix)
config = SimpleNamespace(**config)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
)
from vllm.triton_utils import triton
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
return torch.stack(
[
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
for _ in range(num_tokens)
]
)
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
expert_ids_triton = torch.zeros(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
sorted_ids_vllm.fill_(topk_ids.numel())
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm", "triton"], # "triton"
line_names=["VLLM", "Triton"], # "Triton"
plot_name="moe-align-block-size-performance",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
"""Benchmark function for Triton."""
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--num_experts",
type=int,
default=64,
choices=[8, 16, 32, 64, 128, 256],
)
parser.add_argument(
"--topk",
type=int,
default=8,
choices=[2, 4, 8],
help="Top-k value for correctness check.",
)
args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from typing import Any, TypedDict

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional
@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
seed: int,
device: str,
max_position: int = 8192,
base: int = 10000,
base: float = 10000,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import pickle

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
([16384, 106496], 1),
([53248, 16384], 0),
],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
}

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile
import pstats

View File

@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native")
if (POWER9_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@ -46,22 +46,38 @@ else()
endif()
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
# Make sure vllm-flash-attn install rules are nested under vllm/
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements don't hurt when installing neither component.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION vllm/vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION vllm/vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
add_custom_target(
hipify${NAME}
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
BYPRODUCTS ${HIP_SRCS}
COMMENT "Running hipify on ${NAME} extension source files.")
@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
"-fno-gpu-rdc")
endif()

View File

@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

View File

@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -193,4 +190,4 @@ void paged_attention_v1(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP

View File

@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -203,4 +200,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP

View File

@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
}
template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) {
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
const int size) {
T max = max_data[0];
for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i];
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
if (partition_num == 1) continue;
reducePartitonSoftmax(
reducePartitionSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions +

View File

@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temproal load
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temproal load
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temproal save
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif

View File

@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_CHECK(false,
"numa_migrate_pages failed. errno: " + std::to_string(errno));
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
}
// restrict memory allocation node.
@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
return ss.str();
}
#endif
#endif

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
from typing import Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import itertools
import os

View File

@ -13,232 +13,45 @@
namespace vllm {
namespace moe {
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
// don't worry about overflow because num_experts is relatively small
return row * total_col + col;
}
} // namespace
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* expert_ids,
int32_t* total_tokens_post_pad,
int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
token_cnts_t* tokens_cnts =
(token_cnts_t*)(shared_mem + num_experts +
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// TODO(simon): this is temporarily adapted from
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
// we did this to unblock Deepseek V3 but there should be a better
// implementation to manage shared memory.
template <typename scalar_t>
__global__ void moe_align_block_size_global_mem_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
size_t numel, int32_t* __restrict__ cumsum) {
extern __shared__ int32_t shared_counts[];
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum) {
__shared__ int32_t shared_counts[32][8];
const int warp_id = threadIdx.x / 32;
const int experts_per_warp = 8;
const int warp_id = threadIdx.x / WARP_SIZE;
const int my_expert_start = warp_id * experts_per_warp;
// Initialize shared_counts for this warp's experts
for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) {
shared_counts[warp_id][i] = 0;
if (my_expert_start + i < padded_num_experts) {
shared_counts[warp_id * experts_per_warp + i] = 0;
}
}
__syncthreads();
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i];
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
}
__syncthreads();
// Single thread computes cumulative sum and total tokens
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx][expert_offset];
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(
__syncthreads();
// Assign expert IDs to blocks
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* cumsum_buffer,
size_t numel) {
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
}
}
template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
}
for (size_t i = tid; i < numel; i += stride) {
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
}
__syncthreads();
if (threadIdx.x < num_experts) {
tokens_cnts[threadIdx.x] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[i * num_experts + threadIdx.x] +=
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
}
}
__syncthreads();
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] =
cumsum[i - 1] +
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
int32_t rank_post_pad =
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[threadIdx.x * num_experts + expert_id];
}
}
} // namespace moe
} // namespace vllm
// taken from
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem_i32 =
((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
const int32_t shared_mem_i16 =
((num_thread + 1) * num_experts) * sizeof(uint16_t) +
(num_experts + 1) * sizeof(int32_t);
bool use_global_memory = false;
bool use_i16 = false; // Use uint16_t for shared memory token counts
if (shared_mem_i32 < device_max_shared_mem) {
// Do nothing in this case. We're all set to use int32_t token counts
} else if (shared_mem_i16 < device_max_shared_mem &&
topk_ids.numel() <= 65535) {
// when nelements of topk_ids is smaller than 65535 (max value of uint16),
// element value of token_cnts would also smaller than 65535,
// so we can use uint16 as dtype of token_cnts
use_i16 = true;
} else {
use_global_memory = true;
}
if (use_global_memory) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
auto options_int = torch::TensorOptions()
.dtype(torch::kInt)
.device(topk_ids.device());
torch::Tensor token_cnts_buffer =
torch::empty({(num_experts + 1) * num_experts}, options_int);
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
auto kernel =
vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
kernel<<<1, num_thread, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>());
});
} else if (use_i16) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// set dynamic shared mem
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i16));
kernel<<<1, num_thread, shared_mem_i16, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
} else {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i32));
kernel<<<1, num_thread, shared_mem_i32, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
}
}
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(num_experts == 256,
"sgl_moe_align_block_size kernel only supports deepseek v3.");
int64_t padded_num_experts =
((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
int experts_per_warp = WARP_SIZE;
int threads = 1024;
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `cumsum` tensors
auto options_int =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
torch::Tensor cumsum_buffer =
torch::zeros({num_experts + 1}, options_int);
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
auto align_kernel =
vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
align_kernel<<<1, 1024, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
if (small_batch_expert_mode) {
const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem_size =
((threads + 1) * num_experts + (num_experts + 1)) *
sizeof(int32_t);
const int block_threads = 256;
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
auto small_batch_expert_kernel =
vllm::moe::moe_align_block_size_small_batch_expert_kernel<
scalar_t>;
small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
} else {
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
size_t shared_mem_size =
num_warps * experts_per_warp * sizeof(int32_t);
align_kernel<<<1, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
padded_num_experts, experts_per_warp, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
const int block_threads = std::min(256, (int)threads);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel =
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
}
});
}

View File

@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
@ -30,4 +24,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
int64_t BLOCK_SIZE_K, int64_t bit);
#endif
bool moe_permute_unpermute_supported();
bool moe_permute_unpermute_supported();
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);

View File

@ -12,7 +12,7 @@ void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indicies, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@ -27,15 +27,15 @@ void moe_permute(
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indicies must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@ -71,7 +71,7 @@ void moe_permute(
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
@ -130,11 +130,67 @@ void moe_unpermute(
});
}
template <typename T>
__global__ void shuffleInputRowsKernel(const T* input,
const int32_t* dst2src_map, T* output,
int64_t num_src_rows,
int64_t num_dst_rows, int64_t num_cols) {
int64_t dest_row_idx = blockIdx.x;
int64_t const source_row_idx = dst2src_map[dest_row_idx];
if (blockIdx.x < num_dst_rows) {
// Load 128-bits per thread
constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
auto* dest_row_ptr =
reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
}
}
}
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor) {
TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
"Input and output tensors must have the same data type");
auto stream = at::cuda::getCurrentCUDAStream().stream();
int64_t const blocks = output_tensor.size(0);
int64_t const threads = 256;
int64_t const num_dest_rows = output_tensor.size(0);
int64_t const num_src_rows = input_tensor.size(0);
int64_t const num_cols = input_tensor.size(1);
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
"num_cols must be divisible by 128 / "
"sizeof(input_tensor.scalar_type()) / 8");
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
dst2src_map.data_ptr<int32_t>(),
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
num_dest_rows, num_cols);
});
}
#else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@ -147,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,

View File

@ -14,12 +14,13 @@
__VA_ARGS__(); \
break; \
}
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
#define MOE_DISPATCH(TYPE, ...) \
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -39,6 +40,11 @@ template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
using type = __nv_bfloat16;
};
// uint8 for packed fp4
template <>
struct ScalarType2CudaType<at::ScalarType::Byte> {
using type = uint8_t;
};
// #if __CUDA_ARCH__ >= 890
// fp8

View File

@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel(
int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
int64_t align_expanded_row_accumulate = 0;
if constexpr (ALIGN_BLOCK_SIZE) {
// load g2s
for (int idx = threadIdx.x; idx < num_local_experts + 1;
@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
int64_t const source_k_rank = expanded_source_row / num_rows;
int64_t const source_row = expanded_source_row % num_rows;
auto const* source_row_ptr =
@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel(
elem_index += stride) {
ComputeElem thread_output;
thread_output.fill(0);
float row_rescale{0.f};
for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows;
int64_t const expanded_permuted_row =
@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel(
auto const* expanded_permuted_rows_row_ptr =
expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;
int64_t const expert_idx = expert_for_source_row[k_offset];
ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
expanded_permuted_rows_row_ptr[elem_index]);
thread_output = thread_output + row_scale * (expert_result);

View File

@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
gating_output, nullptr, topk_weights, topk_indicies, \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, \
stream);
@ -433,7 +433,7 @@ template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
float* topk_weights,
IndType* topk_indicies,
IndType* topk_indices,
int* token_expert_indices,
float* softmax_workspace,
const int num_tokens,
@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
}
}
@ -516,9 +516,8 @@ void topk_softmax(
topk,
stream);
}
else
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
@ -530,4 +529,17 @@ void topk_softmax(
topk,
stream);
}
else {
assert(topk_indices.scalar_type() == at::ScalarType::Int64);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
}

View File

@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@ -66,7 +57,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@ -81,6 +72,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
// Row shuffle for MoE
m.def(
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
#endif
}

View File

@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
double epsilon);
@ -231,7 +236,8 @@ void cutlass_moe_mm(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k);
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,

View File

@ -274,7 +274,6 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
[[maybe_unused]] int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

Some files were not shown because too many files have changed in this diff Show More