Compare commits

...

2096 Commits

Author SHA1 Message Date
4db5176d97 bump version to v0.5.4 (#7139) 2024-08-05 14:39:48 -07:00
4cf1dc39be [Bugfix][CI/Build] Fix CUTLASS FetchContent (#7171) 2024-08-05 14:22:57 -07:00
6e4852ce28 [CI/Build] Suppress divide-by-zero and missing return statement warnings (#7001) 2024-08-05 16:00:01 -04:00
8571ac4672 [Kernel] Update CUTLASS to 3.5.1 (#7085) 2024-08-05 15:13:43 -04:00
997cf78308 [Misc] Fix typo in GroupCoordinator.recv() (#7167)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-05 11:10:16 -07:00
57f560aa23 [BugFix] Use args.trust_remote_code (#7121) 2024-08-05 09:26:14 -07:00
003f8ee128 [BugFix] Use IP4 localhost form for zmq bind (#7163) 2024-08-05 08:41:03 -07:00
e9630458c7 [SpecDecode] Support FlashInfer in DraftModelRunner (#6926) 2024-08-05 08:05:05 -07:00
82a1b1a82b [Speculative decoding] Add periodic log with time spent in proposal/scoring/verification (#6963) 2024-08-05 08:46:44 +00:00
c0d8f1636c [Model] SiglipVisionModel ported from transformers (#6942)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-08-05 06:22:12 +00:00
cc08fc7225 [Frontend] Reapply "Factor out code for running uvicorn" (#7095) 2024-08-04 20:40:51 -07:00
7b86e7c9cd [Model] Add multi-image support for minicpmv (#7122)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-05 09:23:17 +08:00
f80ab3521c Clean up remaining Punica C information (#7027) 2024-08-04 15:37:08 -07:00
16a1cc9bb2 [misc][distributed] improve libcudart.so finding (#7127) 2024-08-04 11:31:51 -07:00
b1c9aa3daa [Bugfix] [SpecDecode] Default speculative_draft_tensor_parallel_size to 1 when using MLPSpeculator (#7105)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-08-04 07:13:18 -07:00
179a6a36f2 [Model]Refactor MiniCPMV (#7020)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-04 08:12:41 +00:00
83c644fe7e [core][misc] simply output processing with shortcut code path (#7117) 2024-08-04 00:22:19 -07:00
9fadc7b7a0 [misc] add zmq in collect env (#7119) 2024-08-03 22:03:46 -07:00
654bc5ca49 Support for guided decoding for offline LLM (#6878)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-04 03:12:09 +00:00
825b044863 [Frontend] Warn if user max_model_len is greater than derived max_model_len (#7080)
Signed-off-by: Jefferson Fialho <jfialho@ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-03 16:01:38 -07:00
44dcb52e39 [ci][test] finalize fork_new_process_for_each_test (#7114) 2024-08-03 10:44:53 -07:00
67d745cc68 [CI] Temporarily turn off H100 performance benchmark (#7104) 2024-08-02 23:52:44 -07:00
99d7cabd7b [LoRA] ReplicatedLinear support LoRA (#7081) 2024-08-02 22:40:19 -07:00
fb2c1c86c1 [Bugfix] Fix block table for seqs that have prefix cache hits (#7018) 2024-08-02 22:38:15 -07:00
0c25435daa [Model] Refactor and decouple weight loading logic for InternVL2 model (#7067) 2024-08-02 22:36:14 -07:00
a0d164567c [ci][distributed] disable ray dag tests (#7099) 2024-08-02 22:32:04 -07:00
04e5583425 [ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-08-02 21:33:53 -07:00
8c025fa703 [Frontend] Factor out chat message parsing (#7055) 2024-08-02 21:31:27 -07:00
69ea15e5cc [ci][distributed] shorten wait time if server hangs (#7098) 2024-08-02 21:05:16 -07:00
ed812a73fa [ Frontend ] Multiprocessing for OpenAI Server with zeromq (#6883)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Joe Runde <joe@joerun.de>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-08-02 18:27:28 -07:00
708989341e [misc] add a flag to enable compile (#7092) 2024-08-02 16:18:45 -07:00
22e718ff1a [Misc] Revive to use loopback address for driver IP (#7091)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-02 15:50:00 -07:00
05308891e2 [Core] Pipeline parallel with Ray ADAG (#6837)
Support pipeline-parallelism with Ray accelerated DAG.

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-02 13:55:40 -07:00
a8d604ca2a [Misc] Disambiguate quantized types via a new ScalarType (#6396) 2024-08-02 13:51:58 -07:00
b482b9a5b1 [CI/Build] Add support for Python 3.12 (#7035) 2024-08-02 13:51:22 -07:00
806949514a [ci] set timeout for test_oot_registration.py (#7082) 2024-08-02 10:03:24 -07:00
c16eaac500 [Hardware][Intel CPU] Update torch 2.4.0 for CPU backend (#6931) 2024-08-02 08:55:58 -07:00
db35186391 [Core] Comment out unused code in sampler (#7023) 2024-08-02 00:58:26 -07:00
660dea1235 [cuda][misc] remove error_on_invalid_device_count_status (#7069) 2024-08-02 00:14:21 -07:00
cf2a1a4d9d Fix tracing.py (#7065) 2024-08-01 23:28:00 -07:00
252357793d [ci][distributed] try to fix pp test (#7054) 2024-08-01 22:03:12 -07:00
3bb4b1e4cd [mypy] Speed up mypy checking (#7056) 2024-08-01 19:49:43 -07:00
954f7305a1 [Kernel] Fix input for flashinfer prefill wrapper. (#7008) 2024-08-01 18:44:16 -07:00
6ce01f3066 [Performance] Optimize get_seqs (#7051) 2024-08-01 18:29:52 -07:00
6a11fdfbb8 [CI/Build][Bugfix] Fix CUTLASS header-only line (#7034) 2024-08-01 13:51:15 -07:00
805a8a75f2 [Misc] Support attention logits soft-capping with flash-attn (#7022) 2024-08-01 13:14:37 -07:00
562e580abc Update run-amd-test.sh (#7044) 2024-08-01 13:12:37 -07:00
fc912e0886 [Models] Support Qwen model with PP (#6974)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-08-01 12:40:43 -07:00
f4fd390f5d [Bugfix] Lower gemma's unloaded_params exception to warning (#7002) 2024-08-01 12:01:07 -07:00
fb3db61688 [CI/Build] Remove sparseml requirement from testing (#7037) 2024-08-01 12:00:51 -07:00
2dd34371a6 [Bugfix] Fix RMSNorm forward in InternViT attention qk_layernorm (#6992) 2024-08-01 12:00:28 -07:00
7e0861bd0b [CI/Build] Update PyTorch to 2.4.0 (#6951)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-01 11:11:24 -07:00
a72a424b3e [Build/CI] Fixing Docker Hub quota issue. (#7043) 2024-08-01 11:07:37 -07:00
c8a7e93273 [core][scheduler] simplify and improve scheduler (#6867) 2024-07-31 23:51:09 -07:00
3c10591ef2 [Bugfix] Set SamplingParams.max_tokens for OpenAI requests if not provided by user (#6954) 2024-07-31 21:13:34 -07:00
0437492ea9 PP comm optimization: replace send with partial send + allgather (#6695)
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
2024-07-31 20:15:42 -07:00
630dd9e0ae [Bugfix][Model] Skip loading lm_head weights if using tie_word_embeddings (#6758)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-31 19:49:11 -07:00
23993a7997 [Bugfix][TPU] Do not use torch.Generator for TPUs (#6981) 2024-07-31 18:50:28 -07:00
1d2e7fb73f [Model] Pipeline parallel support for Qwen2 (#6924) 2024-07-31 18:49:51 -07:00
7ecee34321 [Kernel][RFC] Refactor the punica kernel based on Triton (#5036) 2024-07-31 17:12:24 -07:00
7eb0cb4a14 Revert "[Frontend] Factor out code for running uvicorn" (#7012)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-07-31 16:34:26 -07:00
a0dce9383a [Misc] Add compressed-tensors to optimized quant list (#7006) 2024-07-31 14:40:44 -07:00
35e9c12bfa [Kernel] Tuned int8 Cutlass Kernels for SM75 (T4) (#6996)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-31 14:40:32 -07:00
93548eb37e [Kernel] Enable FP8 Cutlass for Ada Lovelace (#6950)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-31 14:40:22 -07:00
460c1884e3 [Bugfix] Support cpu offloading with fp8 quantization (#6960) 2024-07-31 12:47:46 -07:00
bd70013407 [MISC] Introduce pipeline parallelism partition strategies (#6920)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-07-31 12:02:17 -07:00
2ee8d3ba55 [Model] use FusedMoE layer in Jamba (#6935) 2024-07-31 12:00:24 -07:00
daed30c4a9 [Bugfix] Fix feature size calculation for LLaVA-NeXT (#6982) 2024-07-31 23:46:17 +08:00
2f4e108f75 [Bugfix] Clean up MiniCPM-V (#6939)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-31 14:39:19 +00:00
6512937de1 Support W4A8 quantization for vllm (#5218) 2024-07-31 07:55:21 -06:00
Fei
c0644cf9ce [Bugfix] fix logit processor excceed vocab size issue (#6927) 2024-07-31 16:16:01 +08:00
533d1932d2 [Bugfix][TPU] Set readonly=True for non-root devices (#6980) 2024-07-31 00:19:28 -07:00
9f0e69b653 [CI/Build] Fix mypy errors (#6968) 2024-07-30 19:49:48 -07:00
f230cc2ca6 [Bugfix] Fix broadcasting logic for multi_modal_kwargs (#6836) 2024-07-31 10:38:45 +08:00
da1f7cc12a [mypy] Enable following imports for some directories (#6681) 2024-07-31 10:38:03 +08:00
c32ab8be1a [Speculative decoding] Add serving benchmark for llama3 70b + speculative decoding (#6964) 2024-07-31 00:53:21 +00:00
fb4f530bf5 [CI] [nightly benchmark] Do not re-download sharegpt dataset if exists (#6706) 2024-07-30 16:28:49 -07:00
79319cedfa [Nightly benchmarking suite] Remove pkill python from run benchmark suite (#6965) 2024-07-30 16:28:05 -07:00
40c27a7cbb [Build] Temporarily Disable Kernels and LoRA tests (#6961) 2024-07-30 14:59:48 -07:00
6ca8031e71 [core][misc] improve free_finished_seq_groups (#6865)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-30 14:32:12 -07:00
d7a299edaa [Kernel] Remove scaled_fp8_quant kernel padding footgun (#6842) 2024-07-30 16:37:01 -04:00
052b6f8ca4 [Bugfix] Fix tensorizer memory profiling bug during testing (#6881) 2024-07-30 11:48:50 -07:00
5895b24677 [OpenVINO] Updated OpenVINO requirements and build docs (#6948) 2024-07-30 11:33:01 -07:00
cbbc904470 [Kernel] Squash a few more warnings (#6914) 2024-07-30 13:50:42 -04:00
5cf9254a9c [BugFix] Fix use of per-request seed with pipeline parallel (#6698) 2024-07-30 10:40:08 -07:00
f058403683 [Doc] Super tiny fix doc typo (#6949) 2024-07-30 09:14:03 -07:00
c66c7f86ac [Bugfix] Fix PaliGemma MMP (#6930) 2024-07-30 02:20:57 -07:00
6e063ea35b [TPU] Fix greedy decoding (#6933) 2024-07-30 02:06:29 -07:00
af647fb8b3 [Kernel] Tuned int8 kernels for Ada Lovelace (#6848)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-29 20:24:58 -06:00
61a97c32f6 [Kernel] Fix marlin divide-by-zero warnings (#6904) 2024-07-30 01:26:07 +00:00
4fbf4aa128 [ci] GHA workflow to remove ready label upon "/notready" comment (#6921)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-29 17:03:45 -07:00
aae6d36f7e [Kernel] Remove unused variables in awq/gemm_kernels.cu (#6908) 2024-07-29 18:01:17 -06:00
9f69d8245a [Frontend] New allowed_token_ids decoding request parameter (#6753) 2024-07-29 23:37:27 +00:00
9a7e2d0534 [Bugfix] Allow vllm to still work if triton is not installed. (#6786)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-29 14:51:27 -07:00
7f8d612d24 [TPU] Support tensor parallelism in async llm engine (#6891) 2024-07-29 12:42:21 -07:00
60d1c6e584 [Kernel] Fix deprecation function warnings squeezellm quant_cuda_kernel (#6901) 2024-07-29 09:59:02 -07:00
db9e5708a9 [Core] Reduce unnecessary compute when logprobs=None (#6532) 2024-07-29 16:47:31 +00:00
766435e660 [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-29 09:42:35 -06:00
7cbd9ec7a9 [Model] Initialize support for InternVL2 series models (#6514)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-29 10:16:30 +00:00
3eeb148f46 [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) 2024-07-28 11:13:49 -04:00
b1366a9534 Add Nemotron to PP_SUPPORTED_MODELS (#6863) 2024-07-27 15:05:17 -07:00
75acdaa4b6 [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) 2024-07-27 17:52:33 -04:00
fad5576c58 [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) 2024-07-27 10:28:33 -07:00
f954d0715c [Docs] Add RunLLM chat widget (#6857) 2024-07-27 09:24:46 -07:00
1ad86acf17 [Model] Initial support for BLIP-2 (#5920)
Co-authored-by: ywang96 <ywang@roblox.com>
2024-07-27 11:53:07 +00:00
ecb33a28cb [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) 2024-07-27 09:54:14 +00:00
a57d75821c [bugfix] make args.stream work (#6831) 2024-07-27 09:07:02 +00:00
925de97e05 [Bugfix] Fix VLM example typo (#6859) 2024-07-27 14:24:08 +08:00
aa46953a20 [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-07-26 22:44:13 -07:00
593e79e733 [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802)
[Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-26 22:15:20 -07:00
c53041ae3b [Doc] Add missing mock import to docs conf.py (#6834) 2024-07-27 04:47:33 +00:00
52f07e3dec [Hardware][TPU] Implement tensor parallelism with Ray (#5871) 2024-07-26 20:54:27 -07:00
Joe
14dbd5a767 [Model] H2O Danube3-4b (#6451) 2024-07-26 20:47:50 -07:00
ed94e4f427 [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) 2024-07-26 20:45:31 -07:00
3c3012398e [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2024-07-26 20:20:16 -07:00
ced36cd89b [ROCm] Upgrade PyTorch nightly version (#6845) 2024-07-26 20:16:13 -07:00
969d032265 [Bugfix]: Fix Tensorizer test failures (#6835) 2024-07-26 20:02:25 -07:00
55712941e5 [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) 2024-07-27 02:27:44 +00:00
981b0d5673 [Frontend] Factor out code for running uvicorn (#6828) 2024-07-27 09:58:25 +08:00
d09b94ca58 [TPU] Support collective communications in XLA devices (#6813) 2024-07-27 01:45:57 +00:00
bb5494676f enforce eager mode with bnb quantization temporarily (#6846) 2024-07-27 01:32:20 +00:00
b5f49ee55b Update README.md (#6847) 2024-07-27 00:26:45 +00:00
150a1ffbfd [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) 2024-07-26 14:39:10 -07:00
281977bd6e [Doc] Add Nemotron to supported model docs (#6843) 2024-07-26 17:32:44 -04:00
3bbb4936dc [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-26 13:50:10 -07:00
aa4867791e [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) 2024-07-26 19:39:49 +00:00
71734f1bf2 [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) 2024-07-26 12:28:32 -07:00
50704f52c4 [Bugfix][Kernel] Promote another index to int64_t (#6838) 2024-07-26 18:41:04 +00:00
07278c37dd [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) 2024-07-26 14:33:42 -04:00
85ad7e2d01 [doc][debugging] add known issues for hangs (#6816) 2024-07-25 21:48:05 -07:00
89a84b0bb7 [Core] Use array to speedup padding (#6779) 2024-07-25 21:31:31 -07:00
084a01fd35 [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. (#6770) 2024-07-25 21:25:35 -07:00
062a1d0fab Fix ReplicatedLinear weight loading (#6793) 2024-07-25 19:24:58 -07:00
2eb9f4ff26 [ci] Mark tensorizer as soft fail and separate from grouped test (#6810)
[ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-25 18:08:33 -07:00
443c7cf4cf [ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00
1adddb14bf [Core] Fix ray forward_dag error mssg (#6792) 2024-07-25 16:53:25 -07:00
b7215de2c5 [Docs] Publish 5th meetup slides (#6799) 2024-07-25 16:47:55 -07:00
f3ff63c3f4 [doc][distributed] improve multinode serving doc (#6804) 2024-07-25 15:38:32 -07:00
cd7edc4e87 [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) 2024-07-25 15:05:09 -07:00
6a1e25b151 [Doc] Add documentations for nightly benchmarks (#6412) 2024-07-25 11:57:16 -07:00
95db75de64 [Bugfix] Add synchronize to prevent possible data race (#6788)
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-07-25 10:40:01 -07:00
65b1f121c8 [Bugfix] Fix kv_cache_dtype=fp8 without scales for FP8 checkpoints (#6761) 2024-07-25 09:46:15 -07:00
889da130e7 [ Misc ] fp8-marlin channelwise via compressed-tensors (#6524)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-07-25 09:46:04 -07:00
b75e314fff [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-25 09:42:49 -07:00
316a41ac1d [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) 2024-07-24 22:48:07 -07:00
0310029a2f [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) 2024-07-24 22:34:11 -07:00
309aaef825 [Bugfix] Fix decode tokens w. CUDA graph (#6757) 2024-07-24 22:33:56 -07:00
9e169a4c61 [Model] Adding support for MiniCPM-V (#4087) 2024-07-24 20:59:30 -07:00
5689e256ba [Frontend] Represent tokens with identifiable strings (#6626) 2024-07-25 09:51:00 +08:00
740374d456 [core][distributed] fix zmq hang (#6759) 2024-07-24 17:37:12 -07:00
d88c458f44 [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) 2024-07-24 14:32:57 -07:00
421e218b37 [Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 13:22:16 -07:00
5448f67635 [Core] Tweaks to model runner/input builder developer APIs (#6712) 2024-07-24 12:17:12 -07:00
0e63494cf3 Add fp8 support to reshape_and_cache_flash (#6667) 2024-07-24 18:36:52 +00:00
ee812580f7 [Frontend] split run_server into build_server and run_server (#6740) 2024-07-24 10:36:04 -07:00
40468b13fa [Bugfix] Miscalculated latency lead to time_to_first_token_seconds inaccurate. (#6686) 2024-07-24 08:58:42 -07:00
2cf0df3381 [Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00
545146349c Adding f-string to validation error which is missing (#6748) 2024-07-24 08:55:53 -07:00
f4f8a9d892 [Bugfix]fix modelscope compatible issue (#6730) 2024-07-24 05:04:46 -07:00
b570811706 [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) 2024-07-24 05:01:14 -07:00
ccc4a73257 [Docs][ROCm] Detailed instructions to build from source (#6680) 2024-07-24 01:07:23 -07:00
0a740a11ba [Bugfix] Fix token padding for chameleon (#6724) 2024-07-24 01:05:09 -07:00
c882a7f5b3 [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) 2024-07-24 07:34:22 +00:00
5e8ca973eb [Bugfix] fix flashinfer cudagraph capture for PP (#6708) 2024-07-24 01:49:44 +00:00
87525fab92 [bitsandbytes]: support read bnb pre-quantized model (#5753)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-23 23:45:09 +00:00
2f808e69ab [Bugfix] StatLoggers: cache spec decode metrics when they get collected. (#6645)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-23 23:05:05 +00:00
01c16ede6b [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) 2024-07-23 22:45:12 +00:00
72fc704803 [build] relax wheel size limit (#6704) 2024-07-23 14:03:49 -07:00
1bedf210e3 Bump transformers version for Llama 3.1 hotfix and patch Chameleon (#6690) 2024-07-23 13:47:48 -07:00
507ef787d8 [Model] Pipeline Parallel Support for DeepSeek v2 (#6519)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-23 12:22:09 -07:00
58f53034ad [Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652) 2024-07-23 11:41:55 -07:00
0eb0757bef [Misc] Add ignored layers for fp8 quantization (#6657) 2024-07-23 14:04:04 -04:00
38c4b7e863 Bump version to 0.5.3.post1 (#6696) 2024-07-23 10:08:59 -07:00
a112a84aad [BugFix] Fix RoPE error in Llama 3.1 (#6693) 2024-07-23 09:46:05 -07:00
461089a21a [Bugfix] Fix a log error in chunked prefill (#6694) 2024-07-23 09:27:58 -07:00
71950af726 [doc][distributed] fix doc argument order (#6691) 2024-07-23 08:55:33 -07:00
cb1362a889 [Docs] Announce llama3.1 support (#6688) 2024-07-23 08:18:15 -07:00
bb2fc08072 Bump version to v0.5.3 (#6674) 2024-07-23 00:00:08 -07:00
3eda4ec780 support ignore patterns in model loader (#6673) 2024-07-22 23:59:42 -07:00
22fa2e35cb [VLM][Model] Support image input for Chameleon (#6633) 2024-07-22 23:50:48 -07:00
c5201240a4 [misc] only tqdm for first rank (#6672) 2024-07-22 21:57:27 -07:00
97234be0ec [Misc] Manage HTTP connections in one place (#6600) 2024-07-22 21:32:02 -07:00
c051bfe4eb [doc][distributed] doc for setting up multi-node environment (#6529)
[doc][distributed] add more doc for setting up multi-node environment (#6529)
2024-07-22 21:22:09 -07:00
9e0b558a09 [Misc] Support FP8 kv cache scales from compressed-tensors (#6528) 2024-07-23 04:11:50 +00:00
e519ae097a add tqdm when loading checkpoint shards (#6569)
Co-authored-by: tianyi.zhao <tianyi.zhao@transwarp.io>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-07-22 20:48:01 -07:00
7c2749a4fd [misc] add start loading models for users information (#6670) 2024-07-22 20:08:02 -07:00
729171ae58 [Misc] Enable chunked prefill by default for long context models (#6666) 2024-07-22 20:03:13 -07:00
c5e8330997 [Bugfix] Fix null modules_to_not_convert in FBGEMM Fp8 quantization (#6665) 2024-07-22 19:25:05 -07:00
e0c15758b8 [Core] Modulize prepare input and attention metadata builder (#6596) 2024-07-23 00:45:24 +00:00
bdf5fd1386 [Misc] Remove deprecation warning for beam search (#6659) 2024-07-23 00:21:58 +00:00
5a96ee52a3 [ci][build] add back vim in docker (#6661) 2024-07-22 16:26:29 -07:00
42c7f66a38 [Core] Support dynamically loading Lora adapter from HuggingFace (#6234)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-22 15:42:40 -07:00
69d5ae38dc [ci] Use different sccache bucket for CUDA 11.8 wheel build (#6656)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-22 14:20:41 -07:00
fea59c7712 [Bugfix][Kernel] Use int64_t for indices in fp8 quant kernels (#6649) 2024-07-22 14:08:30 -06:00
739b61a348 [Frontend] Refactor prompt processing (#4028)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-22 10:13:53 -07:00
89c1c6a196 [Bugfix] Fix vocab_size field access in llava_next.py (#6624) 2024-07-22 05:02:51 +00:00
42de2cefcb [Misc] Add a wrapper for torch.inference_mode (#6618) 2024-07-21 18:43:11 -07:00
c9eef37f32 [Model] Initial Support for Chameleon (#5770) 2024-07-21 17:37:51 -07:00
396d92d5e0 [Kernel][Core] Add AWQ support to the Marlin kernel (#6612) 2024-07-21 19:41:42 -04:00
25e778aa16 [Model] Refactor and decouple phi3v image embedding (#6621) 2024-07-21 16:07:58 -07:00
b6df37f943 [Misc] Remove abused noqa (#6619) 2024-07-21 23:47:04 +08:00
14f91fe67c [Spec Decode] Disable Log Prob serialization to CPU for spec decoding for both draft and target models. (#6485) 2024-07-20 23:58:58 -07:00
d7f4178dd9 [Frontend] Move chat utils (#6602)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-21 08:38:17 +08:00
082ecd80d5 [ Bugfix ] Fix AutoFP8 fp8 marlin (#6609) 2024-07-20 17:25:56 -06:00
f952bbc8ff [Misc] Fix input_scale typing in w8a8_utils.py (#6579) 2024-07-20 23:11:13 +00:00
9364f74eee [ Kernel ] Enable fp8-marlin for fbgemm-fp8 models (#6606) 2024-07-20 18:50:10 +00:00
06d6c5fe9f [Bugfix][CI/Build][Hardware][AMD] Fix AMD tests, add HF cache, update CK FA, add partially supported model notes (#6543) 2024-07-20 09:39:07 -07:00
683e3cb9c4 [ Misc ] fbgemm checkpoints (#6559) 2024-07-20 09:36:57 -07:00
9042d68362 [Misc] Consolidate and optimize logic for building padded tensors (#6541) 2024-07-20 04:17:24 +00:00
3f8d42c81f Pipeline Parallel: Guard for KeyErrors at request abort (#6587)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-19 19:18:19 -07:00
7bd82002ae [Core] Allow specifying custom Executor (#6557) 2024-07-20 01:25:06 +00:00
2e26564259 [ Kernel ] FP8 Dynamic Per Token Quant - Add scale_ub (#6593)
Co-authored-by: Varun Sundar Rabindranth <varun@neuralmagic.com>
2024-07-19 18:15:26 -07:00
e81522e879 [build] add ib in image for out-of-the-box infiniband support (#6599)
[build] add ib so that multi-node support with infiniband can be supported out-of-the-box (#6599)
2024-07-19 17:16:57 -07:00
45ceb85a0c [Docs] Update PP docs (#6598) 2024-07-19 16:38:21 -07:00
4cc24f01b1 [ Kernel ] Enable Dynamic Per Token fp8 (#6547) 2024-07-19 23:08:15 +00:00
07eb6f19f3 [bugfix][distributed] fix multi-node bug for shared memory (#6597) 2024-07-19 15:34:34 -07:00
f0bbfaf917 [Bugfix] [SpecDecode] AsyncMetricsCollector: update time since last collection (#6578) 2024-07-19 14:01:03 -07:00
30efe41532 [Docs] Update docs for wheel location (#6580) 2024-07-19 12:14:11 -07:00
9ed82e7074 [Misc] Small perf improvements (#6520) 2024-07-19 12:10:56 -07:00
51f8aa90ad [Bugfix][Frontend] remove duplicate init logger (#6581) 2024-07-19 10:16:27 -07:00
a5314e8698 [Model] RowParallelLinear: pass bias to quant_method.apply (#6327)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-19 07:15:22 -06:00
a921e86392 [BUGFIX] Raise an error for no draft token case when draft_tp>1 (#6369) 2024-07-19 06:01:09 -07:00
6366efc67b [Bugfix][Frontend] Fix missing /metrics endpoint (#6463) 2024-07-19 03:55:13 +00:00
dbe5588554 [ Misc ] non-uniform quantization via compressed-tensors for Llama (#6515) 2024-07-18 22:39:18 -04:00
d4201e06d5 [Bugfix] Make spec. decode respect per-request seed. (#6034)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-07-18 19:22:08 -07:00
b5672a112c [Core] Multiprocessing Pipeline Parallel support (#6130)
Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-18 19:15:52 -07:00
c5df56f88b Add support for a rope extension method (#6553) 2024-07-19 01:53:03 +00:00
1689219ebf [CI/Build] Build on Ubuntu 20.04 instead of 22.04 (#6517) 2024-07-18 17:29:25 -07:00
4ffffccb7e [Kernel] Implement fallback for FP8 channelwise using torch._scaled_mm (#6552) 2024-07-18 23:52:22 +00:00
f53b8f0d05 [ci][test] add correctness test for cpu offloading (#6549) 2024-07-18 23:41:06 +00:00
2d4733ba2d Fix PR comment bot (#6554)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-18 14:48:29 -07:00
15c6a079b1 [Model] Support Mistral-Nemo (#6548) 2024-07-18 20:31:50 +00:00
ecdb462c24 [ci] Reword Github bot comment (#6534) 2024-07-18 08:01:45 -07:00
58ca663224 [ Misc ] Improve Min Capability Checking in compressed-tensors (#6522) 2024-07-18 14:39:12 +00:00
4634c8728b [TPU] Refactor TPU worker & model runner (#6506) 2024-07-18 01:34:16 -07:00
c8a7d51c49 [Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501) 2024-07-18 07:47:13 +00:00
e2fbaee725 [BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-18 15:13:30 +08:00
8a74c68bd1 [Misc] Minor patch for draft model runner (#6523) 2024-07-18 06:06:21 +00:00
61e592747c [Core] Introduce SPMD worker execution using Ray accelerated DAG (#6032)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Stephanie Wang <swang@cs.berkeley.edu>
2024-07-17 22:27:09 -07:00
d25877dd9b [BugFix] Avoid secondary error in ShmRingBuffer destructor (#6530) 2024-07-17 22:24:43 -07:00
1c27d25fb5 [core][model] yet another cpu offload implementation (#6496)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-17 20:54:35 -07:00
18fecc3559 [ Kernel ] Fp8 Channelwise Weight Support (#6487) 2024-07-18 03:18:13 +00:00
b5af8c223c [Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00
b5241e41d9 [ Kernel ] FP8 Dynamic-Per-Token Quant Kernel (#6511)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-18 01:38:35 +00:00
e76466dde2 [Core] draft_model_runner: Implement prepare_inputs on GPU for advance_step (#6338) 2024-07-17 14:30:28 -07:00
5f0b9933e6 [Bugfix] Fix Ray Metrics API usage (#6354) 2024-07-17 19:40:10 +00:00
a38524f338 [DOC] - Add docker image to Cerebrium Integration (#6510) 2024-07-17 10:22:53 -07:00
2fa4623d9e [Core] Refactor _prepare_model_input_tensors - take 2 (#6164) 2024-07-17 09:37:16 -07:00
a9a2e74d21 [Misc] Use torch.Tensor for type annotation (#6505) 2024-07-17 13:01:10 +00:00
e09ce759aa [TPU] Remove multi-modal args in TPU backend (#6504) 2024-07-17 04:02:53 -07:00
5fa6e9876e [Bugfix] Fix for multinode crash on 4 PP (#6495)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-17 08:25:10 +00:00
5bf35a91e4 [Doc][CI/Build] Update docs and tests to use vllm serve (#6431) 2024-07-17 07:43:21 +00:00
a19e8d3726 [Misc][Speculative decoding] Typos and typing fixes (#6467)
Co-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com>
2024-07-17 07:17:07 +00:00
10383887e0 [ROCm] Cleanup Dockerfile and remove outdated patch (#6482) 2024-07-16 22:47:02 -07:00
1d094fd7c0 [Distributed][PP] only create embedding & lm head when necessary (#6455)
original title: [Distributed][Model] Rank-based Component Creation for Pipeline Parallelism Memory Optimization
2024-07-16 19:20:26 -07:00
ce37be7ba0 [misc][distributed] add seed to dummy weights (#6491) 2024-07-16 19:16:34 -07:00
7f62077af5 [misc][distributed] improve tests (#6488) 2024-07-16 17:35:52 -07:00
09c2eb85dd [ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
978aed5300 [Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081) 2024-07-16 15:31:32 -07:00
160e1d8c99 [Misc] Log spec decode metrics (#6454) 2024-07-16 20:37:10 +00:00
94162beb9f [Doc] Fix the lora adapter path in server startup script (#6230) 2024-07-16 10:11:04 -07:00
c467dff24f [Hardware][TPU] Support MoE with Pallas GMM kernel (#6457) 2024-07-16 09:56:28 -07:00
9f4ccec761 [doc][misc] remind to cancel debugging environment variables (#6481)
[doc][misc] remind users to cancel debugging environment variables after debugging (#6481)
2024-07-16 09:45:30 -07:00
38ef94888a [CI/Build] Remove "boardwalk" image asset (#6460) 2024-07-16 08:59:36 -07:00
2bb0489cb3 [Core] Use numpy to speed up padded token processing (#6442) 2024-07-16 08:13:25 -07:00
7508a3dc34 [Misc] Fix typos in spec. decode metrics logging. (#6470)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-16 13:55:15 +00:00
7a3d2a5b95 [Frontend] Support for chat completions input in the tokenize endpoint (#5923) 2024-07-16 20:18:09 +08:00
d97011512e [CI/Build] vLLM cache directory for images (#6444) 2024-07-15 23:12:25 -07:00
37d776606f [Docs] Announce 5th meetup (#6458) 2024-07-15 21:04:58 -07:00
Joe
d92b3c5cde [Bugfix][CI/Build] Test prompt adapters in openai entrypoint tests (#6419) 2024-07-15 18:54:15 -07:00
9ad32dacd9 [BugFix][Model] Jamba - Handle aborted requests, Add tests and fix cleanup bug (#6425)
Co-authored-by: Mor Zusman <morz@ai21.com>
2024-07-16 01:32:55 +00:00
d6f3b3d5c4 Pin sphinx-argparse version (#6453)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-16 01:26:11 +00:00
4552e37b55 [CI/Build][TPU] Add TPU CI test (#6277)
Co-authored-by: kevin <kevin@anyscale.com>
2024-07-15 14:31:16 -07:00
ec9933f4a5 [Misc] Add CustomOp Interface to UnquantizedFusedMoEMethod (#6289) 2024-07-15 19:02:14 +00:00
3dee97b05f [Docs] Add Google Cloud to sponsor list (#6450) 2024-07-15 11:58:10 -07:00
4cf256ae7f [misc][distributed] fix pp missing layer condition (#6446) 2024-07-15 10:32:35 -07:00
64fdc08c72 bump version to v0.5.2 (#6433) 2024-07-15 17:27:40 +00:00
4ef95b0f06 [Bugfix] use float32 precision in samplers/test_logprobs.py for comparing with HF (#6409)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-15 13:14:49 -04:00
eaec4b9153 [Bugfix] Add custom Triton cache manager to resolve MoE MP issue (#6140)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
2024-07-15 10:12:47 -07:00
a63a4c6341 [Misc] Use 0.0.9 version for flashinfer (#6447)
Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
2024-07-15 10:10:26 -07:00
c8fd97f26d [Kernel] Use CUTLASS kernels for the FP8 layers with Bias (#6270) 2024-07-15 13:05:52 -04:00
94b82e8c18 [doc][distributed] add suggestion for distributed inference (#6418) 2024-07-15 09:45:51 -07:00
6ae1597ddf [VLM] Minor space optimization for ClipVisionModel (#6436) 2024-07-15 17:29:51 +08:00
22e79ee8f3 [doc][misc] doc update (#6439) 2024-07-14 23:33:25 -07:00
de19916314 [Bugfix] Convert image to RGB by default (#6430) 2024-07-15 05:39:15 +00:00
69672f116c [core][distributed] simplify code to support pipeline parallel (#6406) 2024-07-14 21:20:51 -07:00
44874a0bf9 [Doc] add env docs for flashinfer backend (#6437) 2024-07-14 21:16:51 -07:00
b47008b4d2 [BugFix] BatchResponseData body should be optional (#6345)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-15 04:06:09 +00:00
9bfece89fd Add FUNDING.yml (#6435) 2024-07-14 20:36:16 -07:00
32c9d7f765 Report usage for beam search (#6404) 2024-07-14 19:37:35 -07:00
ccb20db8bd [Bugfix] Benchmark serving script used global parameter 'args' in function 'sample_random_requests' (#6428) 2024-07-14 19:27:01 -07:00
a754dc2cb9 [CI/Build] Cross python wheel (#6394) 2024-07-14 18:54:46 -07:00
61e85dbad8 [Doc] xpu backend requires running setvars.sh (#6393) 2024-07-14 17:10:11 -07:00
dbfe254eda [Feature] vLLM CLI (#5090)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-07-14 15:36:43 -07:00
73030b7dae [ Misc ] Enable Quantizing All Layers of DeekSeekv2 (#6423) 2024-07-14 21:38:42 +00:00
ccd3c04571 [ci][build] fix commit id (#6420)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-07-14 22:16:21 +08:00
9dad5cc859 [Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384) 2024-07-14 13:37:19 +00:00
6ef3bf912c Remove unnecessary trailing period in spec_decode.rst (#6405) 2024-07-14 07:58:09 +00:00
540c0368b1 [Model] Initialize Fuyu-8B support (#3924)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-14 05:27:14 +00:00
fb6af8bc08 [ Misc ] Apply MoE Refactor to Deepseekv2 To Support Fp8 (#6417) 2024-07-13 20:03:58 -07:00
eeceadaecc [Misc] Add deprecation warning for beam search (#6402) 2024-07-13 11:52:22 -07:00
babf52dade [ Misc ] More Cleanup of Marlin (#6359)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-13 10:21:37 +00:00
9da4aad44b Updating LM Format Enforcer version to v10.3 (#6411) 2024-07-13 10:09:12 +00:00
41708e5034 [ci] try to add multi-node tests (#6280)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-12 21:51:48 -07:00
d80aef3776 [Docs] Clean up latest news (#6401) 2024-07-12 19:36:53 -07:00
e1684a766a [Bugfix] Fix hard-coded value of x in context_attention_fwd (#6373)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-12 18:30:54 -07:00
a27f87da34 [Doc] Fix Typo in Doc (#6392)
Co-authored-by: Saliya Ekanayake <esaliya@d-matrix.ai>
2024-07-13 00:48:23 +00:00
16ff6bd58c [ci] Fix wording for GH bot (#6398)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 16:34:37 -07:00
f8f9ff57ee [Bugfix][TPU] Fix megacore setting for v5e-litepod (#6397) 2024-07-12 15:59:47 -07:00
6bc9710f6e Fix release pipeline's dir permission (#6391) 2024-07-12 15:52:43 -07:00
111fc6e7ec [Misc] Add generated git commit hash as vllm.__commit__ (#6386) 2024-07-12 22:52:15 +00:00
75f64d8b94 [Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382) 2024-07-12 21:33:33 +00:00
21b2dcedab Fix release pipeline's -e flag (#6390) 2024-07-12 14:08:04 -07:00
07b35af86d Fix interpolation in release pipeline (#6389) 2024-07-12 14:03:39 -07:00
bb1a784b05 Fix release-pipeline.yaml (#6388) 2024-07-12 14:00:57 -07:00
d719ba24c5 Build some nightly wheels by default (#6380) 2024-07-12 13:56:59 -07:00
aa48e502fb [MISC] Upgrade dependency to PyTorch 2.3.1 (#5327) 2024-07-12 12:04:26 -07:00
4dbebd03cc [ci] Add GHA workflows to enable full CI run (#6381)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 11:36:26 -07:00
b75bce1008 [ci] Add grouped tests & mark tests to run by default for fastcheck pipeline (#6365)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 09:58:38 -07:00
b039cbbce3 [Misc] add fixture to guided processor tests (#6341) 2024-07-12 09:55:39 -07:00
f9d25c2519 [Build/CI] Checking/Waiting for the GPU's clean state (#6379) 2024-07-12 09:42:24 -07:00
024ad87cdc [Bugfix] Fix dtype mismatch in PaliGemma (#6367) 2024-07-12 08:22:18 -07:00
aea19f0989 [ Misc ] Support Models With Bias in compressed-tensors integration (#6356) 2024-07-12 11:11:29 -04:00
f7160d946a [Misc][Bugfix] Update transformers for tokenizer issue (#6364) 2024-07-12 08:40:07 +00:00
6047187cd8 [ Misc ] Remove separate bias add (#6353) 2024-07-12 05:06:09 +00:00
b6c16cf8ff [ROCm][AMD] unify CUDA_VISIBLE_DEVICES usage in cuda/rocm (#6352) 2024-07-11 21:30:46 -07:00
d26a8b3f1f [CI/Build] (2/2) Switching AMD CI to store images in Docker Hub (#6350) 2024-07-11 21:26:26 -07:00
d59eb98489 [Model][Phi3-Small] Remove scipy from blocksparse_attention (#6343) 2024-07-12 10:47:17 +08:00
adf32e0a0f [Bugfix] Fix usage stats logging exception warning with OpenVINO (#6349) 2024-07-12 10:47:00 +08:00
2b0fb53481 [distributed][misc] be consistent with pytorch for libcudart.so (#6346)
[distributed][misc] keep consistent with how pytorch finds libcudart.so (#6346)
2024-07-11 19:35:17 -07:00
d6ab528997 [Misc] Remove flashinfer warning, add flashinfer tests to CI (#6351) 2024-07-12 01:32:06 +00:00
7ed6a4f0e1 [ BugFix ] Prompt Logprobs Detokenization (#6223)
Co-authored-by: Zifei Tong <zifeitong@gmail.com>
2024-07-11 22:02:29 +00:00
a4feba929b [CI/Build] Add nightly benchmarking for tgi, tensorrt-llm and lmdeploy (#5362) 2024-07-11 13:28:38 -07:00
2d23b42d92 [doc] update pipeline parallel in readme (#6347) 2024-07-11 11:38:40 -07:00
1df43de9bb [bug fix] Fix llava next feature size calculation. (#6339)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-07-11 17:21:10 +00:00
52b7fcb35a Benchmark: add H100 suite (#6047) 2024-07-11 09:17:07 -07:00
b675069d74 [ Misc ] Refactor Marlin Python Utilities (#6082)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-11 15:40:11 +00:00
55f692b46e [BugFix] get_and_reset only when scheduler outputs are not empty (#6266) 2024-07-11 07:40:20 -07:00
8a1415cf77 [Bugfix] GPTBigCodeForCausalLM: Remove lm_head from supported_lora_modules. (#6326)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-11 07:05:59 -07:00
546b101fa0 [BugFix]: fix engine timeout due to request abort (#6255)
Signed-off-by: yatta zhang <ytzhang01@foxmail.com>
Signed-off-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
Co-authored-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
2024-07-11 06:46:31 -07:00
3963a5335b [Misc] refactor(config): clean up unused code (#6320) 2024-07-11 09:39:07 +00:00
c4774eb841 [Bugfix] Fix snapshot download in serving benchmark (#6318) 2024-07-11 07:04:05 +00:00
fc17110bbe [BugFix]: set outlines pkg version (#6262) 2024-07-11 04:37:11 +00:00
439c84581a [Doc] Update description of vLLM support for CPUs (#6003) 2024-07-10 21:15:29 -07:00
99ded1e1c4 [Doc] Remove comments incorrectly copied from another project (#6286) 2024-07-10 17:05:26 -07:00
997df46a32 [Bugfix][Neuron] Fix soft prompt method error in NeuronExecutor (#6313) 2024-07-10 16:39:02 -07:00
ae151d73be [Speculative Decoding] Enabling bonus token in speculative decoding for KV cache based models (#5765) 2024-07-10 16:02:47 -07:00
44cc76610d [Bugfix] Fix OpenVINOExecutor abstractmethod error (#6296)
Signed-off-by: sangjune.park <sangjune.park@navercorp.com>
2024-07-10 10:03:32 -07:00
b422d4961a [CI/Build] Enable mypy typing for remaining folders (#6268) 2024-07-10 22:15:55 +08:00
c38eba3046 [Bugfix] MLPSpeculator: Use ParallelLMHead in tie_weights=False case. (#6303)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-10 09:04:07 -04:00
e72ae80b06 [Bugfix] Support 2D input shape in MoE layer (#6287) 2024-07-10 09:03:16 -04:00
8a924d2248 [Doc] Guide for adding multi-modal plugins (#6205) 2024-07-10 14:55:34 +08:00
5ed3505d82 [Bugfix][TPU] Add prompt adapter methods to TPUExecutor (#6279) 2024-07-09 19:30:56 -07:00
da78caecfa [core][distributed] zmq fallback for broadcasting large objects (#6183)
[core][distributed] add zmq fallback for broadcasting large objects (#6183)
2024-07-09 18:49:11 -07:00
2416b26e11 [Speculative Decoding] Medusa Implementation with Top-1 proposer (#4978) 2024-07-09 18:34:02 -07:00
d3a245138a [Bugfix]fix and needs_scalar_to_array logic check (#6238)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-07-09 23:43:24 +00:00
673dd4cae9 [Docs] Docs update for Pipeline Parallel (#6222)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-09 16:24:58 -07:00
4d6ada947c [CORE] Adding support for insertion of soft-tuned prompts (#4645)
Co-authored-by: Swapnil Parekh <swapnilp@ibm.com>
Co-authored-by: Joe G <joseph.granados@h2o.ai>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-09 13:26:36 -07:00
a0550cbc80 Add support for multi-node on CI (#5955)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-09 12:56:56 -07:00
08c5bdecae [Bugfix][TPU] Fix outlines installation in TPU Dockerfile (#6256) 2024-07-09 02:56:06 -07:00
5d5b4c5fe5 [Bugfix][TPU] Add missing None to model input (#6245) 2024-07-09 00:21:37 -07:00
70c232f85a [core][distributed] fix ray worker rank assignment (#6235) 2024-07-08 21:31:44 -07:00
a3c9435d93 [hardware][cuda] use device id under CUDA_VISIBLE_DEVICES for get_device_capability (#6216) 2024-07-08 20:02:15 -07:00
4f0e0ea131 Add FlashInfer to default Dockerfile (#6172) 2024-07-08 13:38:03 -07:00
ddc369fba1 [Bugfix] Mamba cache Cuda Graph padding (#6214) 2024-07-08 11:25:51 -07:00
185ad31f37 [Bugfix] use diskcache in outlines _get_guide #5436 (#6203) 2024-07-08 11:23:24 -07:00
543aa48573 [Kernel] Correctly invoke prefill & decode kernels for cross-attention (towards eventual encoder/decoder model support) (#4888)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-08 17:12:15 +00:00
f7a8fa39d8 [Kernel] reloading fused_moe config on the last chunk (#6210) 2024-07-08 08:00:38 -07:00
717f4bcea0 Feature/add benchmark testing (#5947)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-08 07:52:06 +00:00
16620f439d do not exclude object field in CompletionStreamResponse (#6196) 2024-07-08 10:32:57 +08:00
3b08fe2b13 [misc][frontend] log all available endpoints (#6195)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-07-07 15:11:12 -07:00
abfe705a02 [ Misc ] Support Fp8 via llm-compressor (#6110)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-07-07 20:42:11 +00:00
333306a252 add benchmark for fix length input and output (#5857)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-07 07:42:13 +00:00
6206dcb29e [Model] Add PaliGemma (#5189)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-07 09:25:50 +08:00
9389380015 [Doc] Move guide for multimodal model and other improvements (#6168) 2024-07-06 17:18:59 +08:00
175c43eca4 [Doc] Reorganize Supported Models by Type (#6167) 2024-07-06 05:59:36 +00:00
bc96d5c330 Move release wheel env var to Dockerfile instead (#6163) 2024-07-05 17:19:53 -07:00
f0250620dd Fix release wheel build env var (#6162) 2024-07-05 16:24:31 -07:00
2de490d60f Update wheel builds to strip debug (#6161) 2024-07-05 14:51:25 -07:00
79d406e918 [Docs] Fix readthedocs for tag build (#6158) 2024-07-05 12:44:40 -07:00
abad5746a7 bump version to v0.5.1 (#6157) 2024-07-05 12:04:51 -07:00
e58294ddf2 [Bugfix] Add verbose error if scipy is missing for blocksparse attention (#5695) 2024-07-05 10:41:01 -07:00
f1e15da6fe [Frontend] Continuous usage stats in OpenAI completion API (#5742) 2024-07-05 10:37:09 -07:00
0097bb1829 [Bugfix] Use templated datasource in grafana.json to allow automatic imports (#6136)
Signed-off-by: Christian Rohmann <christian.rohmann@inovex.de>
2024-07-05 09:49:47 -07:00
ea4b570483 [VLM] Cleanup validation and update docs (#6149) 2024-07-05 05:49:38 +00:00
a41357e941 [VLM] Improve consistency between feature size calculation and dummy data for profiling (#6146) 2024-07-05 09:29:47 +08:00
ae96ef8fbd [VLM] Calculate maximum number of multi-modal tokens by model (#6121) 2024-07-04 16:37:23 -07:00
69ec3ca14c [Kernel][Model] logits_soft_cap for Gemma2 with flashinfer (#6051)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-04 16:35:51 -07:00
81d7a50f24 [Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
2024-07-04 15:22:12 -07:00
27902d42be [misc][doc] try to add warning for latest html (#5979) 2024-07-04 09:57:09 -07:00
56b325e977 [ROCm][AMD][Model]Adding alibi slopes support in ROCm triton flash attention and naive flash attention (#6043)
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
2024-07-03 22:19:38 -07:00
3dd507083f [CI/Build] Cleanup VLM tests (#6107) 2024-07-03 18:58:18 -07:00
0ed646b7aa [Distributed][Core] Support Py39 and Py38 for PP (#6120)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-03 17:52:29 -07:00
1dab9bc8a9 [Bugfix] set OMP_NUM_THREADS to 1 by default for multiprocessing (#6109)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-07-03 16:56:59 -07:00
3de6e6a30e [core][distributed] support n layers % pp size != 0 (#6115) 2024-07-03 16:40:31 -07:00
966fe72141 [doc][misc] bump up py version in installation doc (#6119) 2024-07-03 15:52:04 -07:00
62963d129e [ Misc ] Clean Up CompressedTensorsW8A8 (#6113) 2024-07-03 22:50:08 +00:00
d9e98f42e4 [vlm] Remove vision language config. (#6089)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-03 22:14:16 +00:00
3c6325f0fc [core][distributed] custom allreduce when pp size > 1 (#6117) 2024-07-03 14:41:32 -07:00
47f0954af0 [Kernel] Expand FP8 support to Ampere GPUs using FP8 Marlin (#5975) 2024-07-03 17:38:00 +00:00
7cd2ebb025 [Bugfix] Fix compute_logits in Jamba (#6093) 2024-07-03 00:32:35 -07:00
f1c78138aa [Doc] Fix Mock Import (#6094) 2024-07-03 00:13:56 -07:00
3a86b54fb0 [VLM][Frontend] Proper Image Prompt Formatting from OpenAI API (#6091)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-02 23:41:23 -07:00
f666207161 [misc][distributed] error on invalid state (#6092) 2024-07-02 23:37:29 -07:00
d830656a97 [BugFix] Avoid unnecessary Ray import warnings (#6079) 2024-07-03 14:09:40 +08:00
d18bab3587 [CI] Fix base url doesn't strip "/" (#6087) 2024-07-02 21:31:25 -07:00
9831aec49f [Core] Dynamic image size support for VLMs (#5276)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: ywang96 <ywang@roblox.com>
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-07-02 20:34:00 -07:00
482045ee77 [hardware][misc] introduce platform abstraction (#6080) 2024-07-02 20:12:22 -07:00
9d6a8daa87 [Model] Jamba support (#4115)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Erez Schwartz <erezs@ai21.com>
Co-authored-by: Mor Zusman <morz@ai21.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Tomer Asida <tomera@ai21.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-02 23:11:29 +00:00
ee93f4f92a [CORE] Quantized lm-head Framework (#4442)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: ZX <zx@lbx.dev>
2024-07-02 22:25:17 +00:00
7c008c51a9 [ Misc ] Refactor MoE to isolate Fp8 From Mixtral (#5970)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-02 21:54:35 +00:00
4d26d806e1 Update conftest.py (#6076) 2024-07-02 20:14:22 +00:00
c5832d2ae9 [Core] Pipeline Parallel Support (#4412)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-02 10:58:08 -07:00
15aba081f3 [Speculative Decoding] MLPSpeculator Tensor Parallel support (1/2) (#6050)
Co-authored-by: Sirej Dua <sirej.dua@databricks.com>
Co-authored-by: Sirej Dua <Sirej Dua>
2024-07-02 07:20:29 -07:00
31354e563f [Doc] Reinstate doc dependencies (#6061) 2024-07-02 10:53:16 +00:00
98d6682cd1 [VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-02 07:57:09 +00:00
2c37540aa6 [Frontend] Add template related params to request (#5709) 2024-07-01 23:01:57 -07:00
3476ed0809 [Core] Optimize block_manager_v2 vs block_manager_v1 (to make V2 default) (#5602) 2024-07-01 20:10:37 -07:00
54600709b6 [Model] Changes to MLPSpeculator to support tie_weights and input_scale (#5965)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Joshua Rosenkranz <jmrosenk@us.ibm.com>
2024-07-01 16:40:02 -07:00
e373853e12 [Frontend] Relax api url assertion for openai benchmarking (#6046) 2024-07-01 23:39:10 +00:00
c87ebc3ef9 [BugFix] Ensure worker model loop is always stopped at the right time (#5987) 2024-07-01 16:17:58 -07:00
c4059ea54f [Bugfix] Add explicit end_forward calls to flashinfer (#6044) 2024-07-01 23:08:58 +00:00
8e0817c262 [Bugfix][Doc] Fix Doc Formatting (#6048) 2024-07-01 15:09:11 -07:00
83bdcb6ac3 add FAQ doc under 'serving' (#5946) 2024-07-01 14:11:36 -07:00
12a59959ed [Bugfix] adding chunking mechanism to fused_moe to handle large inputs (#6029) 2024-07-01 21:08:29 +00:00
dec6fc6f3b [Bugfix] Use RayActorError for older versions of Ray in RayTokenizerGroupPool (#6039) 2024-07-01 20:12:40 +00:00
8893130b63 [doc][misc] further lower visibility of simple api server (#6041)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-01 10:50:56 -07:00
bb60326836 [Misc] update benchmark backend for scalellm (#6018) 2024-07-01 10:20:33 -07:00
4050d646e5 [doc][misc] remove deprecated api server in doc (#6037) 2024-07-01 12:52:43 -04:00
d76084c12f [ CI ] Re-enable Large Model LM Eval (#6031) 2024-07-01 12:40:45 -04:00
80ca1e6a3a [Speculative Decoding 2/2 ] Integrate typical acceptance sampler into Spec Decode Worker (#5348) 2024-07-01 00:33:05 -07:00
614aa51203 [misc][cuda] use nvml to avoid accidentally cuda initialization (#6007) 2024-06-30 20:07:34 -07:00
af9ad46fca [ Misc ] Refactor w8a8 to use process_weights_after_load (Simplify Weight Loading) (#5940)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-30 23:06:27 +00:00
7836fdcc11 [Misc] Fix get_min_capability (#5971) 2024-06-30 20:15:16 +00:00
deacb7ec44 [ CI ] Temporarily Disable Large LM-Eval Tests (#6005)
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic>
2024-06-30 11:56:56 -07:00
f5e73c9f1b [Lora] Use safetensor keys instead of adapter_config.json to find unexpected modules. (#5909)
Co-authored-by: sang <sangcho@anyscale.com>
2024-06-30 17:11:15 +00:00
c6c240aa0a [Frontend]: Support base64 embedding (#5935)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-06-30 23:53:00 +08:00
2be6955a3f [ci][distributed] fix device count call
[ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991)
2024-06-30 08:06:13 +00:00
9d47f64eb6 [CI/Build] [3/3] Reorganize entrypoints tests (#5966) 2024-06-30 12:58:49 +08:00
cff6a1fec1 [CI/Build] Reuse code for checking output consistency (#5988) 2024-06-30 11:44:25 +08:00
bcc6a09b63 [CI/Build] Temporarily Remove Phi3-Vision from TP Test (#5989) 2024-06-30 09:18:31 +08:00
9def10664e [Bugfix][CI/Build][Hardware][AMD] Install matching torchvision to fix AMD tests (#5949) 2024-06-29 12:47:58 -07:00
75aa1442db [ CI/Build ] LM Eval Harness Based CI Testing (#5838)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-29 13:04:30 -04:00
99397da534 [CI/Build] Add TP test for vision models (#5892) 2024-06-29 15:45:54 +00:00
8dbfcd35bf [ CI/Build ] Added E2E Test For Compressed Tensors (#5839)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-29 21:12:58 +08:00
f7dac83d95 [Kernel] Raise an exception in MoE kernel if the batch size is larger then 65k (#5939) 2024-06-29 21:04:20 +08:00
7c01f70641 [Core] Optimize SequenceStatus.is_finished by switching to IntEnum (#5974) 2024-06-29 12:47:53 +00:00
51e971d39e [Bugfix] Support eos_token_id from config.json (#5954) 2024-06-29 11:19:02 +00:00
329df38f1a [Misc] Update Phi-3-Vision Example (#5981)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-06-29 14:34:29 +08:00
580353da93 [Bugfix] Fix precisions in Gemma 1 (#5913) 2024-06-29 03:10:21 +00:00
ba4994443a [Kernel] Add punica dimensions for Granite 3b and 8b (#5930)
Signed-off-by: Joe Runde <joe@joerun.de>
2024-06-29 10:48:25 +08:00
906a19cdb0 [Misc] Extend vLLM Metrics logging API (#5925)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-29 10:36:06 +08:00
c4bca740e8 [Bugfix] fix missing last itl in openai completions benchmark (#5926) 2024-06-29 10:34:42 +08:00
7f83f40dee [Bugfix][TPU] Fix pad slot id (#5977) 2024-06-28 18:55:17 -07:00
54814fd85b [Bugfix][TPU] Fix TPU sampler output (#5978) 2024-06-28 18:14:16 -07:00
7041de4384 [Kernel] Flashinfer for prefill & decode, with Cudagraph support for decode (#4628)
Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>, bong-furiosa <bongwon.jang@furiosa.ai>
2024-06-28 15:28:49 -07:00
6a62cb82cc [Bugfix] Fix Engine Failing After Invalid Request - AsyncEngineDeadError (#5963)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 17:46:30 -04:00
5d2a1a9cf0 Unmark more files as executable (#5962) 2024-06-28 17:34:56 -04:00
4bf35ed9ae [Bugfix] Only add Attention.kv_scale if kv cache quantization is enabled (#5936) 2024-06-28 21:12:40 +00:00
be0b3af9e0 Support Deepseek-V2 (#4650)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
2024-06-28 13:24:57 -07:00
2cd402e169 [ Bugfix ] Enabling Loading Models With Fused QKV/MLP on Disk with FP8 (#5921)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 18:43:49 +00:00
b185230744 [ Misc ] Remove fp8_shard_indexer from Col/Row Parallel Linear (Simplify Weight Loading) (#5928)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 13:49:57 -04:00
6a2d659d28 [Bugfix] Fix compute datatype for cutlass 3.x epilogues (#5931) 2024-06-28 17:10:34 +00:00
b2c620230a [Spec Decode] Introduce DraftModelRunner (#5799) 2024-06-28 09:17:51 -07:00
b90d8cd832 [Distributed] Make it clear that % should not be in tensor dict keys. (#5927)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-06-28 15:20:22 +00:00
3b752a6555 [CI/Build] [2/3] Reorganize entrypoints tests (#5904) 2024-06-28 07:59:18 -07:00
ec1ad0046c [Bugfix] Better error message for MLPSpeculator when num_speculative_tokens is set too high (#5894)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-28 07:42:17 -07:00
57f09a419c [Hardware][Intel] OpenVINO vLLM backend (#5379) 2024-06-28 13:50:16 +00:00
5932634409 Unmark fused_moe config json file as executable (#5960) 2024-06-28 06:36:12 -07:00
5cbe8d155c [Core] Registry for processing model inputs (#5214)
Co-authored-by: ywang96 <ywang@roblox.com>
2024-06-28 12:09:56 +00:00
0d0e3a42ac [Bugfix][Hardware][Intel CPU] Fix unpassed multi_modal_kwargs for CPU runner (#5956) 2024-06-28 12:03:41 +00:00
74d55c065b [VLM][BugFix] Make sure that multi_modal_kwargs can broadcast properly with ring buffer. (#5905)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-28 07:29:13 +00:00
f136da15e1 [Hardware][TPU] Optimize KV cache swapping (#5878) 2024-06-27 21:12:13 -07:00
c3dde367f1 [Kernel][ROCm][AMD] fused_moe Triton configs v2 for mi300X (#5932) 2024-06-27 13:41:08 -07:00
64e8d2a783 [core][misc] remove logical block (#5882) 2024-06-27 13:34:55 -07:00
79c92c7c8a [Model] Add Gemma 2 (#5908) 2024-06-27 13:33:56 -07:00
736ed38849 [CI/Build] Fix Args for _get_logits_warper in Sampler Test (#5922) 2024-06-27 11:43:04 -07:00
365791ff81 [BugFix] Fix min_tokens behaviour for multiple eos tokens (#5849) 2024-06-27 11:31:11 -07:00
691e29ecf3 [BugFix] Fix MLPSpeculator handling of num_speculative_tokens (#5876) 2024-06-27 10:59:33 -07:00
3fd02bda51 [doc][misc] add note for Kubernetes users (#5916) 2024-06-27 10:07:07 -07:00
98cf2ed678 [Model][Bugfix] Implicit model flags and reenable Phi-3-Vision (#5896) 2024-06-27 09:08:10 -07:00
e9d32d077d [CI/Build] [1/3] Reorganize entrypoints tests (#5526) 2024-06-27 12:43:17 +00:00
2061f0b8a7 [Bugfix] Fix img_sizes Parsing in Phi3-Vision (#5888) 2024-06-27 08:29:24 +00:00
96354d6a29 [Model] Add base class for LoRA-supported models (#5018) 2024-06-27 16:03:04 +08:00
d12af207d2 [VLM][Bugfix] Make sure that multi_modal_kwargs is broadcasted properly (#5880)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-06-27 15:15:24 +08:00
6eabc6cb0e [Doc] Add note about context length in Phi-3-Vision example (#5887) 2024-06-26 23:20:01 -07:00
2110557dab [BugFix] Fix cuda graph for MLPSpeculator (#5875)
Co-authored-by: Abhinav Goyal <abhinav.goyal@flipkart.com>
2024-06-27 04:12:10 +00:00
b9e84259e9 [Misc] Add example for LLaVA-NeXT (#5879) 2024-06-26 17:57:16 -07:00
294104c3f9 [doc] update usage of env var to avoid conflict (#5873) 2024-06-26 17:57:12 -04:00
38a1674abb Support CPU inference with VSX PowerPC ISA (#5652) 2024-06-26 21:53:04 +00:00
f5c8628fdc [Bugfix][TPU] Fix CPU cache allocation (#5869) 2024-06-26 13:42:40 -07:00
cbc53b6b8d [Hardware][TPU] Support parallel sampling & Swapping (#5855) 2024-06-26 11:07:49 -07:00
c54269d967 [Frontend] Add tokenize/detokenize endpoints (#5054) 2024-06-26 16:54:22 +00:00
5bfd1bbc98 [Kernel] Adding bias epilogue support for cutlass_scaled_mm (#5560)
Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-06-26 15:16:00 +00:00
6984c02a27 [CI/Build] Refactor image test assets (#5821) 2024-06-26 01:02:34 -07:00
3439c5a8e3 [Bugfix][TPU] Fix KV cache size calculation (#5860) 2024-06-26 00:58:23 -07:00
6806998bf9 [Bugfix] Fix embedding to support 2D inputs (#5829) 2024-06-26 00:15:22 -07:00
515080ad2f [bugfix][distributed] fix shm broadcast when the queue size is full (#5801) 2024-06-25 21:56:02 -07:00
3aa7b6cf66 [Misc][Doc] Add Example of using OpenAI Server with VLM (#5832) 2024-06-25 20:34:25 -07:00
dda4811591 [Core] Refactor Worker and ModelRunner to consolidate control plane communication (#5408)
Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu>
Signed-off-by: Stephanie <swang@anyscale.com>
Co-authored-by: Stephanie <swang@anyscale.com>
2024-06-25 20:30:03 -07:00
82079729cc [Bugfix] Fix assertion in NeuronExecutor (#5841) 2024-06-25 19:52:10 -07:00
c2a8ac75e0 [CI/Build] Add E2E tests for MLPSpeculator (#5791)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-26 00:04:08 +00:00
f178e56c68 [Hardware][TPU] Raise errors for unsupported sampling params (#5850) 2024-06-25 16:58:23 -07:00
dd793d1de5 [Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422) 2024-06-25 15:56:15 -07:00
bc34937d68 [Hardware][TPU] Refactor TPU backend (#5831) 2024-06-25 15:25:52 -07:00
dd248f7675 [Misc] Update w4a16 compressed-tensors support to include w8a16 (#5794) 2024-06-25 19:23:35 +00:00
d9b34baedd [CI/Build] Add unit testing for FlexibleArgumentParser (#5798) 2024-06-25 12:18:03 -07:00
c18ebfdd71 [doc][distributed] add both gloo and nccl tests (#5834) 2024-06-25 15:10:28 -04:00
67882dbb44 [Core] Add fault tolerance for RayTokenizerGroupPool (#5748) 2024-06-25 10:15:10 -07:00
7b99314301 [Misc] Remove useless code in cpu_worker (#5824) 2024-06-25 09:41:36 -07:00
2ce5d6688b [Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414) 2024-06-25 09:56:06 +00:00
f23871e9ee [Doc] Add notice about breaking changes to VLMs (#5818) 2024-06-25 01:25:03 -07:00
e9de9dd551 [ci] Remove aws template (#5757)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-24 21:09:02 -07:00
ba991d5c84 [Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args (#5795) 2024-06-24 17:01:19 -06:00
1744cc99ba [Doc] Add Phi-3-medium to list of supported models (#5788) 2024-06-24 10:48:55 -07:00
e72dc6cb35 [Doc] Add "Suggest edit" button to doc pages (#5789) 2024-06-24 10:26:17 -07:00
c246212952 [doc][faq] add warning to download models for every nodes (#5783) 2024-06-24 15:37:42 +08:00
edd5fe5fa2 [Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772) 2024-06-24 12:11:53 +08:00
5d4d90536f [Distributed] Add send and recv helpers (#5719) 2024-06-23 14:42:28 -07:00
6c916ac8a8 [BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-23 21:07:11 +00:00
832ea88fcb [core][distributed] improve shared memory broadcast (#5754) 2024-06-22 10:00:43 -07:00
8c00f9c15d [Docs][TPU] Add installation tip for TPU (#5761) 2024-06-21 23:09:40 -07:00
0cbc1d2b4f [Bugfix] Fix pin_lora error in TPU executor (#5760) 2024-06-21 22:25:14 -07:00
ff9ddbceee [Misc] Remove #4789 workaround left in vllm/entrypoints/openai/run_batch.py (#5756) 2024-06-22 03:33:12 +00:00
9c62db07ed [Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs (#5710)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-22 02:07:08 +00:00
cf90ae0123 [CI][Hardware][Intel GPU] add Intel GPU(XPU) ci pipeline (#5616) 2024-06-21 17:09:34 -07:00
f5dda63eb5 [LoRA] Add support for pinning lora adapters in the LRU cache (#5603) 2024-06-21 15:42:46 -07:00
7187507301 [ci][test] fix ca test in main (#5746) 2024-06-21 14:04:26 -07:00
f1e72cc19a [BugFix] exclude version 1.15.0 for modelscope (#5668) 2024-06-21 13:15:48 -06:00
5b15bde539 [Doc] Documentation on supported hardware for quantization methods (#5745) 2024-06-21 12:44:29 -04:00
bd620b01fb [Kernel][CPU] Add Quick gelu to CPU (#5717) 2024-06-21 06:39:40 +00:00
d9a252bc8e [Core][Distributed] add shm broadcast (#5399)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-06-21 05:12:35 +00:00
67005a07bc [Bugfix] Add fully sharded layer for QKVParallelLinearWithLora (#5665)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-21 04:46:28 +00:00
c35e4a3dd7 [BugFix] Fix test_phi3v.py (#5725) 2024-06-21 04:45:34 +00:00
1f5674218f [Kernel] Add punica dimension for Qwen2 LoRA (#5441) 2024-06-20 17:55:41 -07:00
b12518d3cf [Model] MLPSpeculator speculative decoding support (#4947)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>

Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Davis Wertheimer <Davis.Wertheimer@ibm.com>
2024-06-20 20:23:12 -04:00
6c5b7af152 [distributed][misc] use fork by default for mp (#5669) 2024-06-20 17:06:34 -07:00
8065a7e220 [Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 17:00:13 -06:00
3f3b6b2150 [Bugfix] Fix the CUDA version check for FP8 support in the CUTLASS kernels (#5715) 2024-06-20 18:36:10 +00:00
a7dcc62086 [Kernel] Update Cutlass int8 kernel configs for SM80 (#5275)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-20 13:33:21 +00:00
ad137cd111 [Model] Port over CLIPVisionModel for VLMs (#5591) 2024-06-20 11:52:09 +00:00
111af1fa2c [Kernel] Update Cutlass int8 kernel configs for SM90 (#5514)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-20 06:37:08 +00:00
1b2eaac316 [Bugfix][Doc] FIx Duplicate Explicit Target Name Errors (#5703) 2024-06-19 23:10:47 -07:00
3730a1c832 [Misc] Improve conftest (#5681) 2024-06-19 19:09:21 -07:00
949e49a685 [ci] Limit num gpus if specified for A100 (#5694)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-19 16:30:03 -07:00
4a30d7e3cc [Misc] Add per channel support for static activation quantization; update w8a8 schemes to share base classes (#5650) 2024-06-19 18:06:44 -04:00
e83db9e7e3 [Doc] Update docker references (#5614)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-06-19 15:01:45 -07:00
78687504f7 [Bugfix] AsyncLLMEngine hangs with asyncio.run (#5654) 2024-06-19 13:57:12 -07:00
d571ca0108 [ci][distributed] add tests for custom allreduce (#5689) 2024-06-19 20:16:04 +00:00
afed90a034 [Frontend][Bugfix] Fix preemption_mode -> preemption-mode for CLI arg in arg_utils.py (#5688) 2024-06-19 14:41:42 -04:00
3ee5c4bca5 [ci] Add A100 queue into AWS CI template (#5648)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-19 08:42:13 -06:00
e9c2732b97 [CI/Build] Add tqdm to dependencies (#5680) 2024-06-19 08:37:33 -06:00
d8714530d1 [Misc]Add param max-model-len in benchmark_latency.py (#5629) 2024-06-19 18:19:08 +08:00
7d46c8d378 [Bugfix] Fix sampling_params passed incorrectly in Phi3v example (#5684) 2024-06-19 17:58:32 +08:00
da971ec7a5 [Model] Add FP8 kv cache for Qwen2 (#5656) 2024-06-19 09:38:26 +00:00
3eea74889f [misc][distributed] use 127.0.0.1 for single-node (#5619) 2024-06-19 08:05:00 +00:00
f758aed0e8 [Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build bug which generate garbage on certain devices (#5641) 2024-06-18 23:21:29 -07:00
e5150f2c28 [Bugfix] Added test for sampling repetition penalty bug. (#5659)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-19 06:03:55 +00:00
59a1eb59c9 [Bugfix] Fix Phi-3 Long RoPE scaling implementation (#5628) 2024-06-19 01:46:38 +00:00
6820724e51 [Bugfix] Fix w8a8 benchmarks for int8 case (#5643) 2024-06-19 00:33:25 +00:00
b23ce92032 [Bugfix] Fix CUDA version check for mma warning suppression (#5642) 2024-06-18 23:48:49 +00:00
2bd231a7b7 [Doc] Added cerebrium as Integration option (#5553) 2024-06-18 15:56:59 -07:00
8a173382c8 [Bugfix] Fix for inconsistent behaviour related to sampling and repetition penalties (#5639)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-18 14:18:37 -07:00
07feecde1a [Model] LoRA support added for command-r (#5178) 2024-06-18 11:01:21 -07:00
19091efc44 [ci] Setup Release pipeline and build release wheels with cache (#5610)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-18 11:00:36 -07:00
95db455e7f [Misc] Add channel-wise quantization support for w8a8 dynamic per token activation quantization (#5542) 2024-06-18 12:45:05 -04:00
7879f24dcc [Misc] Add OpenTelemetry support (#4687)
This PR adds basic support for OpenTelemetry distributed tracing.
It includes changes to enable tracing functionality and improve monitoring capabilities.

I've also added a markdown with print-screens to guide users how to use this feature. You can find it here
2024-06-19 01:17:03 +09:00
13db4369d9 [ci] Deprecate original CI template (#5624)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-18 14:26:20 +00:00
4ad7b53e59 [CI/Build][Misc] Update Pytest Marker for VLMs (#5623) 2024-06-18 13:10:04 +00:00
f0cc0e68e3 [Misc] Remove import from transformers logging (#5625) 2024-06-18 12:12:19 +00:00
db5ec52ad7 [bugfix][distributed] improve p2p capability test (#5612)
[bugfix][distributed] do not error if two processes do not agree on p2p capability (#5612)
2024-06-18 07:21:05 +00:00
114d7270ff [CI] Avoid naming different metrics with the same name in performance benchmark (#5615) 2024-06-17 21:37:18 -07:00
32c86e494a [Misc] Fix typo (#5618) 2024-06-17 20:58:30 -07:00
8eadcf0b90 [misc][typo] fix typo (#5620) 2024-06-17 20:54:57 -07:00
5002175e80 [Kernel] Add punica dimensions for Granite 13b (#5559)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-06-18 03:54:11 +00:00
daef218b55 [Model] Initialize Phi-3-vision support (#4986) 2024-06-17 19:34:33 -07:00
fa9e385229 [Speculative Decoding 1/2 ] Add typical acceptance sampling as one of the sampling techniques in the verifier (#5131) 2024-06-17 21:29:09 -05:00
26e1188e51 [Fix] Use utf-8 encoding in entrypoints/openai/run_batch.py (#5606) 2024-06-17 23:16:10 +00:00
a3e8a05d4c [Bugfix] Fix KV head calculation for MPT models when using GQA (#5142) 2024-06-17 15:26:41 -07:00
e441bad674 [Optimization] use a pool to reuse LogicalTokenBlock.token_ids (#5584) 2024-06-17 22:08:05 +00:00
1b44aaf4e3 [bugfix][distributed] fix 16 gpus local rank arrangement (#5604) 2024-06-17 21:35:04 +00:00
9e4e6fe207 [CI] the readability of benchmarking and prepare for dashboard (#5571)
[CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571)
2024-06-17 11:41:08 -07:00
ab66536dbf [CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) 2024-06-17 14:36:10 -04:00
728c4c8a06 [Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814)
Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Abhilash Majumder <abhilash.majumder@intel.com>
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-06-17 11:01:25 -07:00
1f12122b17 [Misc] use AutoTokenizer for benchmark serving when vLLM not installed (#5588) 2024-06-17 09:40:35 -07:00
890d8d960b [Kernel] compressed-tensors marlin 24 support (#5435) 2024-06-17 12:32:48 -04:00
9e74d9d003 Correct alignment in the seq_len diagram. (#5592)
Co-authored-by: Liqian Chen <liqian.chen@deeplang.ai>
2024-06-17 12:05:33 -04:00
9333fb8eb9 [Model] Rename Phi3 rope scaling type (#5595) 2024-06-17 12:04:14 -04:00
e2b85cf86a Fix w8a8 benchmark and add Llama-3-8B (#5562) 2024-06-17 06:48:06 +00:00
845a3f26f9 [Doc] add debugging tips for crash and multi-node debugging (#5581) 2024-06-17 10:08:01 +08:00
f07d513320 [build][misc] limit numpy version (#5582) 2024-06-16 16:07:01 -07:00
4a6769053a [CI][BugFix] Flip is_quant_method_supported condition (#5577) 2024-06-16 14:07:34 +00:00
f31c1f90e3 Add basic correctness 2 GPU tests to 4 GPU pipeline (#5518) 2024-06-16 07:48:02 +00:00
3ce2c050dd [Fix] Correct OpenAI batch response format (#5554) 2024-06-15 16:57:54 -07:00
1c0afa13c5 [BugFix] Don't start a Ray cluster when not using Ray (#5570) 2024-06-15 16:30:51 -07:00
d919ecc771 add gptq_marlin test for bug report https://github.com/vllm-project/vllm/issues/5088 (#5145) 2024-06-15 13:38:16 -04:00
e691918e3b [misc] Do not allow to use lora with chunked prefill. (#5538)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-06-15 14:59:36 +00:00
81fbb3655f [CI/Build] Test both text and token IDs in batched OpenAI Completions API (#5568) 2024-06-15 07:29:42 -04:00
0e9164b40a [mypy] Enable type checking for test directory (#5017) 2024-06-15 04:45:31 +00:00
1b8a0d71cf [Core][Bugfix]: fix prefix caching for blockv2 (#5364)
Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-06-14 17:23:56 -07:00
bd7efe95d0 Add ccache to amd (#5555) 2024-06-14 17:18:22 -07:00
f5bb85b435 [Core][Distributed] improve p2p cache generation (#5528) 2024-06-14 14:47:45 -07:00
28c145eb57 [Bugfix] Fix typo in Pallas backend (#5558) 2024-06-14 14:40:09 -07:00
e2afb03c92 [Bugfix] Enable loading FP8 checkpoints for gpt_bigcode models (#5460)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-14 20:28:11 +00:00
6e2527a7cb [Doc] Update documentation on Tensorizer (#5471) 2024-06-14 11:27:57 -07:00
cdab68dcdb [Docs] Add ZhenFund as a Sponsor (#5548) 2024-06-14 11:17:21 -07:00
d1c3d7d139 [misc][distributed] fix benign error in is_in_the_same_node (#5512) 2024-06-14 10:59:28 -07:00
77490c6f2f [Core] Remove duplicate processing in async engine (#5525) 2024-06-14 10:04:42 -07:00
48f589e18b [mis] fix flaky test of test_cuda_device_count_stateless (#5546) 2024-06-14 10:02:23 -07:00
348616ac4b [Kernel] Suppress mma.sp warning on CUDA 12.5 and later (#5401) 2024-06-14 10:02:00 -07:00
15985680e2 [ Misc ] Rs/compressed tensors cleanup (#5432)
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
2024-06-14 10:01:46 -07:00
d74674bbd9 [Misc] Fix arg names (#5524) 2024-06-14 09:47:44 -07:00
703475f6c2 [Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (#5516) 2024-06-14 09:30:15 -07:00
d47af2bc02 [CI/Build] Disable LLaVA-NeXT CPU test (#5529) 2024-06-14 09:27:30 -07:00
319ad7f1d3 [CI/Build][Misc] Add CI that benchmarks vllm performance on those PRs with perf-benchmarks label (#5073)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-06-13 22:36:20 -07:00
0f0d8bc065 bump version to v0.5.0.post1 (#5522) 2024-06-13 19:42:06 -07:00
55d6361b13 [Misc] Fix arg names in quantizer script (#5507) 2024-06-13 19:02:53 -07:00
cd9c0d65d9 [Hardware][Intel] Support CPU inference with AVX2 ISA (#5452) 2024-06-13 17:22:24 -06:00
50eed24d25 Add cuda_device_count_stateless (#5473) 2024-06-13 16:06:49 -07:00
e38042d4af [Kernel] Disable CUTLASS kernels for fp8 (#5505) 2024-06-13 13:38:05 -07:00
33e3b37242 [CI/Build] Disable test_fp8.py (#5508) 2024-06-13 13:37:48 -07:00
1696efe6c9 [misc] fix format.sh (#5511) 2024-06-13 12:09:16 -07:00
6b0511a57b Revert "[Core] Remove unnecessary copies in flash attn backend" (#5478) 2024-06-13 11:22:50 -07:00
a8fda4f661 Seperate dev requirements into lint and test (#5474) 2024-06-13 11:22:41 -07:00
30299a41fa [MISC] Remove FP8 warning (#5472)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
2024-06-13 11:22:30 -07:00
85657b5607 [Kernel] Factor out epilogues from cutlass kernels (#5391)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: zifeitong <zifei.tong@parasail.io>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 11:22:19 -07:00
0ce7b952f8 [Doc] Update LLaVA docs (#5437)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-13 11:22:07 -07:00
39873476f8 [CI/Build] Simplify OpenAI server setup in tests (#5100) 2024-06-13 11:21:53 -07:00
03dccc886e [Misc] Add vLLM version getter to utils (#5098) 2024-06-13 11:21:39 -07:00
a65634d3ae [Docs] Add 4th meetup slides (#5509) 2024-06-13 10:18:26 -07:00
80aa7e91fc [Hardware][Intel] Optimize CPU backend and add more performance tips (#4971)
Co-authored-by: Jianan Gu <jianan.gu@intel.com>
2024-06-13 09:33:14 -07:00
bd43973522 [Kernel] Tune Qwen2MoE kernel configurations with tp2,4 (#5497)
Tune Qwen2-57B-A14B configs based on #4921

Throughput Performance
command: python benchmarks/benchmark_throughput.py --model=Qwen/Qwen2-57B-A14B-Instruct --input-len 1000 --output-len 50 -tp 2

A100 GPU

benchmark	no config	w/ PR
tp=2	10.53 requests/s, 11058.17 tokens/s	12.47 requests/s, 13088.57 tokens/s
tp=4	17.77 requests/s, 18662.95 tokens/s	20.20 requests/s, 21212.32 tokens/s
2024-06-13 09:01:10 -07:00
23ec72fa03 [CI/Build][REDO] Add is_quant_method_supported to control quantization test configurations (#5466) 2024-06-13 15:18:08 +00:00
c2637a613b [Kernel] w4a16 support for compressed-tensors (#5385)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 10:19:56 -04:00
88407532e7 [Bugfix]if the content is started with ":"(response of ping), client should i… (#5303)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 20:16:41 -07:00
916d219d62 [ci] Use sccache to build images (#5419)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 17:58:12 -07:00
ea3890a5f0 [Core][Distributed] code deduplication in tp&pp with coordinator(#5293)
[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
2024-06-12 17:27:08 -07:00
2135cacb45 [Bugfix] Fix wrong multi_modal_input format for CPU runner (#5451) 2024-06-12 16:20:18 -07:00
7d19de2e9c [Frontend] Add "input speed" to tqdm postfix alongside output speed (#5425) 2024-06-12 18:42:12 -04:00
94a07bbdd8 [Bugfix] Fix typo in scheduler.py (requeset -> request) (#5470) 2024-06-12 21:59:44 +00:00
b8d4dfff9c [Doc] Update debug docs (#5438) 2024-06-12 14:49:31 -07:00
622d45128c [misc] add hint for AttributeError (#5462) 2024-06-12 21:46:35 +00:00
51602eefd3 [Frontend] [Core] Support for sharded tensorized models (#4990)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 14:13:52 -07:00
5cc50a531f [Bugfix] TYPE_CHECKING for MultiModalData (#5444) 2024-06-12 14:08:52 -07:00
5985e3427d [Kernel] Vectorized FP8 quantize kernel (#5396)
Inspired by #5146, this PR improves FP8 quantize kernel by vectorizing data transfer to better utilize memory bandwidth. Microbenchmark shows that this improved kernel can achieve 1.0x-1.5x speedup (especially when hidden size is large).

In details, we applied 3 optimizations:

- Use inverted scale so that most divisions are changed to multiplications.
- Unroll the loop by 4 times to improve ILP.
- Use vectorized 4 to transfer data between HBM and SRAM.
2024-06-12 14:07:26 -07:00
8b82a89997 [ci] Add AMD, Neuron, Intel tests for AWS CI and turn off default soft fail for GPU tests (#5464)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 14:00:18 -07:00
c3c2903e72 [Bugfix] Add device assertion to TorchSDPA (#5402) 2024-06-12 12:58:53 -07:00
1a8bfd92d5 [Hardware] Initial TPU integration (#5292) 2024-06-12 11:53:03 -07:00
847cdcca1c [CI] Upgrade codespell version. (#5381) 2024-06-12 10:06:14 -07:00
e3c12bf6d2 Revert "[CI/Build] Add is_quant_method_supported to control quantization test configurations" (#5463) 2024-06-12 10:03:24 -07:00
3dd6853bc8 [CI/Build] Add is_quant_method_supported to control quantization test configurations (#5253) 2024-06-12 09:58:02 -07:00
8f89d72090 [Doc] add common case for long waiting time (#5430) 2024-06-11 11:12:13 -07:00
99dac099ab [Core][Doc] Default to multiprocessing for single-node distributed case (#5230)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-11 11:10:41 -07:00
c4bd03c7c5 [Core][Distributed] add same-node detection (#5369) 2024-06-11 10:53:59 -07:00
dcbf4286af [Frontend] Customizable RoPE theta (#5197) 2024-06-11 10:42:26 -07:00
00e6a2dc53 [Bugfix] fix lora_dtype value type in arg_utils.py (#5398) 2024-06-11 10:40:23 -07:00
2e02311a1b [Bugfix] Fix MultiprocessingGPUExecutor.check_health when world_size == 1 (#5254) 2024-06-11 10:38:07 -07:00
89ec06c33b [Docs] [Spec decode] Fix docs error in code example (#5427) 2024-06-11 10:31:56 -07:00
9fde251bf0 [Doc] Add an automatic prefix caching section in vllm documentation (#5324)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-06-11 10:24:59 -07:00
4c2ffb28ff [Speculative decoding] Initial spec decode docs (#5400) 2024-06-11 10:15:40 -07:00
246598a6b1 [CI] docfix (#5410)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: ywang96 <ywang@roblox.com>
2024-06-11 01:28:50 -07:00
8bab4959be [Misc] Remove VLLM_BUILD_WITH_NEURON env variable (#5389) 2024-06-11 00:37:56 -07:00
3c4cebf751 [Doc][Typo] Fixing Missing Comma (#5403) 2024-06-11 00:20:28 -07:00
d8f31f2f8b [Doc] add debugging tips (#5409) 2024-06-10 23:21:43 -07:00
640052b069 [Bugfix][Frontend] Cleanup "fix chat logprobs" (#5026) 2024-06-10 22:36:46 -07:00
351d5e7b82 [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (#5312)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-06-11 10:30:31 +08:00
a008629807 [Misc] Various simplifications and typing fixes (#5368) 2024-06-11 10:29:02 +08:00
76477a93b7 [ci] Fix Buildkite agent path (#5392)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 18:58:07 -07:00
77c87beb06 [Doc] Add documentation for FP8 W8A8 (#5388) 2024-06-10 18:55:12 -06:00
114332b88e Bump version to v0.5.0 (#5384) 2024-06-10 15:56:06 -07:00
cb77ad836f [Docs] Alphabetically sort sponsors (#5386) 2024-06-10 15:17:19 -05:00
856c990041 [Docs] Add Docs on Limitations of VLM Support (#5383) 2024-06-10 09:53:50 -07:00
c5602f0baa [ci] Mount buildkite agent on Docker container to upload benchmark results (#5330)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 09:22:34 -07:00
f7f9c5f97b [ci] Use small_cpu_queue for doc build (#5331)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 09:21:11 -07:00
2c0d933594 [Bugfix] Fix LLaVA-NeXT (#5380) 2024-06-10 15:38:47 +00:00
774d1035e4 [Feature][Frontend]: Continued stream_options implementation also in CompletionRequest (#5319) 2024-06-10 14:22:09 +00:00
6b29d6fe70 [Model] Initial support for LLaVA-NeXT (#4199)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-10 12:47:15 +00:00
0bfa1c4f13 [Misc] Improve error message when LoRA parsing fails (#5194) 2024-06-10 19:38:49 +08:00
c81da5f56d [misc][typo] fix typo (#5372) 2024-06-10 09:51:02 +00:00
68bc81703e [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (#5374) 2024-06-10 09:13:39 +00:00
5884c2b454 [Misc] Update to comply with the new compressed-tensors config (#5350)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-06-10 03:49:46 +00:00
45f92c00cf [Bugfix] Fix KeyError: 1 When Using LoRA adapters (#5164) 2024-06-09 16:23:14 -07:00
5467ac3196 [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00
5d7e3d0176 [mis][ci/test] fix flaky test in test_sharded_state_loader.py (#5361)
[mis][ci/test] fix flaky test in tests/test_sharded_state_loader.py (#5361)
2024-06-09 03:50:14 +00:00
0373e1837e [Core][CUDA Graph] add output buffer for cudagraph (#5074)
[Core][CUDA Graph] add output buffer for cudagraph to reduce memory footprint (#5074)
2024-06-08 19:14:43 -07:00
c09dade2a2 [Misc][Breaking] Change FP8 checkpoint format from act_scale -> input_scale (#5353) 2024-06-08 13:54:05 -04:00
8ea5e44a43 [CI/Test] improve robustness of test (vllm_runner) (#5357)
[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
2024-06-08 08:59:20 +00:00
9fb900f90c [CI/Test] improve robustness of test (hf_runner) (#5347)
[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
2024-06-07 22:31:32 -07:00
c96fc06747 [ROCm][AMD] Use pytorch sdpa math backend to do naive attention (#4965) 2024-06-07 19:13:12 -07:00
b3376e5c76 [Misc] Add args for selecting distributed executor to benchmarks (#5335) 2024-06-08 09:20:16 +08:00
e69ded7d1c [Bug Fix] Fix the support check for FP8 CUTLASS (#5352)
Bug description:
With torch 2.4.0.dev20240603+cu121,
cutlass_fp8_supported outputs False, and the (capability, version) before the comparison is (90, 11111111112)

This PR fixes the support check for FP8 CUTLASS ( cutlass_fp8_supported) which was introduced in https://github.com/vllm-project/vllm/pull/5183.
2024-06-08 00:42:05 +00:00
767c727a81 fix DbrxFusedNormAttention missing cache_config (#5340)
Co-authored-by: team <calvinn.ng@ahrefs.com>
2024-06-07 14:10:21 -07:00
6840a71610 [Misc] Remove unused cuda_utils.h in CPU backend (#5345) 2024-06-07 14:09:13 -07:00
7a9cb294ae [Frontend] Add OpenAI Vision API Support (#5237)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-06-07 11:23:32 -07:00
ca3ea51bde [Kernel] Dynamic Per-Token Activation Quantization (#5037)
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-07 09:36:26 -07:00
dc49fb892c Addition of lacked ignored_seq_groups in _schedule_chunked_prefill (#5296) 2024-06-07 13:35:42 +00:00
18a277b52d Remove Ray health check (#4693) 2024-06-07 10:01:56 +00:00
8d75fe48ca [Kernel] Switch fp8 layers to use the CUTLASS kernels (#5183)
Switching from torch._scaled_mm to vLLM's cutlass fp8 kernels when supported as we are seeing 5-15% improvement in e2e performance on neuralmagic/Meta-Llama-3-8B-Instruct-FP8

see https://docs.google.com/spreadsheets/d/1GiAnmzyGHgZ6zL_LDSTm35Bdrt4A8AaFEurDlISYYA4/ for some quick e2e benchmarks and #5144 for comparisons across different GEMM sizes.
2024-06-07 08:42:35 +00:00
388596c914 [Misc][Utils] allow get_open_port to be called for multiple times (#5333) 2024-06-06 22:15:11 -07:00
baa15a9ec3 [Feature][Frontend]: Add support for stream_options in ChatCompletionRequest (#5135) 2024-06-07 03:29:24 +00:00
15063741e3 [Misc] Missing error message for custom ops import (#5282) 2024-06-06 20:17:21 -07:00
ccdc490dda [Core] Change LoRA embedding sharding to support loading methods (#5038) 2024-06-06 19:07:57 -07:00
a31cab7556 [Core] Avoid copying prompt/output tokens if no penalties are used (#5289) 2024-06-06 18:12:00 -07:00
828da0d44e [Frontend] enable passing multiple LoRA adapters at once to generate() (#5300) 2024-06-06 15:48:13 -05:00
abe855d637 [Kernel] Retune Mixtral 8x22b configs for FP8 on H100 (#5294) 2024-06-06 09:29:29 -07:00
4efff036f0 Bugfix: fix broken of download models from modelscope (#5233)
Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
2024-06-06 09:28:10 -07:00
89c920785f [CI/Build] Update vision tests (#5307) 2024-06-06 05:17:18 -05:00
7b0a0dfb22 [Frontend][Core] Update Outlines Integration from FSM to Guide (#4109)
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Breno Faria <breno.faria@intrafind.com>
2024-06-05 16:49:12 -07:00
3a6ae1d33c [CI] Disable flash_attn backend for spec decode (#5286) 2024-06-05 15:49:27 -07:00
8f1729b829 [Docs] Add Ray Summit CFP (#5295) 2024-06-05 15:25:18 -07:00
6a7c7711a2 [Misc] Skip for logits_scale == 1.0 (#5291) 2024-06-05 15:19:02 -07:00
0f83ddd4d7 [Bugfix][Frontend/Core] Don't log exception when AsyncLLMEngine gracefully shuts down. (#5290) 2024-06-05 15:18:12 -07:00
065aff6c16 [Bugfix] Make EngineArgs use named arguments for config construction (#5285) 2024-06-05 15:16:56 -07:00
3d33e372a1 [BugFix] Fix log message about default max model length (#5284) 2024-06-05 14:53:16 -07:00
faf71bcd4b [Speculative Decoding] Add ProposerWorkerBase abstract class (#5252) 2024-06-05 14:53:05 -07:00
f270a39537 [Docs] Add Sequoia as sponsors (#5287) 2024-06-05 18:02:56 +00:00
51a08e7d8f [Kernel] Re-tune Mixtral MoE configurations for FP8 on H100 (#5238) 2024-06-05 10:59:14 -07:00
eb8fcd2666 [BugFix] Apply get_cached_tokenizer to the tokenizer setter of LLM (#5207)
Co-authored-by: qiujiawei9 <qiujiawei9@jd.com>
2024-06-05 10:59:02 -07:00
5563a4dea8 [Model] Correct Mixtral FP8 checkpoint loading (#5231) 2024-06-05 10:58:50 -07:00
ccd4f129e8 [Kernel] Add GPU architecture guards to the CUTLASS w8a8 kernels to reduce binary size (#5157)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-06-05 10:44:15 -07:00
02cc3b51a7 [misc] benchmark_serving.py -- add ITL results and tweak TPOT results (#5263) 2024-06-05 10:17:51 -07:00
d5b1eb081e [CI] Add nightly benchmarks (#5260) 2024-06-05 09:42:08 -07:00
f0a500545f [Frontend] OpenAI API server: Add add_special_tokens to ChatCompletionRequest (default False) (#5278) 2024-06-05 09:32:58 -07:00
c65146e75e [Misc] Fix docstring of get_attn_backend (#5271) 2024-06-05 09:18:59 -07:00
41ca62cf03 [Misc] Add CustomOp interface for device portability (#5255) 2024-06-05 09:18:19 -07:00
974fc9b845 [Bugfix] Fix prompt_logprobs when SamplingParams.detokenize is set to True (#5226) 2024-06-04 19:37:28 -07:00
fee4dcc33a [Misc] update collect env (#5261) 2024-06-04 17:29:09 -05:00
650a4cc55e [Misc] Add transformers version to collect_env.py (#5259) 2024-06-04 12:52:28 -07:00
9ca62d8668 [CI] mark AMD test as softfail to prevent blockage (#5256) 2024-06-04 11:34:53 -07:00
45c35f0d58 [CI/Build] Reducing CPU CI execution time (#5241) 2024-06-04 10:26:40 -07:00
9ba093b4f4 [CI/Build] Simplify model loading for HfRunner (#5251) 2024-06-04 10:09:19 -07:00
27208be66e [Kernel] Add back batch size 1536 and 3072 to MoE tuning (#5242) 2024-06-04 09:58:47 -07:00
87d5abef75 [Bugfix] Fix a bug caused by pip install setuptools>=49.4.0 for CPU backend (#5249) 2024-06-04 09:57:51 -07:00
ec784b2526 [CI/Build] Add inputs tests (#5215) 2024-06-03 21:01:46 -07:00
a58f24e590 [Bugfix] Fix torch.compile() error when using MultiprocessingGPUExecutor (#5229) 2024-06-03 20:55:50 -07:00
f42a006b15 [Bugfix]: During testing, use pytest monkeypatch for safely overriding the env var that indicates the vLLM backend (#5210) 2024-06-03 20:32:57 -07:00
3a434b07ed [Kernel] Enhance MoE benchmarking & tuning script (#4921) 2024-06-03 20:06:59 -07:00
bd0e7802e0 [Bugfix] Add warmup for prefix caching example (#5235) 2024-06-03 19:36:41 -07:00
06b2550cbb [Bugfix] Support prompt_logprobs==0 (#5217) 2024-06-03 17:59:30 -07:00
f775a07e30 [FRONTEND] OpenAI tools support named functions (#5032) 2024-06-03 18:25:29 -05:00
4f0d17c05c New CI template on AWS stack (#5110)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-03 16:16:43 -07:00
10c38e3e46 [Misc]: Implement CPU/GPU swapping in BlockManagerV2 (#3834) 2024-06-03 13:37:11 -07:00
cafb8e06c5 [CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-03 10:39:50 -07:00
cbb2f59cc8 [Kernel] Pass a device pointer into the quantize kernel for the scales (#5159) 2024-06-03 09:52:30 -07:00
0ab278ca31 [Core] Remove unnecessary copies in flash attn backend (#5138) 2024-06-03 09:39:31 -07:00
7a64d24aad [Core] Support image processor (#4197) 2024-06-02 22:56:41 -07:00
dfbe60dc62 [Misc] Simplify code and fix type annotations in conftest.py (#5118) 2024-06-02 16:05:50 -07:00
a66cf40b20 [Kernel][ROCm][AMD] enable fused topk_softmax kernel for moe layer (#4927)
This PR enables the fused topk_softmax kernel used in moe layer for HIP
2024-06-02 14:13:26 -07:00
f790ad3c50 [Frontend][OpenAI] Support for returning max_model_len on /v1/models response (#4643) 2024-06-02 08:06:13 +00:00
ed59a7ed23 Update test_ignore_eos (#4898) 2024-06-02 02:21:53 +00:00
044793d8df [BugFix] Prevent LLM.encode for non-generation Models (#5184)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-06-01 23:35:41 +00:00
c2d6d2f960 [Bugfix]: Fix issues related to prefix caching example (#5177) (#5180) 2024-06-01 15:53:52 -07:00
8279078e21 [Bugfix] Remove deprecated @abstractproperty (#5174) 2024-06-01 22:40:25 +00:00
b9c0605a8e [Feature][Kernel] Support bitsandbytes quantization and QLoRA (#4776) 2024-06-01 14:51:10 -06:00
37464a0f74 [Bugfix] Fix call to init_logger in openai server (#4765) 2024-06-01 17:18:50 +00:00
c354072828 [Minor] Fix the path typo in loader.py: save_sharded_states.py -> save_sharded_state.py (#5151)
Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
2024-06-01 17:11:22 +00:00
f081c3ce4b [Kernel] Update Cutlass fp8 configs (#5144)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-01 08:46:07 +00:00
260d119e86 [Kernel] Refactor CUTLASS kernels to always take scales that reside on the GPU (#5137) 2024-06-01 06:45:32 +00:00
a360ff80bb [CI/Build] CMakeLists: build all extensions' cmake targets at the same time (#5034) 2024-05-31 22:06:45 -06:00
1197e02141 [Build] Guard against older CUDA versions when building CUTLASS 3.x kernels (#5168) 2024-05-31 17:21:38 -07:00
657579113f [Doc] Add checkmark for GPTBigCodeForCausalLM LoRA support (#5171) 2024-05-31 17:20:19 -07:00
e9899fb7a4 [Model] Enable FP8 QKV in MoE and refine kernel tuning script (#5039) 2024-05-31 14:29:19 -07:00
a377f0bd5e [Misc]: optimize eager mode host time (#4196)
Co-authored-by: xuhao <xuhao@cambricon.com>
2024-05-31 13:14:50 +08:00
e9d3aa04f6 Revert "[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5)" (#5149) 2024-05-30 22:00:26 -07:00
a22dea54d3 [Model] Support MAP-NEO model (#5081)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-05-30 19:24:41 -07:00
533c217792 Fix cutlass sm_90a vesrion in CMakeList 2024-05-31 02:13:01 +00:00
6d21fa1cad [Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5) (#5136) 2024-05-30 21:02:11 -05:00
b35be5403f [Bugfix] Avoid Warnings in SparseML Activation Quantization (#5120) 2024-05-30 17:04:37 -07:00
45a1a69b98 [Build] Disable sm_90a in cu11 (#5141) 2024-05-30 14:37:16 -07:00
87a658c812 Bump version to v0.4.3 (#5046) 2024-05-30 11:13:46 -07:00
429d89720e add doc about serving option on dstack (#3074)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-05-30 10:11:07 -07:00
a9bcc7afb2 [Doc] Use intersphinx and update entrypoints docs (#5125) 2024-05-30 09:59:23 -07:00
d79d9eaaff [Misc] remove duplicate definition of seq_lens_tensor in model_runner.py (#5129) 2024-05-30 06:56:19 -07:00
f758505c73 [CI/Build] increase wheel size limit to 200 MB (#5130) 2024-05-30 06:29:48 -07:00
d910816c73 [Bugfix] Automatically Detect SparseML models (#5119) 2024-05-30 12:58:37 +00:00
87d41c849d [BUGFIX] [FRONTEND] Correct chat logprobs (#5029)
Co-authored-by: Breno Faria <breno.faria@intrafind.com>
2024-05-30 02:52:14 -07:00
e07aff9e52 [CI/Build] Docker cleanup functionality for amd servers (#5112)
Co-authored-by: Alexey Kondratiev <alexey.kondratiev@amd.com>
Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Co-authored-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Co-authored-by: omkarkakarparthi <okakarpa>
2024-05-30 03:27:39 +00:00
5bf185a1c4 [Bugfix] gptq_marlin: Ensure g_idx_sort_indices is not a Parameter (#5108) 2024-05-30 00:30:18 +00:00
4fbcb0f27e [Doc][Build] update after removing vllm-nccl (#5103)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-05-29 23:51:18 +00:00
7c3604fb68 [Bugfix] logprobs is not compatible with the OpenAI spec #4795 (#5031) 2024-05-29 16:13:22 -07:00
b1c255630d [Core] Avoid the need to pass None values to Sequence.inputs (#5099) 2024-05-29 16:05:01 -07:00
eb6c50cdc2 [Bugfix][CI/Build] Fix codespell failing to skip files in git diff (#5097) 2024-05-29 16:02:54 -07:00
eecd864388 [Bugfix][CI/Build] Fix test and improve code for merge_async_iterators (#5096) 2024-05-29 16:02:25 -07:00
ae495c74ea [Doc]Replace deprecated flag in readme (#4526) 2024-05-29 22:26:33 +00:00
4238bc82f2 [Core] Cross-attention KV caching and memory-management (towards eventual encoder/decoder model support) (#4837) 2024-05-29 16:09:13 +00:00
594392d27a [Core][Distributed] improve p2p access check (#4992) 2024-05-29 11:29:07 +00:00
18c1f16d86 [Bugfix] Fix arguments passed to Sequence in stop checker test (#5092) 2024-05-29 07:16:41 +00:00
5bd3c65072 [Core][Optimization] remove vllm-nccl (#5091) 2024-05-29 05:13:52 +00:00
616e600e0b [Misc] add gpu_memory_utilization arg (#5079)
Signed-off-by: pandyamarut <pandyamarut@gmail.com>
2024-05-28 17:16:18 -07:00
dfba529b40 [Bugfix] Remove the last EOS token unless explicitly specified (#5077) 2024-05-28 17:15:35 -07:00
5ae5ed1e60 [Core] Consolidate prompt arguments to LLM engines (#4328)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-05-28 13:29:31 -07:00
290f4ada2b [Docs] Add Dropbox as sponsors (#5089) 2024-05-28 10:29:09 -07:00
dd8de11f0a [Kernel][ROCm][AMD] Add fused_moe Triton configs for MI300X (#4951)
This PR adds Triton kernel configs for the MoE kernel for MI300X
2024-05-28 16:03:23 +00:00
9ba415588a [BugFix] Fix Embedding Models with TP>1 (#5075) 2024-05-28 08:32:42 -07:00
d4f3985907 [Core] Sliding window for block manager v2 (#4545)
Co-authored-by: Ruth Evans <ruthevans@Ruths-MacBook-Pro.local>
2024-05-28 11:07:07 +09:00
890aa93d27 [Model] Add support for falcon-11B (#5069) 2024-05-27 16:41:43 -07:00
fbdb7b3ee2 [Core] Allow AQLM on Pascal (#5058) 2024-05-27 15:26:14 -07:00
1102bef219 [Bugfix / Core] Prefix Caching Guards (merged with main) (#4846)
Co-authored-by: rsnm2 <rshaw@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-27 15:18:17 -07:00
f17a1a8f96 [Misc] Make Serving Benchmark More User-friendly (#5044) 2024-05-25 17:28:16 +00:00
d5a1697772 [Dynamic Spec Decoding] Minor fix for disabling speculative decoding (#5000) 2024-05-25 10:00:14 -07:00
325c119961 [Misc] add logging level env var (#5045) 2024-05-24 23:49:49 -07:00
8e192ff967 [Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3-Small model (#4799)
Co-authored-by: beagleski <yunanzhang@microsoft.com>
Co-authored-by: bapatra <bapatra@microsoft.com>
Co-authored-by: Barun Patra <codedecde@users.noreply.github.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-24 22:00:52 -07:00
e64fde4b01 [Core][Bugfix]: fix prefix caching for blockv2 (#4764)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-24 10:07:09 -07:00
919770957f [Bugfix] Fix Mistral v0.3 Weight Loading (#5005)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-24 12:28:27 +00:00
6a50f4cafa [Doc] add ccache guide in doc (#5012)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-23 23:21:54 +00:00
e3470f8753 [Core]: Option To Use Prompt Token Ids Inside Logits Processor (#4985)
Co-authored-by: Elisei Smirnov <el.smirnov@innopolis.university>
2024-05-23 22:04:24 +00:00
a1242324c9 [Kernel] Initial Activation Quantization Support (#4525)
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-05-23 21:29:18 +00:00
5eda2ea02a [Core][1/N] Support send/recv in PyNCCL Groups (#4988)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-05-23 09:54:48 -07:00
2ba80bed27 [Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is not defined (#5009) 2024-05-23 09:08:58 -07:00
6066253296 Marlin 24 prefill performance improvement (about 25% better on average) (#4983) 2024-05-23 02:39:27 -04:00
ee3eea0a1b [Misc] Take user preference in attention selector (#4960) 2024-05-23 07:55:56 +09:00
a36de682d4 [Minor] Fix small typo in llama.py: QKVParallelLinear -> QuantizationConfig (#4991) 2024-05-22 22:26:56 +00:00
eb6d3c264d [Core] Eliminate parallel worker per-step task scheduling overhead (#4894) 2024-05-23 06:17:27 +09:00
97b030005c [Model] LoRA gptbigcode implementation (#3949) 2024-05-22 13:58:59 -07:00
a3a73ab069 [Misc] Load FP8 kv-cache scaling factors from checkpoints (#4893)
The 2nd PR for #4532.

This PR supports loading FP8 kv-cache scaling factors from a FP8 checkpoint (with .kv_scale parameter).
2024-05-22 13:28:20 -07:00
8674f9880e [Kernel] Fixup for CUTLASS kernels in CUDA graphs (#4954)
Pass the CUDA stream into the CUTLASS GEMMs, to avoid future issues with CUDA graphs
2024-05-22 14:10:43 +00:00
c74c913bfb [misc] remove comments that were supposed to be removed (#4977) 2024-05-22 09:02:58 -04:00
5f6d10c14c [CI/Build] Enforce style for C++ and CUDA code with clang-format (#4722) 2024-05-22 07:18:41 +00:00
9b9a10d6cb [Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 01:32:35 -04:00
99eff67ba9 [Bugfix][Kernel] Add head size check for attention backend selection (#4944) 2024-05-21 15:33:25 -04:00
14772eeb8e [Bugfix] Fix flag name for max_seq_len_to_capture (#4935)
Signed-off-by: kerthcet <kerthcet@gmail.com>
2024-05-21 09:30:52 -07:00
757b62c495 [CI/Build] Codespell ignore build/ directory (#4945) 2024-05-21 09:06:10 -07:00
e941f88584 [Docs] Add acknowledgment for sponsors (#4925) 2024-05-21 00:17:25 -07:00
f12c3b5b3d [Model] Add Phi-2 LoRA support (#4886) 2024-05-21 14:24:17 +09:00
d130b573a0 [Model] add rope_scaling support for qwen2 (#4930) 2024-05-21 05:22:22 +00:00
65ae8c2c8f [Core] Fix scheduler considering "no LoRA" as "LoRA" (#4897) 2024-05-20 17:48:32 -07:00
c3af44722c [Doc]Add documentation to benchmarking script when running TGI (#4920) 2024-05-20 20:16:57 +00:00
1937e29848 [Core] Sharded State Loader download from HF (#4889) 2024-05-20 11:46:12 -07:00
f0eecee610 [Bugfix] Fix dummy weight for fp8 (#4916)
Allow dummy load format for fp8,
torch.uniform_ doesn't support FP8 at the moment

Co-authored-by: Mor Zusman <morz@ai21.com>
2024-05-20 18:44:25 +00:00
943e72ca56 [Build/CI] Enabling AMD Entrypoints Test (#4834)
Co-authored-by: Alexey Kondratiev <alexey.kondratiev@amd.com>
2024-05-20 11:29:28 -07:00
546a97ef69 [Misc]: allow user to specify port in distributed setting (#4914) 2024-05-20 17:45:06 +00:00
da5a0b539d Remove marlin warning (#4918) 2024-05-20 14:55:34 +00:00
6287537a0c [Model] LLaVA model refactor (#4910) 2024-05-20 08:11:25 +00:00
b57e6c5949 [Kernel] Add flash-attn back (#4907) 2024-05-19 18:11:30 -07:00
27ce85476e [Kernel] Add marlin_24 unit tests (#4901) 2024-05-19 11:37:34 -04:00
f68470e803 [Bugfix][Model] Add base class for vision-language models (#4809) 2024-05-19 00:13:33 -07:00
2e9a2227ec [Lora] Support long context lora (#4787)
Currently we need to call rotary embedding kernel for each LoRA, which makes it hard to serve multiple long context length LoRA. Add batched rotary embedding kernel and pipe it through.

It replaces the rotary embedding layer to the one that is aware of multiple cos-sin-cache per scaling factors.

Follow up of https://github.com/vllm-project/vllm/pull/3095/files
2024-05-18 16:05:23 +09:00
c0724fc915 [ROCm][Hardware][AMD] Adding Navi21 to fallback to naive attention if Triton is not used (#4658) 2024-05-18 05:09:11 +00:00
86b45ae065 [Bugfix] Relax tiktoken to >= 0.6.0 (#4890) 2024-05-17 12:58:52 -06:00
c5711ef985 [Doc] Update Ray Data distributed offline inference example (#4871) 2024-05-17 10:52:11 -07:00
48d5985a08 Sync huggingface modifications of qwen Moe model (#4774) 2024-05-17 09:43:19 -07:00
33e0823de5 [Bugfix] fix rope error when load models with different dtypes (#4835) 2024-05-17 18:43:34 +09:00
26148120b3 [Build/CI] Extending the set of AMD tests with Regression, Basic Correctness, Distributed, Engine, Llava Tests (#4797) 2024-05-16 20:58:25 -07:00
0150a10630 [Frontend] OpenAI API server: Do not add bos token by default when encoding (#4688) 2024-05-16 18:47:22 -07:00
8e7fb5d43a Support to serve vLLM on Kubernetes with LWS (#4829)
Signed-off-by: kerthcet <kerthcet@gmail.com>
2024-05-16 16:37:29 -07:00
9a31a817a8 [Bugfix] Fix FP8 KV cache support (#4869) 2024-05-16 22:42:29 +00:00
2060e93659 [Kernel] Add w8a8 CUTLASS kernels (#4749) 2024-05-16 18:32:50 -04:00
8435b207af [Kernel] Add punica dimension for Qwen1.5-32B LoRA (#4850)
Co-authored-by: Silencio <silencio@adsl-99-6-187-6.dsl.irvnca.sbcglobal.net>
2024-05-16 11:16:09 -07:00
10fa9eea21 [Misc] remove old comments (#4866) 2024-05-16 11:07:41 -07:00
e08188081b [Core][Distributed] remove graph mode function (#4818) 2024-05-16 10:59:52 -07:00
b5853f9963 [ROCm][AMD][Bugfix] adding a missing triton autotune config (#4845) 2024-05-16 10:46:52 -07:00
f09edd8a25 Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00
6979ade384 Add GPTQ Marlin 2:4 sparse structured support (#4790)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-05-16 12:56:15 -04:00
9216b9cc38 [Bugfix] Bypass authorization API token for preflight requests (#4862) 2024-05-16 09:42:21 -07:00
5e0391c040 [Frontend] Separate OpenAI Batch Runner usage from API Server (#4851) 2024-05-17 00:42:41 +09:00
dbc0754ddf [docs] Fix typo in examples filename openi -> openai (#4864) 2024-05-17 00:42:17 +09:00
99caa49106 [Kernel] add bfloat16 support for gptq marlin kernel (#4788) 2024-05-16 09:55:29 -04:00
5c342570d7 Add marlin unit tests and marlin benchmark script (#4815) 2024-05-16 09:36:49 -04:00
973617ae02 [Speculative decoding][Re-take] Enable TP>1 speculative decoding (#4840)
Co-authored-by: Cade Daniel <edacih@gmail.com>
Co-authored-by: Cade Daniel <cade@anyscale.com>
2024-05-16 00:53:51 -07:00
30e754390c [Core] Implement sharded state loader (#4690)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-05-15 22:11:54 -07:00
52f8107cf2 [Frontend] Support OpenAI batch file format (#4794)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-15 19:13:36 -04:00
fc0d9dfc3a [Frontend] Re-enable custom roles in Chat Completions API (#4758) 2024-05-15 14:58:46 -07:00
361c461a12 [Doc] Highlight the fourth meetup in the README (#4842) 2024-05-15 11:38:49 -07:00
a5675d348b [Bugfix] Properly set distributed_executor_backend in ParallelConfig (#4816) 2024-05-15 07:22:09 -07:00
e9cdd2b1e2 [CI/Build] Further decouple HuggingFace implementation from ours during tests (#4166) 2024-05-14 23:38:40 -07:00
65bf2ac165 [Core][2/N] Model runner refactoring part 2. Combine prepare prefill / decode to a single API (#4681)
This PR combines prepare_prompt and prepare_decode into a single API. This PR also coelsce the attn metadata for prefill/decode to a single class and allow to slice them when running attn backend.

It also refactors subquery_start_loc which was not refactored in the previous PR
2024-05-15 14:00:10 +09:00
8a7cc254a0 Revert "[Kernel] Use flash-attn for decoding (#3648)" (#4820)
Lora 3 & 4 test seems to have illegal memory access failure after this commit;

[2024-05-14 23:51:18,182 E 22 22] logging.cc:101: Unhandled exception: N3c105ErrorE. what(): CUDA error: an illegal memory access was encountered
<br class="Apple-interchange-newline">
Exmaple: https://buildkite.com/vllm/ci/builds/7382#018f793d-1527-4e1c-ab59-c3a34ec55241

This reverts commit 1356df5.

FILL IN THE PR DESCRIPTION HERE

FIX #xxxx (link existing issues this PR will resolve)
2024-05-15 11:52:45 +09:00
29bc01bf3b Add 4th meetup announcement to readme (#4817) 2024-05-14 18:33:06 -04:00
676a99982f [Core] Add MultiprocessingGPUExecutor (#4539)
Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com>
2024-05-14 10:38:59 -07:00
dc72402b57 [Bugfix][Doc] Fix CI failure in docs (#4804)
This PR fixes the CI failure introduced by #4798.

The failure originates from having duplicate target names in reST, and is fixed by changing the ref targets to anonymous ones. For more information, see this discussion.

I have also changed the format of the links to be more distinct from each other.
2024-05-15 01:57:08 +09:00
ccb63a8245 [Core][Hash][Automatic Prefix caching] Accelerating the hashing function by avoiding deep copies (#4696) 2024-05-14 21:34:33 +09:00
c579b750a0 [Doc] Add meetups to the doc (#4798) 2024-05-13 18:48:00 -07:00
4bfa7e7f75 [Doc] Add API reference for offline inference (#4710) 2024-05-13 17:47:42 -07:00
ac1fbf7fd2 [Doc] Shorten README by removing supported model list (#4796) 2024-05-13 16:23:54 -07:00
33d3914b1e [Bugfix] Fix dynamic FP8 quantization for Mixtral (#4793) 2024-05-13 19:00:27 -04:00
1356df53bd [Kernel] Use flash-attn for decoding (#3648)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2024-05-13 15:50:33 -07:00
ce532ff45c [Speculative decoding] Improve n-gram efficiency (#4724) 2024-05-13 15:00:13 -07:00
8bc68e198c [Frontend] [Core] perf: Automatically detect vLLM-tensorized model, update tensorizer to version 2.9.0 (#4208) 2024-05-13 14:57:07 -07:00
0fca3cdcf2 [Misc] Enhance attention selector (#4751) 2024-05-13 10:47:25 -07:00
e7c46b9527 [Scheduler] Warning upon preemption and Swapping (#4647)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-13 23:50:44 +09:00
350f9e107f [CI/Build] Move test_utils.py to tests/utils.py (#4425)
Since #4335 was merged, I've noticed that the definition of ServerRunner in the tests is the same as in the test for OpenAI API. I have moved the class to the test utilities to avoid code duplication. (Although it only has been repeated twice so far, I will add another similar test suite in #4200 which would duplicate the code a third time)

Also, I have moved the test utilities file (test_utils.py) to under the test directory (tests/utils.py), since none of its code is actually used in the main package. Note that I have added __init__.py to each test subpackage and updated the ray.init() call in the test utilities file in order to relative import tests/utils.py.
2024-05-13 23:50:09 +09:00
702bee461f [Core][Distributed] refactor custom allreduce to support multiple tp groups (#4754) 2024-05-12 17:47:59 -07:00
a7be4d0072 [CORE] Improvement in ranks code (#4718) 2024-05-12 17:47:47 -07:00
a709e87a4f [CI/Build] Tweak Marlin Nondeterminism Issues (#4713) 2024-05-12 17:46:31 -07:00
6eaccb7353 [Model] Add support for IBM Granite Code models (#4636) 2024-05-11 21:27:24 -07:00
e254497b66 [Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734) 2024-05-11 11:30:37 -07:00
4e12131089 [Core][Test] fix function name typo in custom allreduce (#4750) 2024-05-10 15:14:40 -07:00
fcc2994be6 [CI] Nits for bad initialization of SeqGroup in testing (#4748) 2024-05-10 18:01:01 -04:00
2e7796f2cf [Speculative decoding] CUDA graph support (#4295)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-10 17:36:25 +00:00
706588a77d [Bugfix] Fix CLI arguments in OpenAI server docs (#4729) 2024-05-11 00:00:56 +09:00
6a0f617210 [Core] Fix circular reference which leaked llm instance in local dev env (#4737)
Storing exception frame is extremely prone to circular refernece because it contains the reference to objects.

When tensorizer is not installed, it leaks llm instance because error frame has references to various modules which cause circular reference problem.

I also found spec decoding has a circular reference issue, and I solved it using weakref.proxy.
2024-05-10 23:54:32 +09:00
dac6a3f6ed [Misc] Apply a couple g++ cleanups (#4719) 2024-05-10 13:37:05 +00:00
64b77dfd7e [Core]fix type annotation for swap_blocks (#4726) 2024-05-10 21:52:48 +09:00
51d4094fda chunked-prefill-doc-syntax (#4603)
Fix the docs: https://docs.vllm.ai/en/latest/models/performance.html

Co-authored-by: sang <rkooo567@gmail.com>
2024-05-10 14:13:23 +09:00
e965d46184 [Misc] Keep only one implementation of the create_dummy_prompt function. (#4716) 2024-05-09 21:42:38 -07:00
208b71bcc1 [Core][Distributed] refactor pynccl (#4591)
[Core][Distributed] refactor pynccl to hold multiple communicators (#4591)
2024-05-09 19:48:43 -07:00
c833101740 [Kernel] Refactor FP8 kv-cache with NVIDIA float8_e4m3 support (#4535) 2024-05-09 18:04:17 -06:00
379da6dcb5 [Kernel] [FP8] Improve FP8 linear layer performance (#4691)
This PR improves the FP8 performance of linear layers, which had been lacking before (#4118 (comment) and #4118 (comment)).

We noticed that CUBLASLt can find a better algorithm if the first dimension of the matrix is greater than 16. So this PR enlarges matrices appropriately during quantization. This improves FP8 performance and removes the performance regression vs. FP16, in many cases exceeding FP16 performance.

Here are benchmarks on llama3 70b (ITL numbers for 1000 input and 50 output tokens at fixed qps and at TP 4), all FP8 measurements are for dynamic quantization:

qps = 1: 24 ms (FP8, this PR), 32 ms (FP8, previous main), 26 ms (FP16)
qps = 2: 26 ms (FP8, this PR), 34ms (FP8, previous main), 28 ms (FP16) 
qps = 4: 33 ms (FP8, this PR), 44 ms (FP8, previous main), 36 ms (FP16)
qps = 6: 46 ms (FP8, this PR), 56 ms (FP8, previous main), 54 ms (FP16)
qps = 8: 85 ms (FP8, this PR), 85 ms (FP8, previous main), 138 ms (FP16)
2024-05-09 16:38:07 -07:00
ebce310b74 [Model] Snowflake arctic model implementation (#4652)
Co-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>
Co-authored-by: Aurick Qiao <qiao@aurick.net>
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
Co-authored-by: Aurick Qiao <aurickq@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-09 22:37:14 +00:00
be0c5180ac [Bugfix] Add logs for all model dtype casting (#4717) 2024-05-09 18:36:25 +00:00
cea64430f6 [Bugfix] Update grafana.json (#4711) 2024-05-09 10:10:13 -07:00
a3c124570a [Bugfix] Fix CLI arguments in OpenAI server docs (#4709) 2024-05-09 09:53:14 -07:00
ff5abcd746 [ROCm] Add support for Punica kernels on AMD GPUs (#3140)
Co-authored-by: miloice <jeffaw99@hotmail.com>
2024-05-09 09:19:50 -07:00
0ee535b294 [Misc] Set block size at initialization & Fix test_model_runner (#4705) 2024-05-09 09:04:59 -07:00
190bc838e1 [Misc] Remove unnecessary ModelRunner imports (#4703) 2024-05-09 00:17:17 -07:00
f12b20decc [Frontend] Move async logic outside of constructor (#4674) 2024-05-08 22:48:33 -07:00
16bc0a098f [Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-08 22:02:31 -07:00
e288df0632 [Bugfix] Fine-tune gptq_marlin configs to be more similar to marlin (#4626) 2024-05-08 17:14:31 -07:00
8b9241be3a [Speculative decoding] [Bugfix] Fix overallocation in ngram + spec logprobs (#4672) 2024-05-08 23:24:46 +00:00
f942efb5a3 [Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-08 21:44:00 +00:00
89579a201f [Misc] Use vllm-flash-attn instead of flash-attn (#4686) 2024-05-08 13:15:34 -07:00
230c4b38c1 [CI/Test] fix swap test for multi gpu (#4689) 2024-05-08 13:14:02 -07:00
20cfcdec99 [Core][Optimization] change python dict to pytorch tensor for blocks to swap (#4659) 2024-05-08 12:07:05 -07:00
ad932a221d [Core] Faster startup for LoRA enabled models (#4634) 2024-05-08 10:33:18 -07:00
5510cf0e8a [Misc] Add get_name method to attention backends (#4685) 2024-05-08 09:59:31 -07:00
0f9a6e3d22 [Bugfix][Kernel] allow non-power-of-2 for prefix prefill with alibi (#4573) 2024-05-08 09:19:58 -07:00
f6a593093a [CI] Make mistral tests pass (#4596) 2024-05-08 08:44:35 -07:00
d7740ea4dc [Core] Optimize sampler get_logprobs (#4594) 2024-05-08 08:42:28 -07:00
cc466a3290 [Core][Distributed] support cpu&device in broadcast tensor dict (#4660)
[Core][Distributed] support both cpu and device tensor in broadcast tensor dict (#4660)
2024-05-07 19:34:47 -07:00
8344f7742b [Bug fix][Core] fixup ngram not setup correctly (#4551)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Cade Daniel <edacih@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-07 11:40:18 -07:00
469f85c782 [Core][Optimization] change copy-on-write from dict[int, list] to list (#4648) 2024-05-07 11:06:32 -07:00
10760da800 [Bugfix] Fixed error in slice_lora_b for MergedQKVParallelLinearWithLora (#4609) 2024-05-07 10:59:07 -07:00
478aed5827 [Build/CI] Fixing 'docker run' to re-enable AMD CI tests. (#4642) 2024-05-07 09:23:17 -07:00
63575bc2e1 [Core][Optimization] change python dict to pytorch tensor (#4607) 2024-05-06 21:30:27 -07:00
a98187cf72 [Kernel] Make static FP8 scaling more robust (#4570)
Previously FP8 static scaling works if the scales are overestimating the maxima of all activation tensors during computation. However this will not always be the case even if the scales were calibrated very carefully. For example, with the activations in my checkpoint

https://huggingface.co/pcmoritz/Mixtral-8x7B-v0.1-fp8-act-scale

(which was calibrated on https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k), I'm getting the following mostly random performance on MMLU:

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.2295|±  |0.0035|
| - humanities     |N/A    |none  |     5|acc   |0.2421|±  |0.0062|
| - other          |N/A    |none  |     5|acc   |0.2398|±  |0.0076|
| - social_sciences|N/A    |none  |     5|acc   |0.2171|±  |0.0074|
| - stem           |N/A    |none  |     5|acc   |0.2125|±  |0.0073|
With the fix in this PR where the scaled activations are clamped between [-std::numeric_limits<c10::Float8_e4m3fn>::max(), std::numeric_limits<c10::Float8_e4m3fn>::max()] to make sure there are no NaNs, the performance is

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7008|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6453|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7692|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8083|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6115|±  |0.0083|
This is not perfect yet but is getting very close to the FP16 / dynamic activation scale performance.
2024-05-06 17:39:28 -07:00
bd99d22629 Update lm-format-enforcer to 0.10.1 (#4631) 2024-05-06 23:51:59 +00:00
19cb4716ee [CI] Add retry for agent lost (#4633) 2024-05-06 23:18:57 +00:00
e186d37cb1 [CI] use ccache actions properly in release workflow (#4629) 2024-05-06 22:23:36 +00:00
323f27b904 [Bugfix] Fix asyncio.Task not being subscriptable (#4623) 2024-05-06 09:31:05 -07:00
0650e5935b Disable cuda version check in vllm-openai image (#4530) 2024-05-05 16:58:55 -07:00
c7f2cf2b7f [CI] Reduce wheel size by not shipping debug symbols (#4602) 2024-05-04 21:28:58 -07:00
8d8357c8ed bump version to v0.4.2 (#4600) 2024-05-04 17:09:49 -07:00
4302987069 [Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics (#3937) 2024-05-04 15:39:34 -07:00
021b1a2ab7 [CI] check size of the wheels (#4319) 2024-05-04 20:44:36 +00:00
2a052011ca [Kernel] Support MoE Fp8 Checkpoints for Mixtral (Static Weights with Dynamic/Static Activations) (#4527)
Follow on to #4332 to enable FP8 checkpoint loading for Mixtral and supersedes #4436.

This PR enables the following checkpoint loading features for Mixtral:

Supports loading fp8 checkpoints for Mixtral, such as this "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" test model
Supports static or dynamic activation quantization with static weight quantization (all per tensor)
Supports different scales for each expert weight
Supports Fp8 in QKV layer
Notes:

The Expert Gate/Router always runs at half / full precision for now.
If there are different weight scales between QKV layer (for separate QKV weights), they are re-quantized using layer.weight_scale.max() so we can have a single gemm for performance.
2024-05-04 11:45:16 -07:00
36fb68f947 [Doc] Chunked Prefill Documentation (#4580) 2024-05-04 00:18:00 -07:00
bc8ad68455 [Misc][Refactor] Introduce ExecuteModelData (#4540) 2024-05-03 17:47:07 -07:00
344bf7cd2d [Misc] add installation time env vars (#4574) 2024-05-03 15:55:56 -07:00
ab50275111 [Speculative decoding] Support target-model logprobs (#4378) 2024-05-03 15:52:01 -07:00
43c413ec57 [Kernel] Use flashinfer for decoding (#4353)
Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>
2024-05-03 15:51:27 -07:00
f8e7adda21 Fix/async chat serving (#2727) 2024-05-03 11:04:14 -07:00
7e65477e5e [Bugfix] Allow "None" or "" to be passed to CLI for string args that default to None (#4586) 2024-05-03 10:32:21 -07:00
3521ba4f25 [Core][Model runner refactoring 1/N] Refactor attn metadata term (#4518) 2024-05-03 10:20:12 -07:00
2d7bce9cd5 [Doc] add env vars to the doc (#4572) 2024-05-03 05:13:49 +00:00
ce3f1eedf8 [Misc] remove chunk detected debug logs (#4571) 2024-05-03 04:48:08 +00:00
808632d3b4 [BugFix] Prevent the task of _force_log from being garbage collected (#4567) 2024-05-03 01:35:18 +00:00
344a5d0c33 [Core][Distributed] enable allreduce for multiple tp groups (#4566) 2024-05-02 17:32:33 -07:00
0f8a91401c [Core] Ignore infeasible swap requests. (#4557) 2024-05-02 14:31:20 -07:00
9b5c9f9484 [CI/Build] AMD CI pipeline with extended set of tests. (#4267)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-05-02 12:29:07 -07:00
32881f3f31 [kernel] fix sliding window in prefix prefill Triton kernel (#4405)
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
2024-05-02 11:23:37 -07:00
5b8a7c1cb0 [Misc] centralize all usage of environment variables (#4548) 2024-05-02 11:13:25 -07:00
1ff0c73a79 [BugFix] Include target-device specific requirements.txt in sdist (#4559) 2024-05-02 10:52:51 -07:00
5ad60b0cbd [Misc] Exclude the tests directory from being packaged (#4552) 2024-05-02 10:50:25 -07:00
fb087af52e [mypy][7/N] Cover all directories (#4555) 2024-05-02 10:47:41 -07:00
7038e8b803 [Kernel] Support running GPTQ 8-bit models in Marlin (#4533) 2024-05-02 12:56:22 -04:00
2a85f93007 [Core][Distributed] enable multiple tp group (#4512)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-05-02 04:28:21 +00:00
cf8cac8c70 [mypy][6/N] Fix all the core subdirectory typing (#4450)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-02 03:01:00 +00:00
5e401bce17 [CI]Add regression tests to ensure the async engine generates metrics (#4524) 2024-05-01 19:57:12 -07:00
0d62fe58db [Bug fix][Core] assert num_new_tokens == 1 fails when SamplingParams.n is not 1 and max_tokens is large & Add tests for preemption (#4451) 2024-05-01 19:24:13 -07:00
b8afa8b95a [MISC] Rework logger to enable pythonic custom logging configuration to be provided (#4273) 2024-05-01 17:34:40 -07:00
826b82a260 [Misc] Fix expert_ids shape in MoE (#4517) 2024-05-01 23:47:59 +00:00
c9d852d601 [Misc] Remove Mixtral device="cuda" declarations (#4543)
Remove the device="cuda" declarations in mixtral as promised in #4343
2024-05-01 16:30:52 -07:00
6ef09b08f8 [Core][Distributed] fix pynccl del error (#4508) 2024-05-01 15:23:06 -07:00
Roy
3a922c1e7e [Bugfix][Core] Fix and refactor logging stats (#4336) 2024-05-01 20:08:14 +00:00
c47ba4aaa9 [Bugfix] Add validation for seed (#4529) 2024-05-01 19:31:22 +00:00
24bb4fe432 [Kernel] Update fused_moe tuning script for FP8 (#4457)
This PR updates the tuning script for the fused_moe kernel to support FP8 and also adds configurations for TP4. Note that for the configuration I removed num_warps and num_stages for small batch sizes since that improved performance and brought the benchmarks on par with the numbers before in that regime to make sure this is a strict improvement over the status quo.

All the numbers below are for mistralai/Mixtral-8x7B-Instruct-v0.1, 1000 input and 50 output tokens.

Before this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency 
qps = 4: 10.1 ms ITL, 0.52s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 14.0 ms ITL, 0.70s e2e latency
qps = 10: 15.7 ms ITL, 0.79s e2e latency

After this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency
qps = 4: 10.2 ms ITL, 0.53s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 11.9 ms ITL, 0.59s e2e latency
qps = 10: 12.1 ms ITL, 0.61s e2e latency
2024-05-01 11:47:38 -07:00
a657bfc48a [Core] Add multiproc_worker_utils for multiprocessing-based workers (#4357) 2024-05-01 18:41:59 +00:00
24750f4cad [Core] Enable prefix caching with block manager v2 enabled (#4142)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Sage Moore <sagemoore@utexas.edu>
2024-05-01 11:20:32 -07:00
b38e42fbca [Speculative decoding] Add ngram prompt lookup decoding (#4237)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-01 11:13:03 -07:00
8b798eec75 [CI/Build][Bugfix] VLLM_USE_PRECOMPILED should skip compilation (#4534)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-05-01 18:01:50 +00:00
69909126a7 [Bugfix] Use random seed if seed is -1 (#4531) 2024-05-01 10:41:17 -07:00
e491c7e053 [Doc] update(example model): for OpenAI compatible serving (#4503) 2024-05-01 10:14:16 -07:00
4dc8026d86 [Bugfix] Fix 307 Redirect for /metrics (#4523) 2024-05-01 09:14:13 -07:00
a88bb9b032 [Bugfix] Fix the fp8 kv_cache check error that occurs when failing to obtain the CUDA version. (#4173)
Signed-off-by: AnyISalIn <anyisalin@gmail.com>
2024-05-01 09:11:03 -07:00
6f1df80436 [Test] Add ignore_eos test (#4519) 2024-05-01 08:45:42 -04:00
d6f4bd7cdd [Misc]Add customized information for models (#4132) 2024-04-30 21:18:14 -07:00
c3845d82dc Allow user to define whitespace pattern for outlines (#4305) 2024-04-30 20:48:39 -07:00
a822eb3413 [Misc] fix typo in block manager (#4453) 2024-04-30 20:41:32 -07:00
f458112e8a [Misc][Typo] type annotation fix (#4495) 2024-04-30 20:21:39 -07:00
2e240c69a9 [Core] Centralize GPU Worker construction (#4419) 2024-05-01 01:06:34 +00:00
ee37328da0 Unable to find Punica extension issue during source code installation (#4494)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-05-01 00:42:09 +00:00
6ad58f42c5 fix_tokenizer_snapshot_download_bug (#4493) 2024-04-30 16:38:50 -07:00
dd1a50a8bc [Bugfix][Minor] Make ignore_eos effective (#4468) 2024-04-30 16:33:33 -07:00
715c2d854d [Frontend] [Core] Tensorizer: support dynamic num_readers, update version (#4467) 2024-04-30 16:32:13 -07:00
a494140433 [Frontend] Support complex message content for chat completions endpoint (#3467)
Co-authored-by: Lily Liu <lilyliupku@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-04-30 16:28:46 -07:00
111815d482 [Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-04-30 21:46:12 +00:00
b31a1fb63c [Doc] add visualization for multi-stage dockerfile (#4456)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-30 17:41:59 +00:00
4bb53e2dde [BugFix] fix num_lookahead_slots missing in async executor (#4165)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-04-30 10:12:59 -07:00
26f2fb5113 [Core]Refactor gptq_marlin ops (#4466) 2024-04-30 08:14:47 -04:00
fa32207842 [Bugfix][Kernel] Fix compute_type for MoE kernel (#4463) 2024-04-29 22:05:40 -07:00
d627a3d837 [Misc] Upgrade to torch==2.3.0 (#4454) 2024-04-29 20:05:47 -04:00
f4f921b7f1 [Core][Distributed] use cpu group to broadcast metadata in cpu (#4444) 2024-04-29 13:52:22 -07:00
ac5ccf0156 [CI] hotfix: soft fail neuron test (#4458) 2024-04-29 19:50:01 +00:00
73c8d677e5 [Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922)
Co-authored-by: alexm <alexm@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-04-29 09:35:34 -07:00
df29793dc7 [mypy][5/N] Support all typing on model executor (#4427) 2024-04-28 19:01:26 -07:00
03dd7d52bf [CI] clean docker cache for neuron (#4441) 2024-04-28 23:32:07 +00:00
bf480c5302 Add more Prometheus metrics (#2764)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-04-28 15:59:33 -07:00
9c7306ac11 [Misc] fix typo in llm_engine init logging (#4428) 2024-04-28 18:58:30 +08:00
4ea1f9678d [BugFix] Resolved Issues For LinearMethod --> QuantConfig (#4418) 2024-04-27 18:35:33 +00:00
ba4be44c32 [BugFix] Fix return type of executor execute_model methods (#4402) 2024-04-27 11:17:45 -07:00
d6e520e170 [Core] Support offline use of local cache for models (#4374)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Travis Johnson <tjohnson31415@gmail.com>
2024-04-27 09:59:55 -07:00
81661da7b2 [BugFix] Fix min_tokens when eos_token_id is None (#4389)
Co-authored-by: DefTruth <31974251+deftruth@users.noreply.github.com>
2024-04-27 09:52:46 -07:00
dfea173148 [Bugfix] Abort requests when the connection to /v1/completions is interrupted (#4363) 2024-04-27 09:48:37 -07:00
Roy
7134303cbb [Bugfix][Core] Fix get decoding config from ray (#4335) 2024-04-27 11:30:08 +00:00
3da24c2df7 [Model] Phi-3 4k sliding window temp. fix (#4380) 2024-04-27 18:08:15 +08:00
eefeb16464 [Kernel] Full Tensor Parallelism for LoRA Layers (#3524)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-04-27 00:03:48 -07:00
18d23f642a [ROCm][Hardware][AMD] Enable group query attention for triton FA (#4406) 2024-04-26 23:37:40 -07:00
Roy
87f545ba6f [Misc] Fix logger format typo (#4396) 2024-04-27 13:45:02 +08:00
8947bc3c15 [Frontend][Bugfix] Disallow extra fields in OpenAI API (#4355) 2024-04-27 05:08:24 +00:00
12628d3c78 [Kernel] Optimize FP8 support for MoE kernel / Mixtral via static scales (#4343)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-27 04:49:59 +00:00
258a2c58d0 [Core] Introduce DistributedGPUExecutor abstract class (#4348) 2024-04-27 04:14:26 +00:00
aba47be3fe [Misc] add RFC issue template (#4401)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-26 15:47:45 -07:00
a62aaf1df5 [Misc][Refactor] Generalize linear_method to be quant_method (#4373) 2024-04-26 16:41:14 -04:00
603ad84815 [Core] Refactoring sampler and support prompt logprob for chunked prefill (#4309) 2024-04-26 13:02:02 +00:00
a88081bf76 [CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>
2024-04-26 00:16:58 -07:00
2f30e7c72f [Frontend] Add --log-level option to api server (#4377) 2024-04-26 05:36:01 +00:00
a74dee9b62 [Bugfix] Fix parameter name in get_tokenizer (#4107) 2024-04-25 19:10:48 -07:00
cf29b7eda4 [ROCm][Hardware][AMD][Doc] Documentation update for ROCm (#4376)
Co-authored-by: WoosukKwon <woosuk.kwon@berkeley.edu>
2024-04-25 18:12:25 -07:00
efffb63f58 [Core] Move function tracing setup to util function (#4352) 2024-04-25 16:45:12 -07:00
15e7c675b0 [Core] Add shutdown() method to ExecutorBase (#4349) 2024-04-25 16:32:48 -07:00
Roy
b6dcb4d442 [Misc] Fix flash attention backend log (#4368) 2024-04-25 12:43:32 -07:00
b5b4a398a7 [Mypy] Typing lora folder (#4337) 2024-04-25 19:13:50 +00:00
f4bc4de1b1 [Core]refactor aqlm quant ops (#4351) 2024-04-25 15:03:56 -04:00
bd7a8eef25 [Doc] README Phi-3 name fix. (#4372)
Co-authored-by: Caio Mendes <caiocesart@microsoft.com>
2024-04-25 10:32:00 -07:00
7ee82bef1e [CI/Build] Adding functionality to reset the node's GPUs before processing. (#4213) 2024-04-25 09:37:20 -07:00
fbf152d976 [Bugfix][Model] Refactor OLMo model to support new HF format in transformers 4.40.0 (#4324)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-25 09:35:56 -07:00
479d69fad0 [Core] Move ray_utils.py from engine to executor package (#4347) 2024-04-25 06:52:22 +00:00
96e90fdeb3 [Model] Adds Phi-3 support (#4298) 2024-04-25 03:06:57 +00:00
a395a638c2 [Misc] Use public API in benchmark_throughput (#4300) 2024-04-24 21:10:24 +00:00
2768884ac4 [Doc] Add note for docker user (#4340)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-24 21:09:44 +00:00
aae08249ac [Bugfix] Fix marlin kernel crash on H100 (#4218)
This PR addresses the Marlin kernel H100 crash that was reported here: neuralmagic#187.
The reason for the crash was the inline PTX assembly that introduced the async_copy with streaming behavior. The solution is to use the more standard PTX for async_copy (without the fractional L2 policy for "evict_first"). There is no performance difference between standard async_copy PTX and the previous one.
2024-04-24 10:35:01 -07:00
7923dcad12 [Misc] Update ShareGPT Dataset Sampling in Serving Benchmark (#4279) 2024-04-24 09:49:13 -07:00
3cd9b5bb2d [Core][Distributed] use existing torch.cuda.device (#4318)
[Core][Distributed] use existing torch.cuda.device context manager (#4318)
2024-04-24 09:00:20 -07:00
468d761b32 [Misc] Reduce supported Punica dtypes (#4304) 2024-04-23 18:54:33 -07:00
e4bf860a54 [CI][Build] change pynvml to nvidia-ml-py (#4302) 2024-04-23 18:33:12 -07:00
91f50a6fe2 [Core][Distributed] use cpu/gloo to initialize pynccl (#4248) 2024-04-23 18:32:19 -07:00
79a268c4ab [BUG] fixed fp8 conflict with aqlm (#4307)
Fixes fp8 iterface which broke in AQLM merge.
2024-04-23 18:26:33 -07:00
eace8bf0b9 [Kernel] FP8 support for MoE kernel / Mixtral (#4244)
This PR is the first step towards fixing https://github.com/vllm-project/vllm/pull/3208

It implements dynamic per-tensor scaling (see https://github.com/vllm-project/vllm/pull/4118), so users do not need to compute activation scales on a calibration dataset and they also don't need to convert their model checkpoints. It is enough to specify the `quantization="fp8"` argument. You can try out the PR like this:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1", tensor_parallel_size=2, quantization="fp8")

outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

**Performance**: For this PR, the focus is on making the code clean (while still trying to get reasonable performance), there is a bunch of optimizations that we will submit as a follow up PR that significantly improve the performance (similar to the numbers in https://github.com/vllm-project/vllm/pull/3954). With this PR, the results are as follows:

<img width="725" alt="Screenshot 2024-04-21 at 1 31 50 PM" src="https://github.com/vllm-project/vllm/assets/113316/d8fe1118-07a0-4d4e-8530-37a77d465a03">


**Accuracy**: The accuracy with this PR on MMLU on `mistralai/Mixtral-8x7B-v0.1` is as follows:

```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7018|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6472|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7673|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8099|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6131|±  |0.0083|
```
this compares favorably with the fp16 results which are
```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7020|±  |0.1313|
| - humanities     |N/A    |none  |     5|acc   |0.6425|±  |0.1349|
| - other          |N/A    |none  |     5|acc   |0.7744|±  |0.1038|
| - social_sciences|N/A    |none  |     5|acc   |0.8131|±  |0.0695|
| - stem           |N/A    |none  |     5|acc   |0.6108|±  |0.1383|
```

Happy hacking!
2024-04-24 01:18:23 +00:00
1e8f4252aa [Bugfix][Frontend] Raise exception when file-like chat template fails to be opened (#4292) 2024-04-23 18:19:03 +00:00
2b7949c1c2 AQLM CUDA support (#3287)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-04-23 13:59:33 -04:00
62b5166bd4 [CI] Add ccache for wheel builds job (#4281) 2024-04-23 09:51:41 -07:00
d86285a4a4 [Core][Logging] Add last frame information for better debugging (#4278) 2024-04-23 09:45:52 -07:00
d87f39e9a9 [Bugfix] Add init_cached_hf_modules to RayWorkerWrapper (#4286) 2024-04-23 09:28:35 -07:00
d3c8180ac4 [Bugfix] Fixing max token error message for openai compatible server (#4016) 2024-04-23 19:06:29 +08:00
62b8aebc6f [Speculative decoding 7/9] Speculative decoding end-to-end correctness tests. (#3951) 2024-04-23 08:02:36 +00:00
050f285ff6 [Core] Scheduling optimization 2 (#4280) 2024-04-23 08:02:11 +00:00
8f2ea22bde [Core] Some simplification of WorkerWrapper changes (#4183) 2024-04-23 07:49:08 +00:00
0ae11f78ab [Mypy] Part 3 fix typing for nested directories for most of directory (#4161) 2024-04-22 21:32:44 -07:00
34128a697e Fix autodoc directives (#4272)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-23 01:53:01 +00:00
c1b4e4157c [Core][Distributed] use absolute path for library file (#4271) 2024-04-22 17:21:48 -07:00
ceaf4ed003 [Doc] Update the SkyPilot doc with serving and Llama-3 (#4276) 2024-04-22 15:34:31 -07:00
ad8d696a99 [Core] Scheduler perf fix (#4270) 2024-04-22 21:11:06 +00:00
3d925165f2 Add example scripts to documentation (#4225)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-22 16:36:54 +00:00
1543680691 [Bugfix] Ensure download_weights_from_hf(..) inside loader is using the revision parameter (#4217) 2024-04-22 09:10:48 -07:00
077f0a2e8a [Frontend] Enable support for CPU backend in AsyncLLMEngine. (#3993)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-04-22 09:19:51 +00:00
e73ed0f1c6 [Bugfix] Fix type annotations in CPU model runner (#4256) 2024-04-22 00:54:16 -07:00
296cdf8ac7 [Misc] Add vision language model support to CPU backend (#3968) 2024-04-22 00:44:16 -07:00
747b1a7147 [Core][Distributed] fix _is_full_nvlink detection (#4233) 2024-04-21 23:04:16 -07:00
95e5b087cf [AMD][Hardware][Misc][Bugfix] xformer cleanup and light navi logic and CI fixes and refactoring (#4129) 2024-04-21 21:57:24 -07:00
a37d815b83 Make initialization of tokenizer and detokenizer optional (#3748)
Co-authored-by: Yun Ding <yunding@nvidia.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-21 22:06:46 +00:00
7f2593b164 [Doc]: Update the doc of adding new models (#4236) 2024-04-21 09:57:08 -07:00
fe7d648fe5 Don't show default value for flags in EngineArgs (#4223)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-21 09:15:28 -07:00
cc74b2b232 Updating lm-format-enforcer version and adding links to decoding libraries in docs (#4222) 2024-04-20 08:33:16 +00:00
91528575ec [Frontend] multiple sampling params support (#3570) 2024-04-20 00:11:57 -07:00
a22cdea371 [Kernel][FP8] Initial support with dynamic per-tensor scaling (#4118)
Provide an initial support to FP8 computation. This PR is inspired by HuggingFace TGI: huggingface/text-generation-inference#1726

This feature can be enabled with --quantization fp8 or -q fp8 when launching an engine.

Algorithm:
We still load a model checkpoint in FP16/BF16. After the weights are loaded, Fp8LinearMethod calculates the per-tensor scaling factor of weights and quantizes the weights accordingly. The scaling factor will then be stored for future use. Meanwhile, the per-tensor scaling factor for activations is calculated in every forward pass.

Initial Results:
Currently tested Mistral-7B on 1xH100. With prompt length ~5 and decoding length 128:

BF16: 1.47s
FP8: 1.66s
I'll try to use larger models and try to find more performance bottleneck. Meanwhile, you're welcome to try this code.
2024-04-20 04:28:57 +00:00
682789d402 Fix missing docs and out of sync EngineArgs (#4219)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-19 20:51:33 -07:00
138485a82d [Bugfix] Add fix for JSON whitespace (#4189)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-13-147.ec2.internal>
2024-04-19 20:49:22 -07:00
bc9df1571b Pass tokenizer_revision when getting tokenizer in openai serving (#4214) 2024-04-19 17:13:56 -07:00
15b86408a8 [Misc] add nccl in collect env (#4211) 2024-04-19 19:44:51 +00:00
7be4f5628f [Bugfix][Core] Restore logging of stats in the async engine (#4150) 2024-04-19 08:08:26 -07:00
8f20fc04bf [Misc] fix docstrings (#4191)
Co-authored-by: Zhong Wang <wangzhong@infini-ai.com>
2024-04-19 08:18:33 +00:00
221d93ecbf Bump version of 0.4.1 (#4177) 2024-04-19 01:00:22 -07:00
d17c8477f1 [Bugfix] Fix LoRA loading check (#4138)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-04-19 00:59:54 -07:00
a134ef6f5e Support eos_token_id from generation_config.json (#4182) 2024-04-19 04:13:36 +00:00
8a7a3e4436 [Core] add an option to log every function call to for debugging hang/crash in distributed inference (#4079)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-18 16:15:12 -07:00
8f9c28fd40 [Bugfix] Fix CustomAllreduce nvlink topology detection (#3974)
[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
2024-04-18 15:32:47 -07:00
cd2f63fb36 [CI/CD] add neuron docker and ci test scripts (#3571) 2024-04-18 15:26:01 -07:00
87fa80c91f [Misc] Bump transformers to latest version (#4176) 2024-04-18 14:36:39 -07:00
e1bb2fd52d [Bugfix] Support logprobs when using guided_json and other constrained decoding fields (#4149) 2024-04-18 21:12:55 +00:00
705578ae14 [Docs] document that Meta Llama 3 is supported (#4175) 2024-04-18 10:55:48 -07:00
e8cc7967ff [Bugfix][Kernel] allow non-power-of-two head sizes in prefix prefill (#4128) 2024-04-18 00:51:28 -07:00
53b018edcb [Bugfix] Get available quantization methods from quantization registry (#4098) 2024-04-18 00:21:55 -07:00
66ded03067 Allow model to be served under multiple names (#2894)
Co-authored-by: Alexandre Payot <alexandrep@graphcore.ai>
2024-04-18 00:16:26 -07:00
6dc1fc9cfe [Core] nccl integrity check and test (#4155)
[Core] Add integrity check during initialization; add test for it (#4155)
2024-04-17 22:28:52 -07:00
533d2a1f39 [Typing] Mypy typing part 2 (#4043)
Co-authored-by: SangBin Cho <sangcho@sangcho-LT93GQWG9C.local>
2024-04-17 17:28:43 -07:00
a53222544c [Kernel] Add punica dimension for Swallow-MS-7B LoRA (#4134) 2024-04-17 10:02:45 -07:00
fe3b5bbc23 [Bugfix] fix output parsing error for trtllm backend (#4137)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-17 11:07:23 +00:00
8438e0569e [Core] RayWorkerVllm --> WorkerWrapper to reduce duplication (#4024)
[Core] replace narrow-usage RayWorkerVllm to general WorkerWrapper to reduce code duplication (#4024)
2024-04-17 08:34:33 +00:00
11d652bd4f [CI] Move CPU/AMD tests to after wait (#4123) 2024-04-16 22:53:26 -07:00
d150e4f89f [Misc] [CI] Fix CI failure caught after merge (#4126) 2024-04-16 17:56:01 -07:00
e95cd87959 [Speculative decoding 6/9] Integrate speculative decoding with LLMEngine (#3894) 2024-04-16 13:09:21 -07:00
69e1d2fb69 [Core] Refactor model loading code (#4097) 2024-04-16 11:34:39 -07:00
05434764cd LM Format Enforcer Guided Decoding Support (#3868)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-16 05:54:57 +00:00
4e7ee664e2 [Core] Fix engine-use-ray broken (#4105) 2024-04-16 05:24:53 +00:00
37e84a403d [Typing] Fix Sequence type GenericAlias only available after Python 3.9. (#4092) 2024-04-15 14:47:31 -07:00
4695397dcf [Bugfix] Fix ray workers profiling with nsight (#4095) 2024-04-15 14:24:45 -07:00
d619ae2d19 [Doc] Add better clarity for tensorizer usage (#4090)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-15 13:28:25 -07:00
eb46fbfda2 [Core] Simplifications to executor classes (#4071) 2024-04-15 13:05:09 -07:00
0003e9154b [Misc][Minor] Fix CPU block num log in CPUExecutor. (#4088) 2024-04-15 08:35:55 -07:00
e11e200736 [Bugfix] Fix filelock version requirement (#4075) 2024-04-14 21:50:08 -07:00
Roy
8db1bf32f8 [Misc] Upgrade triton to 2.2.0 (#4061) 2024-04-14 17:43:54 -07:00
aceb17cf2d [Docs] document that mixtral 8x22b is supported (#4073) 2024-04-14 14:35:55 -07:00
563c54f760 [BugFix] Fix tensorizer extra in setup.py (#4072) 2024-04-14 14:12:42 -07:00
2cd6b4f362 [Core] avoid too many cuda context by caching p2p test (#4021) 2024-04-13 23:40:21 -07:00
711a000255 [Frontend] [Core] feat: Add model loading using tensorizer (#3476) 2024-04-13 17:13:01 -07:00
989ae2538d [Kernel] Add punica dimension for Baichuan-13B (#4053) 2024-04-13 07:55:05 -07:00
0a430b4ae2 [Bugfix] fix_small_bug_in_neuron_executor (#4051) 2024-04-13 07:54:03 -07:00
ec8e3c695f [Bugfix] fix_log_time_in_metrics (#4050) 2024-04-13 07:52:36 -07:00
98afde19fc [Core][Distributed] improve logging for init dist (#4042) 2024-04-13 07:12:53 -07:00
5c2e66e487 [Bugfix] More type hint fixes for py 3.8 (#4039) 2024-04-12 21:07:04 -07:00
546e721168 [CI/Test] expand ruff and yapf for all supported python version (#4037) 2024-04-13 01:43:37 +00:00
b8aacac31a [Bugfix] Fix LoRA bug (#4032) 2024-04-12 16:56:37 -07:00
d04973ad54 Fix triton compilation issue (#3984)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-12 16:41:26 -07:00
fbb9d9eef4 [Core] fix custom allreduce default value (#4040) 2024-04-12 16:40:39 -07:00
09473ee41c [mypy] Add mypy type annotation part 1 (#4006) 2024-04-12 14:35:50 -07:00
d4ec9ffb95 [Misc] Fix typo in scheduler.py (#4022) 2024-04-12 13:56:04 -07:00
96b6a6d790 [Bugfix] fix type hint for py 3.8 (#4036) 2024-04-12 19:35:44 +00:00
36729bac13 [Test] Test multiple attn backend for chunked prefill. (#4023) 2024-04-12 09:56:57 -07:00
7fd3949a0b [Frontend][Core] Move merge_async_iterators to utils (#4026) 2024-04-12 05:30:54 +00:00
1096717ae9 [Core] Support LoRA on quantized models (#4012) 2024-04-11 21:02:44 -07:00
c2b4a1bce9 [Doc] Add typing hints / mypy types cleanup (#3816)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-11 17:17:21 -07:00
e46a60aa4c [BugFix] Fix handling of stop strings and stop token ids (#3672) 2024-04-11 15:34:12 -07:00
1e96c3341a Add extra punica sizes to support bigger vocabs (#4015) 2024-04-11 22:18:57 +00:00
95e7d4a97c Fix echo/logprob OpenAI completion bug (#3441)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
2024-04-11 22:15:50 +00:00
559eb852f8 [Core] init_distributed_environment align with init_process_group(#4014)
[Core][Distributed] make init_distributed_environment compatible with init_process_group (#4014)
2024-04-11 14:00:48 -07:00
a10d3056da [Core] Set linear_weights directly on the layer (#3977) 2024-04-11 16:35:51 -04:00
8afca50889 [Hardware][Intel] Isolate CPUModelRunner and ModelRunner for better maintenance (#3824) 2024-04-11 11:56:49 -07:00
08ccee1e83 punica fix-bgmv-kernel-640 (#4007) 2024-04-11 08:59:26 -07:00
c1dc547129 [Kernel] Fused MoE Config for Mixtral 8x22 (#4002) 2024-04-11 07:50:00 -07:00
f3d0bf7589 [Doc][Installation] delete python setup.py develop (#3989) 2024-04-11 03:33:02 +00:00
e9da5a40c6 [Misc] Add indirection layer for custom ops (#3913) 2024-04-10 20:26:07 -07:00
e42df7227d [Test] Add xformer and flash attn tests (#3961)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-11 03:09:50 +00:00
caada5e50a [Core][Model] torch.compile for layernorm in commandr (#3985)
[Core][Model] Use torch.compile to accelerate layernorm in commandr (#3985)
2024-04-11 01:48:26 +00:00
67b4221a61 [Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-10 17:56:48 -07:00
63e7176f26 [Core][Refactor] move parallel_utils into vllm/distributed (#3950)
[WIP][Core][Refactor] move vllm/model_executor/parallel_utils into vllm/distributed and vllm/device_communicators (#3950)
2024-04-10 15:33:30 -07:00
934d3662f7 [Bugfix] handle hf_config with architectures == None (#3982)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-10 22:28:25 +00:00
92cd2e2f21 [Doc] Fix getting stared to use publicly available model (#3963) 2024-04-10 18:05:52 +00:00
e4c4072c94 [Bugfix] Remove key sorting for guided_json parameter in OpenAi compatible Server (#3945) 2024-04-10 10:15:51 -07:00
e35397468f [Doc] Add doc to state our model support policy (#3948)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-10 17:03:02 +00:00
8b317c6dd0 [Model][AMD] ROCm support for 256 head dims for Gemma (#3972) 2024-04-10 08:12:00 -07:00
bd3c144e0b [Bugfix][ROCm] Add numba to Dockerfile.rocm (#3962) 2024-04-10 07:37:17 -07:00
0258b7a94b [Bugfix] handle prompt_logprobs in _apply_min_tokens_penalty (#3876)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-04-10 01:39:56 -07:00
b3104b2a10 [Bugfix] Fix logits processor when prompt_logprobs is not None (#3899) 2024-04-10 00:09:36 -07:00
c2e00af523 [Bugfix] fix utils.py/merge_dict func TypeError: 'type' object is not subscriptable (#3955)
Co-authored-by: tianyi_zhao <tianyi.zhao@transwarp.io>
2024-04-10 04:49:11 +00:00
c013d32c75 [Benchmark] Add cpu options to bench scripts (#3915) 2024-04-09 21:30:03 -07:00
11dd6ebb89 [Misc] Avoid loading incorrect LoRA config (#3777) 2024-04-09 19:47:15 -07:00
6c0b04515f [ROCm][Hardware][AMD] Use Triton Kernel for default FA on ROCm (#3643)
Co-authored-by: jpvillam <jpvillam@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-09 15:10:47 -07:00
e23a43aef8 [Bugfix] Fix KeyError on loading GPT-NeoX (#3925) 2024-04-09 12:11:31 -07:00
e7c7067b45 [Misc] [Core] Implement RFC "Augment BaseExecutor interfaces to enable hardware-agnostic speculative decoding" (#3837) 2024-04-09 11:44:15 -07:00
6d592eb430 [Core] separate distributed_init from worker (#3904) 2024-04-09 08:49:02 +00:00
Roy
d036198e23 [BugFix][Model] Fix commandr RoPE max_position_embeddings (#3919) 2024-04-09 06:17:21 +08:00
59a6abf3c9 [Hotfix][CI/Build][Kernel] CUDA 11.8 does not support layernorm optimizations (#3782) 2024-04-08 14:31:02 -07:00
bc0c0192d1 [Bugfix] Enable Proper attention_bias Usage in Llama Model Configuration (#3767)
Co-authored-by: roy <jasonailu87@gmail.com>
2024-04-08 19:42:35 +00:00
f46864d68d [Bugfix] Added Command-R GPTQ support (#3849)
Co-authored-by: Egor Tolmachev <t333ga@gmail.com>
2024-04-08 14:59:38 +00:00
b4543c8f6b [Model] add minicpm (#3893) 2024-04-08 18:28:36 +08:00
0ce0539d47 [Bugfix] Fix Llava inference with Tensor Parallelism. (#3883) 2024-04-07 22:54:13 +08:00
2f19283549 [Core] latency optimization (#3890) 2024-04-06 19:14:06 -07:00
95baec828f [Core] enable out-of-tree model register (#3871) 2024-04-06 17:11:41 -07:00
e4be7d70bb [CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 21:32:30 +00:00
54951ac4bf [Bugfix] Fix incorrect output on OLMo models in Tensor Parallelism (#3869) 2024-04-05 12:02:09 -07:00
18de883489 [Chunked Prefill][4/n] Chunked prefill scheduler. (#3853) 2024-04-05 10:17:58 -07:00
1d7c940d74 Add option to completion API to truncate prompt tokens (#3144) 2024-04-05 10:15:42 -07:00
cfaf49a167 [Misc] Define common requirements (#3841) 2024-04-05 00:39:17 -07:00
9edec652e2 [Bugfix] Fixing requirements.txt (#3865) 2024-04-04 23:46:01 -07:00
e0dd4d3589 [Misc] Fix linter issues in examples/fp8/quantizer/quantize.py (#3864) 2024-04-04 21:57:33 -07:00
e5043a3e75 [Misc] Add pytest marker to opt-out of global test cleanup (#3863) 2024-04-04 21:54:16 -07:00
d03d64fd2e [CI/Build] refactor dockerfile & fix pip cache
[CI/Build] fix pip cache with vllm_nccl & refactor dockerfile to build wheels (#3859)
2024-04-04 21:53:16 -07:00
78107fa091 [Doc]Add asynchronous engine arguments to documentation. (#3810)
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-04 21:52:01 -07:00
c391e4b68e [Core] improve robustness of pynccl (#3860) 2024-04-04 16:52:12 -07:00
9117f892f0 [Model] Cohere CommandR+ (#3829) 2024-04-04 13:31:49 -07:00
db2a6a41e2 [Hardware][CPU] Update cpu torch to match default of 2.2.1 (#3854) 2024-04-04 19:49:49 +00:00
ca81ff5196 [Core] manage nccl via a pypi package & upgrade to pt 2.2.1 (#3805) 2024-04-04 10:26:19 -07:00
b7782002e1 [Benchmark] Refactor sample_requests in benchmark_throughput (#3613)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-04 09:56:22 +00:00
819a309c0f [Bugfix] Fix args in benchmark_serving (#3836)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-04 07:41:05 +00:00
aabe8f40f2 [Core] [Frontend] Make detokenization optional (#3749)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-04-03 21:52:18 -07:00
498eb5cfa3 [Bugfix] Add kv_scale input parameter to CPU backend (#3840) 2024-04-04 04:33:08 +00:00
537ee25f43 [Core] Enable hf_transfer by default if available (#3817) 2024-04-04 04:02:43 +00:00
294f8f6665 [BugFix] Pass tokenizer_config to local_tokenizer_group (#3754)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-04-03 20:31:46 -07:00
b95047f2da [Misc] Publish 3rd meetup slides (#3835) 2024-04-03 15:46:10 -07:00
2ff767b513 Enable scaled FP8 (e4m3fn) KV cache on ROCm (AMD GPU) (#3290)
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: HaiShaw <hixiao@gmail.com>
Co-authored-by: AdrianAbeyta <Adrian.Abeyta@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: root <root@gt-pla-u18-08.pla.dcgpu>
Co-authored-by: mawong-amd <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: ttbachyinsda <ttbachyinsda@outlook.com>
Co-authored-by: guofangze <guofangze@kuaishou.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-03 14:15:55 -07:00
3dcb3e8b98 [3/N] Refactor scheduler for chunked prefill scheduling (#3550) 2024-04-03 14:13:49 -07:00
c64cf38673 [Doc] Update contribution guidelines for better onboarding (#3819) 2024-04-03 07:31:43 +00:00
76b889bf1d [Doc] Update README.md (#3806) 2024-04-02 23:11:10 -07:00
c9b506dad4 [BugFix] Use different mechanism to get vllm version in is_cpu() (#3804) 2024-04-02 23:06:25 -07:00
5757d90e26 [Speculative decoding] Adding configuration object for speculative decoding (#3706)
Co-authored-by: Lily Liu <lilyliupku@gmail.com>
2024-04-03 00:40:57 +00:00
a3c226e7eb [CI/Build] 0.4.0.post1, fix sm 7.0/7.5 binary (#3803) 2024-04-02 12:57:04 -07:00
b321d4881b [Bugfix] Add __init__.py files for vllm/core/block/ and vllm/spec_decode/ (#3798) 2024-04-02 12:35:31 -07:00
ad6eca408b Fix early CUDA init via get_architecture_class_name import (#3770)
Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-04-02 11:56:26 -07:00
205b94942e [CI/Build] fix TORCH_CUDA_ARCH_LIST in wheel build (#3801) 2024-04-02 11:54:33 -07:00
3bec41f41a [Doc] Fix vLLMEngine Doc Page (#3791) 2024-04-02 09:49:37 -07:00
0739b1947f [Frontend][Bugfix] allow using the default middleware with a root path (#3788)
Co-authored-by: A-Mahla <>
2024-04-02 01:20:28 -07:00
77a6572aa5 [HotFix] [CI/Build] Minor fix for CPU backend CI (#3787) 2024-04-01 22:50:53 -07:00
0e3f06fe9c [Hardware][Intel] Add CPU inference backend (#3634)
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Yuan Zhou <yuan.zhou@intel.com>
2024-04-01 22:07:30 -07:00
eb69d68804 [Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-02 00:49:51 +00:00
7d4e1b85e7 [Misc] Add support for new autogptq checkpoint_format (#3689)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-04-01 19:32:01 -04:00
93deb0b38f [Speculative decoding 4/9] Lookahead scheduling for speculative decoding (#3250) 2024-04-01 22:55:24 +00:00
ccb58b23e6 [Misc] Fix Benchmark TTFT Calculation for Chat Completions (#3768) 2024-04-01 15:24:30 -07:00
49782fcb76 [Misc] Some minor simplifications to detokenization logic (#3670)
Some simplifications made for clarity.

Also moves detokenization-related functions from tokenizer.py to detokenizer.py.
2024-04-01 13:22:06 -07:00
f03cc667a0 [Misc] Minor fixes in requirements.txt (#3769) 2024-04-01 10:15:48 +00:00
563c1d7ec5 [CI/Build] Make Marlin Tests Green (#3753) 2024-03-30 19:18:34 -07:00
9c82a1bec3 [Doc] Update installation doc (#3746)
[Doc] Update installation doc for build from source and explain the dependency on torch/cuda version (#3746)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-03-30 16:34:38 -07:00
b6d103542c [Kernel] Layernorm performance optimization (#3662) 2024-03-30 14:26:38 -07:00
51c31bc10c CMake build elf without PTX (#3739) 2024-03-30 01:53:08 +00:00
3ad438c66f Fix build when nvtools is missing (#3698) 2024-03-29 18:52:39 -07:00
203d4f82ac [Core][Bugfix] cache len of tokenizer (#3741) 2024-03-29 18:46:39 -07:00
991143cfcd [BugFix] Use consistent logger everywhere (#3738) 2024-03-29 23:26:44 +00:00
8b2d3cbc1b usage lib get version another way (#3735) 2024-03-29 15:57:08 -07:00
9765b5c406 [ROCm][Bugfix] Fixed several bugs related to rccl path and attention selector logic (#3699) 2024-03-29 14:52:36 -07:00
430530fc18 bump version to v0.4.0 (#3712) 2024-03-29 12:28:33 -07:00
97356f3c7e [Bugfix] Command-R Max Model Length (#3727) 2024-03-29 12:27:51 -07:00
Roy
f510395bbf [BugFix][Frontend] Fix completion logprobs=0 error (#3731) 2024-03-29 09:38:21 -07:00
Roy
6110c39dc8 [BugFix] Fix tokenizer out of vocab size (#3685) 2024-03-29 08:18:59 -07:00
d8658c8cc1 Usage Stats Collection (#2852) 2024-03-28 22:16:12 -07:00
7bc94a0fdd add ccache to docker build image (#3704) 2024-03-28 22:14:24 -07:00
756b30a5f3 [Core][Test] move local_rank to the last arg with default value(#3711)
[Core][Test] move local_rank to the last arg with default value to keep api compatible (#3711)
2024-03-28 21:19:45 -07:00
395aa823ea [Misc] Minor type annotation fix (#3716) 2024-03-28 21:12:24 -07:00
26422e477b [Test] Make model tests run again and remove --forked from pytest (#3631)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-03-28 21:06:40 -07:00
f342153b48 Revert "bump version to v0.4.0" (#3708) 2024-03-28 18:49:42 -07:00
27a57cad52 bump version to v0.4.0 (#3705) 2024-03-28 18:26:51 -07:00
98a42e7078 [Benchmark] Change mii to use persistent deployment and support tensor parallel (#3628) 2024-03-28 17:33:52 -07:00
0267fef52a [Core] fix del of communicator (#3702) 2024-03-29 00:24:58 +00:00
4716a32dd4 fix logging msg for block manager (#3701) 2024-03-28 23:29:55 +00:00
c0935c96d3 [Bugfix] Set enable_prefix_caching=True in prefix caching example (#3703) 2024-03-28 16:26:30 -07:00
cb40b3ab6b [Kernel] Add MoE Triton kernel configs for A100 40GB (#3700) 2024-03-28 15:26:24 -07:00
Roy
515386ef3c [Core] Support multi-node inference(eager and cuda graph) (#3686) 2024-03-28 15:01:55 -07:00
a4075cba4d [CI] Add test case to run examples scripts (#3638) 2024-03-28 14:36:10 -07:00
96aa014d1e fix benchmark format reporting in buildkite (#3693) 2024-03-28 14:35:16 -07:00
1715056fef [Bugfix] Update neuron_executor.py to add optional vision_language_config (#3695) 2024-03-28 10:43:34 -07:00
b51c1cc9d2 [2/N] Chunked prefill data update (#3538) 2024-03-28 10:06:01 -07:00
ce567a2926 [Kernel] DBRX Triton MoE kernel H100 (#3692) 2024-03-28 10:05:34 -07:00
d6ea427f04 [Model] Add support for Qwen2MoeModel (#3346) 2024-03-28 15:19:59 +00:00
14ccd94c89 [Core][Bugfix]Refactor block manager for better testability (#3492) 2024-03-27 23:59:28 -07:00
8267b06c30 [Kernel] Add Triton MoE kernel configs for DBRX on A100 (#3679) 2024-03-27 22:22:25 -07:00
3492859b68 [CI/Build] update default number of jobs and nvcc threads to avoid overloading the system (#3675) 2024-03-28 00:18:54 -04:00
098e1776ba [Model] Add support for xverse (#3610)
Co-authored-by: willhe <hexin@xverse.cn>
Co-authored-by: root <root@localhost.localdomain>
2024-03-27 18:12:54 -07:00
Roy
10e6322283 [Model] Fix and clean commandr (#3671) 2024-03-28 00:20:00 +00:00
6d9aa00fc4 [Docs] Add Command-R to supported models (#3669) 2024-03-27 15:20:00 -07:00
1182607e18 Add support for Cohere's Command-R model (#3433)
Co-authored-by: José Maria Pombal <jose.pombal@unbabel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2024-03-27 14:19:32 -07:00
45b6ef6513 feat(benchmarks): Add Prefix Caching Benchmark to Serving Benchmark (#3277) 2024-03-27 13:39:26 -07:00
1956931436 [Misc] add the "download-dir" option to the latency/throughput benchmarks (#3621) 2024-03-27 13:39:05 -07:00
e24336b5a7 [Model] Add support for DBRX (#3660) 2024-03-27 13:01:46 -07:00
d18f4e73f3 [Bugfix] [Hotfix] fix nccl library name (#3661) 2024-03-27 17:23:54 +00:00
82c540bebf [Bugfix] More faithful implementation of Gemma (#3653) 2024-03-27 09:37:18 -07:00
8f44facddd [Core] remove cupy dependency (#3625) 2024-03-27 00:33:26 -07:00
e66b629c04 [Misc] Minor fix in KVCache type (#3652) 2024-03-26 23:14:06 -07:00
76879342a3 [Doc]add lora support (#3649) 2024-03-27 02:06:46 +00:00
566b57c5c4 [Kernel] support non-zero cuda devices in punica kernels (#3636) 2024-03-27 00:37:42 +00:00
0dc72273b8 [BugFix] Fix ipv4 address parsing regression (#3645) 2024-03-26 14:39:44 -07:00
a979d9771e [Bugfix] Fix ipv6 address parsing bug (#3641) 2024-03-26 11:58:20 -07:00
8af890a865 Enable more models to inference based on LoRA (#3382)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-03-25 18:09:31 -07:00
dfeb2ecc3a [Misc] Include matched stop string/token in responses (#2976)
Co-authored-by: Sahil Suneja <sahilsuneja@gmail.com>
2024-03-25 17:31:32 -07:00
3a243095e5 Optimize _get_ranks in Sampler (#3623) 2024-03-25 16:03:02 -07:00
64172a976c [Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00
f408d05c52 hotfix isort on logprobs ranks pr (#3622) 2024-03-25 11:55:46 -07:00
0b4997e05c [Bugfix] API stream returning two stops (#3450)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
2024-03-25 10:14:34 -07:00
c13ad1b7bd feat: implement the min_tokens sampling parameter (#3124)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-03-25 10:14:26 -07:00
819924e749 [Core] Adding token ranks along with logprobs (#3516)
Co-authored-by: Swapnil Parekh <swapnilp@ibm.com>
2024-03-25 10:13:10 -07:00
01bfb22b41 [CI] Try introducing isort. (#3495) 2024-03-25 07:59:47 -07:00
e67c295b0c [Bugfix] fix automatic prefix args and add log info (#3608) 2024-03-25 05:35:22 -07:00
925f3332ca [Core] Refactor Attention Take 2 (#3462) 2024-03-25 04:39:33 +00:00
b0dfa91dd7 [Model] Add starcoder2 awq support (#3569) 2024-03-24 21:07:36 -07:00
56a8652f33 [Bugfix] store lock file in tmp directory (#3578)" (#3599)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-03-24 20:06:50 -07:00
6d93d35308 [BugFix] tensor.get_device() -> tensor.device (#3604) 2024-03-24 19:01:13 -07:00
837e185142 [CI/Build] fix flaky test (#3602) 2024-03-24 17:43:05 -07:00
42bc386129 [CI/Build] respect the common environment variable MAX_JOBS (#3600) 2024-03-24 17:04:00 -07:00
8b268a46a7 [CI] typo fix: is_hip --> is_hip() (#3595) 2024-03-24 16:03:06 -07:00
41deac4a3d [BugFix] 1D query fix for MoE models (#3597) 2024-03-24 16:00:16 -07:00
af9e53496f [BugFix] Fix Falcon tied embeddings (#3590)
Co-authored-by: 44670 <44670@users.noreply.github.com>
2024-03-24 06:34:01 -07:00
f8a12ecc7f [Misc] Bump transformers version (#3592) 2024-03-24 06:32:45 -07:00
3c5ab9b811 [Misc] Fix BLOOM copyright notice (#3591) 2024-03-23 23:30:56 -07:00
743a0b7402 [Bugfix] use SoftLockFile instead of LockFile (#3578) 2024-03-23 11:43:11 -07:00
bfdb1ba5c3 [Core] Improve detokenization performance for prefill (#3469)
Co-authored-by: MeloYang <meloyang05@gmail.com>
2024-03-22 13:44:12 -07:00
cf2f084d56 Dynamic scheduler delay to improve ITL performance (#3279)
Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com>
2024-03-22 12:28:14 -07:00
f721096d48 [BugFix] Some fixes for custom allreduce kernels (#2760) 2024-03-21 23:02:58 -07:00
e90fc21f2e [Hardware][Neuron] Refactor neuron support (#3471) 2024-03-22 01:22:17 +00:00
Roy
ea5f14e6ff [Bugfix][Model] Fix Qwen2 (#3554) 2024-03-22 00:18:58 +00:00
b7050ca7df [BugFix] gemma loading after quantization or LoRA. (#3553) 2024-03-21 13:16:57 -07:00
c188ecb080 [Misc] Bump up transformers to v4.39.0 & Remove StarCoder2Config (#3551)
Co-authored-by: Roy <jasonailu87@gmail.com>
Co-authored-by: Roger Meier <r.meier@siemens.com>
2024-03-21 07:58:12 -07:00
Roy
865732342b [Misc][Log] Add log for tokenizer length not equal to vocabulary size (#3500) 2024-03-21 18:07:48 +08:00
4c07dd28c0 [🚀 Ready to be merged] Added support for Jais models (#3183) 2024-03-21 09:45:24 +00:00
3bbff9e5ab Fix 1D query issue from _prune_hidden_states (#3539) 2024-03-21 08:49:06 +00:00
6ebd02bdef [PREFIX CACHING FOLLOW UP] OrderedDict-based evictor (#3431)
Co-authored-by: rsnm2 <rshaw@neuralmagic.com>
Co-authored-by: Luka <luka@paperspace>
2024-03-20 23:20:04 -07:00
523e30ea0c [BugFix] Hot fix in setup.py for neuron build (#3537) 2024-03-20 17:59:52 -07:00
Roy
f1c0fc3919 Migrate logits computation and gather to model_runner (#3233) 2024-03-20 23:25:01 +00:00
6e435de766 [1/n][Chunked Prefill] Refactor input query shapes (#3236) 2024-03-20 14:46:05 -07:00
426ec4ec67 [1/n] Triton sampling kernel (#3186)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-03-20 14:45:08 -07:00
80e254834d [Bugfix] Fix ROCm support in CMakeLists.txt (#3534) 2024-03-20 21:05:03 +00:00
ba8ae1d84f Check for _is_cuda() in compute_num_jobs (#3481) 2024-03-20 10:06:56 -07:00
84eaa68425 Abort when nvcc command is not found in the PATH (#3527) 2024-03-20 09:28:29 -07:00
5ee14494e4 [Misc] Remove cache stream and cache events (#3461) 2024-03-20 00:38:53 -07:00
4ad521d8b5 [Core] Add generic typing to LRUCache (#3511) 2024-03-20 00:36:09 -07:00
9474e89ba4 [PREFIX CACHING FOLLOW UP] A bunch of fixes to block allocator performance when automatic prefix caching is disabled (#3357)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-03-20 00:11:11 -07:00
20478c4d3a Use lru_cache for some environment detection utils (#3508) 2024-03-19 21:34:15 +00:00
63e8b28a99 [Doc] minor fix of spelling in amd-installation.rst (#3506) 2024-03-19 20:32:30 +00:00
cc63d03fbb Revert "[Core] Cache some utils" (#3507) 2024-03-19 13:22:58 -07:00
2a60c9bd17 [Doc] minor fix to neuron-installation.rst (#3505) 2024-03-19 13:21:35 -07:00
c614cfee58 Update dockerfile with ModelScope support (#3429) 2024-03-19 10:54:59 -07:00
7341c77d69 [BugFix] Avoid initializing CUDA too early (#3487) 2024-03-18 23:05:20 -07:00
ef65dcfa6f [Doc] Add docs about OpenAI compatible server (#3288) 2024-03-18 22:05:34 -07:00
6a9c583e73 [Core] print error before deadlock (#3459) 2024-03-19 04:06:23 +00:00
b37cdce2b1 [Core] Cache some utils (#3474) 2024-03-18 17:14:26 -07:00
b30880a762 [Misc] Update README for the Third vLLM Meetup (#3479) 2024-03-18 15:58:38 -07:00
49eedea373 [Core] Zero-copy asdict for InputMetadata (#3475) 2024-03-18 22:56:40 +00:00
9fdf3de346 Cmake based build system (#2830) 2024-03-18 15:38:33 -07:00
c0c17d4896 [Misc] Fix PR Template (#3478) 2024-03-18 15:00:31 -07:00
097aa0ea22 [CI/Build] Fix Bad Import In Test (#3473) 2024-03-18 20:28:00 +00:00
482b0adf1b [Testing] Add test_config.py to CI (#3437) 2024-03-18 12:48:45 -07:00
8c654c045f CI: Add ROCm Docker Build (#2886) 2024-03-18 19:33:47 +00:00
9101d832e6 [Bugfix] Make moe_align_block_size AMD-compatible (#3470) 2024-03-18 11:26:24 -07:00
93348d9458 [CI] Shard tests for LoRA and Kernels to speed up (#3445) 2024-03-17 14:56:30 -07:00
abfc4f3387 [Misc] Use dataclass for InputMetadata (#3452)
Co-authored-by: youkaichao <youkaichao@126.com>
2024-03-17 10:02:46 +00:00
6b78837b29 Fix setup.py neuron-ls issue (#2671) 2024-03-16 16:00:25 -07:00
120157fd2a Support arbitrary json_object in OpenAI and Context Free Grammar (#3211) 2024-03-16 13:35:27 -07:00
8e67598aa6 [Misc] fix line length for entire codebase (#3444) 2024-03-16 00:36:29 -07:00
ad50bf4b25 fix lint 2024-03-15 22:23:38 -07:00
cf6ff18246 Fix Baichuan chat template (#3340) 2024-03-15 21:02:12 -07:00
14e3f9a1b2 Replace lstrip() with removeprefix() to fix Ruff linter warning (#2958) 2024-03-15 21:01:30 -07:00
3123f15138 Fixes the incorrect argument in the prefix-prefill test cases (#3246) 2024-03-15 20:58:10 -07:00
413366e9a2 [Misc] PR templates (#3413)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-03-15 18:25:51 -07:00
10585e035e Removed Extraneous Print Message From OAI Server (#3440) 2024-03-16 00:35:36 +00:00
fb96c1e98c Asynchronous tokenization (#2879) 2024-03-15 23:37:01 +00:00
8fa7357f2d fix document error for value and v_vec illustration (#3421) 2024-03-15 16:06:09 -07:00
a7af4538ca Fix issue templates (#3436) 2024-03-15 21:26:00 +00:00
604f235937 [Misc] add error message in non linux platform (#3438) 2024-03-15 21:21:37 +00:00
14b8ae02e7 Fixes the misuse/mixuse of time.time()/time.monotonic() (#3220)
Signed-off-by: Tao He <sighingnow@gmail.com>
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-03-15 18:25:43 +00:00
03d37f2441 [Fix] Add args for mTLS support (#3430)
Co-authored-by: declark1 <daniel.clark@ibm.com>
2024-03-15 09:56:13 -07:00
a7c871680e Fix tie_word_embeddings for Qwen2. (#3344) 2024-03-15 09:36:53 -07:00
429284dc37 Fix dist.broadcast stall without group argument (#3408) 2024-03-14 23:25:05 -07:00
253a98078a Add chat templates for ChatGLM (#3418) 2024-03-14 23:19:22 -07:00
21539e6856 Add chat templates for Falcon (#3420) 2024-03-14 23:19:02 -07:00
b522c4476f [Misc] add HOST_IP env var (#3419)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-03-14 21:32:52 -07:00
78b6c4845a Dynamically configure shared memory size for moe_align_block_size_kernel (#3376) 2024-03-14 18:18:07 -07:00
b983ba35bd fix marlin config repr (#3414) 2024-03-14 16:26:19 -07:00
54be8a0be2 Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-03-14 13:56:57 -07:00
dfc77408bd [issue templates] add some issue templates (#3412) 2024-03-14 13:16:00 -07:00
c17ca8ef18 Add args for mTLS support (#3410)
Co-authored-by: Daniel Clark <daniel.clark@ibm.com>
2024-03-14 13:11:45 -07:00
06ec486794 Install flash_attn in Docker image (#3396) 2024-03-14 10:55:54 -07:00
8fe8386591 [Kernel] change benchmark script so that result can be directly used; tune moe kernel in A100/H100 with tp=2,4,8 (#3389) 2024-03-14 08:11:48 +00:00
a37415c31b allow user to chose which vllm's merics to display in grafana (#3393) 2024-03-14 06:35:13 +00:00
81653d9688 [Hotfix] [Debug] test_openai_server.py::test_guided_regex_completion (#3383) 2024-03-13 17:02:21 -07:00
eeab52a4ff [FIX] Simpler fix for async engine running on ray (#3371) 2024-03-13 14:18:40 -07:00
c33afd89f5 Fix lint (#3388) 2024-03-13 13:56:49 -07:00
7e9bd08f60 Add batched RoPE kernel (#3095) 2024-03-13 13:45:26 -07:00
ae0ccb4017 Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism) when using Multi-LoRA. (#3350) 2024-03-13 12:18:25 -07:00
739c350c19 [Minor Fix] Use cupy-cuda11x in CUDA 11.8 build (#3256) 2024-03-13 09:43:24 -07:00
ba8dc958a3 [Minor] Fix bias in if to remove ambiguity (#3259) 2024-03-13 09:16:55 -07:00
e221910e77 add hf_transfer to requirements.txt (#3031) 2024-03-12 23:33:43 -07:00
b167109ba1 [Fix] Fix quantization="gptq" when using Marlin (#3319)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-03-12 22:51:42 -07:00
602358f8a8 Add kernel for GeGLU with approximate GELU (#3337) 2024-03-12 22:06:17 -07:00
49a3c8662b Fixes #1556 double free (#3347) 2024-03-13 00:30:08 +00:00
b0925b3878 docs: Add BentoML deployment doc (#3336)
Signed-off-by: Sherlock113 <sherlockxu07@gmail.com>
2024-03-12 10:34:30 -07:00
654865e21d Support Mistral Model Inference with transformers-neuronx (#3153) 2024-03-11 13:19:51 -07:00
c9415c19d3 [ROCm] Fix warp and lane calculation in blockReduceSum (#3321) 2024-03-11 13:14:07 -07:00
4c922709b6 Add distributed model executor abstraction (#3191) 2024-03-11 11:03:45 -07:00
657061fdce [docs] Add LoRA support information for models (#3299) 2024-03-11 00:54:51 -07:00
2f8844ba08 Re-enable the 80 char line width limit (#3305) 2024-03-10 19:49:14 -07:00
4b59f00e91 [Fix] Fix best_of behavior when n=1 (#3298) 2024-03-10 19:17:46 -07:00
Roy
9e8744a545 [BugFix] Fix get tokenizer when using ray (#3301) 2024-03-10 19:17:16 -07:00
e4a28e5316 [ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUDA (#3262) 2024-03-10 15:27:45 -07:00
0bba88df03 Enhance lora tests with more layer and rank variations (#3243) 2024-03-09 17:14:16 -08:00
8437bae6ef [Speculative decoding 3/9] Worker which speculates, scores, and applies rejection sampling (#3103) 2024-03-08 23:32:46 -08:00
f48c6791b7 [FIX] Fix prefix test error on main (#3286) 2024-03-08 17:16:14 -08:00
c2c5e0909a Move model filelocks from /tmp/ to ~/.cache/vllm/locks/ dir (#3241) 2024-03-08 13:33:10 -08:00
1cb0cc2975 [FIX] Make flash_attn optional (#3269) 2024-03-08 10:52:20 -08:00
99c3cfb83c [Docs] Fix Unmocked Imports (#3275) 2024-03-08 09:58:01 -08:00
1ece1ae829 [Minor Fix] Fix comments in benchmark_serving (#3252) 2024-03-07 22:22:59 -08:00
c59e120c55 Feature add lora support for Qwen2 (#3177) 2024-03-07 21:58:24 -08:00
d2339d6840 Connect engine healthcheck to openai server (#3260) 2024-03-07 16:38:12 -08:00
b35cc93420 Fix auto prefix bug (#3239) 2024-03-07 16:37:28 -08:00
8cbba4622c Possible fix for conflict between Automated Prefix Caching (#2762) and multi-LoRA support (#1804) (#3263) 2024-03-07 23:03:22 +00:00
385da2dae2 Measure model memory usage (#3120) 2024-03-07 11:42:42 -08:00
2daf23ab0c Separate attention backends (#3005) 2024-03-07 01:45:50 -08:00
cbf4c05b15 Update requirements-dev.txt to include package for benchmarking scripts. (#3181)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-03-07 08:39:28 +00:00
d3c04b6a39 Add GPTQ support for Gemma (#3200) 2024-03-07 08:19:14 +08:00
4cb3b924cd Add tqdm dynamic_ncols=True (#3242) 2024-03-06 22:41:42 +00:00
a33ce60c66 [Testing] Fix core tests (#3224) 2024-03-06 01:04:23 -08:00
24aecf421a [Tests] Add block manager and scheduler tests (#3108) 2024-03-05 18:23:34 -08:00
2efce05dc3 [Fix] Avoid pickling entire LLMEngine for Ray workers (#3207)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-03-06 00:17:20 +00:00
8999ec3c16 Store eos_token_id in Sequence for easy access (#3166) 2024-03-05 15:35:43 -08:00
05af6da8d9 [ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#3123)
Co-authored-by: lcskrishna <lollachaitanya@gmail.com>
2024-03-04 18:14:53 -08:00
9a4548bae7 Fix the openai benchmarking requests to work with latest OpenAI apis (#2992)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-03-04 15:51:56 -08:00
ff578cae54 Add health check, make async Engine more robust (#3015)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-03-04 22:01:40 +00:00
22de45235c Push logprob generation to LLMEngine (#3065)
Co-authored-by: Avnish Narayan <avnish@anyscale.com>
2024-03-04 19:54:06 +00:00
76e8a70476 [Minor fix] The domain dns.google may cause a socket.gaierror exception (#3176)
Co-authored-by: guofangze <guofangze@kuaishou.com>
2024-03-04 19:17:12 +00:00
9cbc7e5f3b enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
2024-03-04 10:37:58 -08:00
27a7b070db Add document for vllm paged attention kernel. (#2978) 2024-03-04 09:23:34 -08:00
901cf4c52b [Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171) 2024-03-03 22:48:27 -08:00
d0fae88114 [DOC] add setup document to support neuron backend (#2777) 2024-03-04 01:03:51 +00:00
17c3103c56 Make it easy to profile workers with nsight (#3162)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-03-03 16:19:13 -08:00
996d095c54 [FIX] Fix styles in automatic prefix caching & add a automatic prefix caching benchmark (#3158) 2024-03-03 14:37:18 -08:00
d65fac2738 Add vLLM version info to logs and openai API server (#3161) 2024-03-02 21:00:29 -08:00
ce4f5a29fb Add Automatic Prefix Caching (#2762)
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-03-02 00:50:01 -08:00
baee28c46c Reorder kv dtype check to avoid nvcc not found error on AMD platform (#3104) 2024-03-02 14:34:48 +08:00
29e70e3e88 allow user chose log level by --log-level instead of fixed 'info'. (#3109)
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-03-01 23:28:41 +00:00
82091b864a Bump up to v0.3.3 (#3129) 2024-03-01 12:58:06 -08:00
c0c2335ce0 Integrate Marlin Kernels for Int4 GPTQ inference (#2497)
Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com>
Co-authored-by: alexm <alexm@neuralmagic.com>
2024-03-01 12:47:51 -08:00
90fbf12540 fix relative import path of protocol.py (#3134)
Co-authored-by: huohuarong <huohuarong@zuoshouyisheng.com>
2024-03-01 19:42:06 +00:00
49d849b3ab docs: Add tutorial on deploying vLLM model with KServe (#2586)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2024-03-01 11:04:14 -08:00
27ca23dc00 Remove exclude_unset in streaming response (#3143) 2024-03-01 09:59:06 -08:00
54d3544784 Fix: Output text is always truncated in some models (#3016) 2024-03-01 07:52:22 +00:00
703e42ee4b Add guided decoding for OpenAI API server (#2819)
Co-authored-by: br3no <breno@veltefaria.de>
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-02-29 22:13:08 +00:00
29a8d6a554 [Fix] Don't deep-copy LogitsProcessors when copying SamplingParams (#3099) 2024-02-29 19:20:42 +00:00
2c08ff23c0 Fix building from source on WSL (#3112) 2024-02-29 11:13:58 -08:00
bfdcfa6a05 Support starcoder2 architecture (#3089) 2024-02-29 00:51:48 -08:00
9289e577ec add cache_config's info to prometheus metrics. (#3100) 2024-02-29 06:15:18 +00:00
a6d471c759 Fix: AttributeError in OpenAI-compatible server (#3018) 2024-02-28 22:04:07 -08:00
01a5d18a53 Add Support for 2/3/8-bit GPTQ Quantization Models (#2330) 2024-02-28 21:52:23 -08:00
929b4f2973 Add LoRA support for Gemma (#3050) 2024-02-28 13:03:28 -08:00
3b7178cfa4 [Neuron] Support inference with transformers-neuronx (#2569) 2024-02-28 09:34:34 -08:00
e46fa5d52e Restrict prometheus_client >= 0.18.0 to prevent errors when importing pkgs (#3070) 2024-02-28 05:38:26 +00:00
a8683102cc multi-lora documentation fix (#3064) 2024-02-27 21:26:15 -08:00
71bcaf99e2 Enable GQA support in the prefix prefill kernels (#3007)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-02-27 01:14:31 -08:00
8b430d7dea [Minor] Fix StableLMEpochForCausalLM -> StableLmForCausalLM (#3046) 2024-02-26 20:23:50 -08:00
e0ade06d63 Support logit bias for OpenAI API (#3027) 2024-02-27 11:51:53 +08:00
4bd18ec0c7 [Minor] Fix type annotation in fused moe (#3045) 2024-02-26 19:44:29 -08:00
2410e320b3 fix get_ip error in pure ipv6 environment (#2931) 2024-02-26 19:22:16 -08:00
48a8f4a7fd Support Orion model (#2539)
Co-authored-by: zhangdacheng <zhangdacheng@ainirobot.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-02-26 19:17:06 -08:00
Roy
4dd6416faf Fix stablelm (#3038) 2024-02-26 18:31:10 -08:00
Roy
c1c0d00b88 Don't use cupy when enforce_eager=True (#3037) 2024-02-26 17:33:38 -08:00
Roy
d9f726c4d0 [Minor] Remove unused config files (#3039) 2024-02-26 17:25:22 -08:00
d6e4a130b0 [Minor] Remove gather_cached_kv kernel (#3043) 2024-02-26 15:00:54 -08:00
cfc15a1031 Optimize Triton MoE Kernel (#2979)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-02-26 13:48:56 -08:00
70f3e8e3a1 Add LogProbs for Chat Completions in OpenAI (#2918) 2024-02-26 10:39:34 +08:00
ef978fe411 Port metrics from aioprometheus to prometheus_client (#2730) 2024-02-25 11:54:00 -08:00
f7c1234990 [Fix] Fissertion on YaRN model len (#2984) 2024-02-23 12:57:48 -08:00
57f044945f Fix nvcc not found in vlm-openai image (#2781) 2024-02-22 14:25:07 -08:00
4caf7044e0 Include tokens from prompt phase in counter_generation_tokens (#2802) 2024-02-22 14:00:12 -08:00
6f32cddf1c Remove Flash Attention in test env (#2982) 2024-02-22 09:58:29 -08:00
c530e2cfe3 [FIX] Fix a bug in initializing Yarn RoPE (#2983) 2024-02-22 01:40:05 -08:00
fd5dcc5c81 Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00
93dc5a2870 chore(vllm): codespell for spell checking (#2820) 2024-02-21 18:56:01 -08:00
95529e3253 Use Llama RMSNorm custom op for Gemma (#2974) 2024-02-21 18:28:23 -08:00
Roy
344020c926 Migrate MistralForCausalLM to LlamaForCausalLM (#2868) 2024-02-21 18:25:05 -08:00
5574081c49 Added early stopping to completion APIs (#2939) 2024-02-21 18:24:01 -08:00
d7f396486e Update comment (#2934) 2024-02-21 18:18:37 -08:00
8fbd84bf78 Bump up version to v0.3.2 (#2968)
This version is for more model support. Add support for Gemma models (#2964) and OLMo models (#2832).
2024-02-21 11:47:25 -08:00
7d2dcce175 Support per-request seed (#2514) 2024-02-21 11:47:00 -08:00
dc903e70ac [ROCm] Upgrade transformers to v4.38.0 (#2967) 2024-02-21 09:46:57 -08:00
a9c8212895 [FIX] Add Gemma model to the doc (#2966) 2024-02-21 09:46:15 -08:00
c20ecb6a51 Upgrade transformers to v4.38.0 (#2965) 2024-02-21 09:38:03 -08:00
5253edaacb Add Gemma model (#2964) 2024-02-21 09:34:30 -08:00
017d9f1515 Add metrics to RequestOutput (#2876) 2024-02-20 21:55:57 -08:00
181b27d881 Make vLLM logging formatting optional (#2877) 2024-02-20 14:38:55 -08:00
63e2a6419d [FIX] Fix beam search test (#2930) 2024-02-20 14:37:39 -08:00
264017a2bf [ROCm] include gfx908 as supported (#2792) 2024-02-19 17:58:59 -08:00
e433c115bc Fix vllm:prompt_tokens_total metric calculation (#2869) 2024-02-18 23:55:41 -08:00
86fd8bb0ac Add warning to prevent changes to benchmark api server (#2858) 2024-02-18 21:36:19 -08:00
ab3a5a8259 Support OLMo models. (#2832) 2024-02-18 21:05:15 -08:00
a61f0521b8 [Test] Add basic correctness test (#2908) 2024-02-18 16:44:50 -08:00
537c9755a7 [Minor] Small fix to make distributed init logic in worker looks cleaner (#2905) 2024-02-18 14:39:00 -08:00
786b7f18a5 Add code-revision config argument for Hugging Face Hub (#2892) 2024-02-17 22:36:53 -08:00
8f36444c4f multi-LoRA as extra models in OpenAI server (#2775)
how to serve the loras (mimicking the [multilora inference example](https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py)):
```terminal
$ export LORA_PATH=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/
$ python -m vllm.entrypoints.api_server \
 --model meta-llama/Llama-2-7b-hf \
 --enable-lora \
 --lora-modules sql-lora=$LORA_PATH sql-lora2=$LORA_PATH
```
the above server will list 3 separate values if the user queries `/models`: one for the base served model, and one each for the specified lora modules. in this case sql-lora and sql-lora2 point to the same underlying lora, but this need not be the case. lora config values take the same values they do in EngineArgs

no work has been done here to scope client permissions to specific models
2024-02-17 12:00:48 -08:00
185b2c29e2 Defensively copy sampling_params (#2881)
If the SamplingParams object passed to LLMEngine.add_request() is mutated after it returns, it could affect the async sampling process for that request.

Suggested by @Yard1 https://github.com/vllm-project/vllm/pull/2514#discussion_r1490106059
2024-02-17 11:18:04 -08:00
5f08050d8d Bump up to v0.3.1 (#2887) 2024-02-16 15:05:18 -08:00
64da65b322 Prefix Caching- fix t4 triton error (#2517) 2024-02-16 14:17:55 -08:00
5255d99dc5 [ROCm] Dockerfile fix for flash-attention build (#2885) 2024-02-15 10:22:39 -08:00
4f2ad11135 Fix DeciLM (#2883) 2024-02-14 22:29:57 -08:00
d7afab6d3a [BugFix] Fix GC bug for LLM class (#2882) 2024-02-14 22:17:44 -08:00
31348dff03 Align LoRA code between Mistral and Mixtral (fixes #2875) (#2880)
* Fix AttributeError: MixtralModel object has no attribute org_vocab_size.

* Make LoRA logic for Mistral and Mixtral the same

---------

Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
2024-02-15 01:00:43 +01:00
25e86b6a61 Don't use cupy NCCL for AMD backends (#2855) 2024-02-14 12:30:44 -08:00
Roy
4efbac6d35 Migrate AquilaForCausalLM to LlamaForCausalLM (#2867) 2024-02-14 12:30:24 -08:00
87069ccf68 Fix docker python version (#2845) 2024-02-14 10:17:57 -08:00
7e45107f51 [Fix] Fix memory profiling when GPU is used by multiple processes (#2863) 2024-02-13 19:52:34 -08:00
0c48b37c31 Fix internlm after https://github.com/vllm-project/vllm/pull/2860 (#2861) 2024-02-13 18:01:15 -08:00
7eacffd951 Migrate InternLMForCausalLM to LlamaForCausalLM (#2860)
Co-authored-by: Roy <jasonailu87@gmail.com>
2024-02-13 17:12:05 -08:00
2a543d6efe Add LoRA support for Mixtral (#2831)
* add mixtral lora support

* formatting

* fix incorrectly ported logic

* polish tests

* minor fixes and refactoring

* minor fixes

* formatting

* rename and remove redundant logic

* refactoring

* refactoring

* minor fix

* minor refactoring

* fix code smell
2024-02-14 00:55:45 +01:00
317b29de0f Remove Yi model definition, please use LlamaForCausalLM instead (#2854)
Co-authored-by: Roy <jasonailu87@gmail.com>
2024-02-13 14:22:22 -08:00
a463c333dd Use CuPy for CUDA graphs (#2811) 2024-02-13 11:32:06 -08:00
ea356004d4 Revert "Refactor llama family models (#2637)" (#2851)
This reverts commit 5c976a7e1a1bec875bf6474824b7dff39e38de18.
2024-02-13 09:24:59 -08:00
Roy
5c976a7e1a Refactor llama family models (#2637) 2024-02-13 00:09:23 -08:00
f964493274 [CI] Ensure documentation build is checked in CI (#2842) 2024-02-12 22:53:07 -08:00
a4211a4dc3 Serving Benchmark Refactoring (#2433) 2024-02-12 22:53:00 -08:00
Rex
563836496a Refactor 2 awq gemm kernels into m16nXk32 (#2723)
Co-authored-by: Chunan Zeng <chunanzeng@Chunans-Air.attlocal.net>
2024-02-12 11:02:17 -08:00
4ca2c358b1 Add documentation section about LoRA (#2834) 2024-02-12 17:24:45 +01:00
0580aab02f [ROCm] support Radeon™ 7900 series (gfx1100) without using flash-attention (#2768) 2024-02-10 23:14:37 -08:00
3711811b1d Disable custom all reduce by default (#2808) 2024-02-08 09:58:03 -08:00
65b89d16ee [Ray] Integration compiled DAG off by default (#2471) 2024-02-08 09:57:25 -08:00
931746bc6d Add documentation on how to do incremental builds (#2796) 2024-02-07 14:42:02 -08:00
c81dddb45c [ROCm] Fix build problem resulted from previous commit related to FP8 kv-cache support (#2790) 2024-02-06 22:36:59 -08:00
fe6d09ae61 [Minor] More fix of test_cache.py CI test failure (#2750) 2024-02-06 11:38:38 -08:00
ed70c70ea3 modelscope: fix issue when model parameter is not a model id but path of the model. (#2489) 2024-02-06 09:57:15 -08:00
f0d4e14557 Add fused top-K softmax kernel for MoE (#2769) 2024-02-05 17:38:02 -08:00
2ccee3def6 [ROCm] Fixup arch checks for ROCM (#2627) 2024-02-05 14:59:09 -08:00
b92adec8e8 Set local logging level via env variable (#2774) 2024-02-05 14:26:50 -08:00
56f738ae9b [ROCm] Fix some kernels failed unit tests (#2498) 2024-02-05 14:25:36 -08:00
72d3a30c63 [Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00
c9b45adeeb Require triton >= 2.1.0 (#2746)
Co-authored-by: yangrui1 <yangrui@lanjingren.com>
2024-02-04 23:07:36 -08:00
Rex
5a6c81b051 Remove eos tokens from output by default (#2611) 2024-02-04 14:32:42 -08:00
51cd22ce56 set&get llm internal tokenizer instead of the TokenizerGroup (#2741)
Co-authored-by: shujunhua1 <shujunhua1@jd.com>
2024-02-04 14:25:36 -08:00
5ed704ec8c docs: fix langchain (#2736) 2024-02-03 18:17:55 -08:00
4abf6336ec Add one example to run batch inference distributed on Ray (#2696) 2024-02-02 15:41:42 -08:00
0e163fce18 Fix default length_penalty to 1.0 (#2667) 2024-02-01 15:59:39 -08:00
96b6f475dd Remove hardcoded device="cuda" to support more devices (#2503)
Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2024-02-01 15:46:39 -08:00
c410f5d020 Use revision when downloading the quantization config file (#2697)
Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
2024-02-01 15:41:58 -08:00
bb8c697ee0 Update README for meetup slides (#2718) 2024-02-01 14:56:53 -08:00
b9e96b17de fix python 3.8 syntax (#2716) 2024-02-01 14:00:58 -08:00
923797fea4 Fix compile error when using rocm (#2648) 2024-02-01 09:35:09 -08:00
cd9e60c76c Add Internlm2 (#2666) 2024-02-01 09:27:40 -08:00
93b38bea5d Refactor Prometheus and Add Request Level Metrics (#2316) 2024-01-31 14:58:07 -08:00
d0d93b92b1 Add unit test for Mixtral MoE layer (#2677) 2024-01-31 14:34:17 -08:00
89efcf1ce5 [Minor] Fix test_cache.py CI test failure (#2684) 2024-01-31 10:12:11 -08:00
c664b0e683 fix some bugs (#2689) 2024-01-31 10:09:23 -08:00
d69ff0cbbb Fixes assertion failure in prefix caching: the lora index mapping should respect prefix_len (#2688)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-01-31 18:00:13 +01:00
1af090b57d Bump up version to v0.3.0 (#2656) 2024-01-31 00:07:07 -08:00
3dad944485 Add quantized mixtral support (#2673) 2024-01-30 16:34:10 -08:00
105a40f53a [Minor] Fix false warning when TP=1 (#2674) 2024-01-30 14:39:40 -08:00
bbe9bd9684 [Minor] Fix a small typo (#2672) 2024-01-30 13:40:37 -08:00
4f65af0e25 Add swap_blocks unit tests (#2616) 2024-01-30 09:30:50 -08:00
d79ced3292 Fix 'Actor methods cannot be called directly' when using --engine-use-ray (#2664)
* fix: engine-useray complain

* fix: typo
2024-01-30 17:17:05 +01:00
ab40644669 Fused MOE for Mixtral (#2542)
Co-authored-by: chen shen <scv119@gmail.com>
2024-01-29 22:43:37 -08:00
5d60def02c DeepseekMoE support with Fused MoE kernel (#2453)
Co-authored-by: roy <jasonailu87@gmail.com>
2024-01-29 21:19:48 -08:00
ea8489fce2 ROCm: Allow setting compilation target (#2581) 2024-01-29 10:52:31 -08:00
1b20639a43 No repeated IPC open (#2642) 2024-01-29 10:46:29 -08:00
b72af8f1ed Fix error when tp > 1 (#2644)
Co-authored-by: zhaoyang-star <zhao.yang16@zte.com.cn>
2024-01-28 22:47:39 -08:00
9090bf02e7 Support FP8-E5M2 KV Cache (#2279)
Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-01-28 16:43:54 -08:00
7d648418b8 Update Ray version requirements (#2636) 2024-01-28 14:27:22 -08:00
89be30fa7d Small async_llm_engine refactor (#2618) 2024-01-27 23:28:37 -08:00
f8ecb84c02 Speed up Punica compilation (#2632) 2024-01-27 17:46:56 -08:00
5f036d2bcc [Minor] Fix warning on Ray dependencies (#2630) 2024-01-27 15:43:40 -08:00
380170038e Implement custom all reduce kernels (#2192) 2024-01-27 12:46:35 -08:00
220a47627b Use head_dim in config if exists (#2622) 2024-01-27 10:30:49 -08:00
beb89f68b4 AWQ: Up to 2.66x higher throughput (#2566) 2024-01-26 23:53:17 -08:00
390b495ff3 Don't build punica kernels by default (#2605) 2024-01-26 15:19:19 -08:00
3a0e1fc070 Support for Stable LM 2 (#2598)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-01-26 12:45:19 -08:00
6b7de1a030 [ROCm] add support to ROCm 6.0 and MI300 (#2274) 2024-01-26 12:41:10 -08:00
5265631d15 use a correct device when creating OptionalCUDAGuard (#2583) 2024-01-25 23:48:17 -08:00
2832e7b9f9 fix names and license for Qwen2 (#2589) 2024-01-24 22:37:51 -08:00
3a7dd7e367 Support Batch Completion in Server (#2529) 2024-01-24 17:11:07 -08:00
223c19224b Fix the syntax error in the doc of supported_models (#2584) 2024-01-24 11:22:51 -08:00
f1f6cc10c7 Added include_stop_str_in_output and length_penalty parameters to OpenAI API (#2562) 2024-01-24 10:21:56 -08:00
3209b49033 [Bugfix] fix crash if max_tokens=None (#2570) 2024-01-23 22:38:55 -08:00
1e4277d2d1 lint: format all python file instead of just source code (#2567) 2024-01-23 15:53:06 -08:00
9b945daaf1 [Experimental] Add multi-LoRA support (#1804)
Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
Co-authored-by: Avnish Narayan <avnish@anyscale.com>
2024-01-23 15:26:37 -08:00
9c1352eb57 [Feature] Simple API token authentication and pluggable middlewares (#1106) 2024-01-23 15:13:00 -08:00
7a0b011dd5 Add a 1-line docstring to explain why calling context_attention_fwd twice in test_prefix_prefill.py (#2553) 2024-01-22 14:47:25 -08:00
63e835cbcc Fix progress bar and allow HTTPS in benchmark_serving.py (#2552) 2024-01-22 14:40:31 -08:00
94b5edeb53 Add qwen2 (#2495) 2024-01-22 14:34:21 -08:00
ab7e6006d6 Fix https://github.com/vllm-project/vllm/issues/2540 (#2545) 2024-01-22 19:02:38 +01:00
18bfcdd05c [Speculative decoding 2/9] Multi-step worker for draft model (#2424) 2024-01-21 16:31:47 -08:00
71d63ed72e migrate pydantic from v1 to v2 (#2531) 2024-01-21 16:05:56 -08:00
d75c40734a [Fix] Keep scheduler.running as deque (#2523) 2024-01-20 22:36:09 -08:00
5b23c3f26f Add group as an argument in broadcast ops (#2522) 2024-01-20 16:00:26 -08:00
00efdc84ba Add benchmark serving to CI (#2505) 2024-01-19 20:20:19 -08:00
Roy
91a61da9b1 [Bugfix] fix load local safetensors model (#2512) 2024-01-19 16:26:16 -08:00
ef9b636e2d Simplify broadcast logic for control messages (#2501) 2024-01-19 11:23:30 -08:00
2709c0009a Support OpenAI API server in benchmark_serving.py (#2172) 2024-01-18 20:34:08 -08:00
dd7e8f5f64 refactor complemention api for readability (#2499) 2024-01-18 16:45:14 -08:00
d2a68364c4 [BugFix] Fix abort_seq_group (#2463) 2024-01-18 15:10:42 -08:00
7e1081139d Don't download both safetensor and bin files. (#2480) 2024-01-18 11:05:53 -08:00
18473cf498 [Neuron] Add an option to build with neuron (#2065) 2024-01-18 10:58:50 -08:00
4df417d059 fix: fix some args desc (#2487) 2024-01-18 09:41:44 -08:00
5d80a9178b Minor fix in prefill cache example (#2494) 2024-01-18 09:40:34 -08:00
8a25d3a71a fix stablelm.py tensor-parallel-size bug (#2482) 2024-01-18 09:39:46 -08:00
d10f8e1d43 [Experimental] Prefix Caching Support (#1669)
Co-authored-by: DouHappy <2278958187@qq.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-01-17 16:32:10 -08:00
14cc317ba4 OpenAI Server refactoring (#2360) 2024-01-16 21:33:14 -08:00
e1957c6ebd Add StableLM3B model (#2372) 2024-01-16 20:32:40 -08:00
8cd5a992bf ci: retry on build failure as well (#2457) 2024-01-16 12:51:04 -08:00
947f0b23cc CI: make sure benchmark script exit on error (#2449) 2024-01-16 09:50:13 -08:00
f780504d12 fix weigit loading for GQA with TP (#2379) 2024-01-15 15:43:59 -08:00
bfc072addf Allow buildkite to retry build on agent lost (#2446) 2024-01-15 15:43:15 -08:00
2a18da257c Announce the second vLLM meetup (#2444) 2024-01-15 14:11:59 -08:00
6e01e8c1c8 [CI] Add Buildkite (#2355) 2024-01-14 12:37:58 -08:00
Roy
9f659bf07f [Minor] Optimize cuda graph memory usage (#2437) 2024-01-14 18:40:51 +01:00
35c4bc20d9 [Minor] Fix err msg (#2431) 2024-01-12 14:02:52 -08:00
218dc2ccda Aligning top_p and top_k Sampling (#1885)
* Align top_p and top_k with huggingface

* remove _get_prompt_and_output_tokens

* rename _apply_top_p_top_k

* compare top_p top_k with hf

* fix test errors
2024-01-12 22:51:03 +01:00
827cbcd37c Update quickstart.rst (#2369) 2024-01-12 12:56:18 -08:00
Ben
cb7a1c1cbf Suggest using dtype=half when OOM. 2024-01-12 12:33:29 -08:00
7878958c0d Address Phi modeling update 2 (#2428) 2024-01-12 12:16:49 -08:00
ce036244c9 Allow setting fastapi root_path argument (#2341) 2024-01-12 10:59:59 -08:00
48cf1e413c fix: deque mutated during iteration in abort_seq_group (#2371) 2024-01-12 17:44:18 +01:00
97460585d9 Add gradio chatbot for openai webserver (#2307) 2024-01-11 19:45:56 -08:00
f745847ef7 [Minor] Fix the format in quick start guide related to Model Scope (#2425) 2024-01-11 19:44:01 -08:00
6549aef245 [DOC] Add additional comments for LLMEngine and AsyncLLMEngine (#1011) 2024-01-11 19:26:49 -08:00
50376faa7b Rename phi_1_5 -> phi (#2385) 2024-01-11 16:23:43 -08:00
4b61c6b669 get_ip(): Fix ipv4 ipv6 dualstack (#2408) 2024-01-10 11:39:58 -08:00
79d64c4954 [Speculative decoding 1/9] Optimized rejection sampler (#2336) 2024-01-09 15:38:41 -08:00
KKY
74cd5abdd1 Add baichuan chat template jinjia file (#2390) 2024-01-09 09:13:02 -08:00
28c3f12104 [Minor] Remove unused code in attention (#2384) 2024-01-08 13:13:08 -08:00
c884819135 Fix eager mode performance (#2377) 2024-01-08 10:11:06 -08:00
05921a9a7a Changed scheduler to use deques instead of lists (#2290)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-01-07 09:48:07 -08:00
d0215a58e7 Ensure metrics are logged regardless of requests (#2347) 2024-01-05 05:24:42 -08:00
937e7b7d7c Build docker image with shared objects from "build" step (#2237) 2024-01-04 09:35:18 -08:00
aee8ef661a Miner fix of type hint (#2340) 2024-01-03 21:27:56 -08:00
2e0b6e7757 Bump up to v0.2.7 (#2337) 2024-01-03 17:35:56 -08:00
941767127c Revert the changes in test_cache (#2335) 2024-01-03 17:32:05 -08:00
74d8d77626 Remove unused const TIMEOUT_TO_PREVENT_DEADLOCK (#2321) 2024-01-03 15:49:07 -08:00
fd4ea8ef5c Use NCCL instead of ray for control-plane communication to remove serialization overhead (#2221) 2024-01-03 11:30:22 -08:00
1066cbd152 Remove deprecated parameter: concurrency_count (#2315) 2024-01-03 09:56:21 -08:00
6ef00b03a2 Enable CUDA graph for GPTQ & SqueezeLLM (#2318) 2024-01-03 09:52:29 -08:00
Roy
9140561059 [Minor] Fix typo and remove unused code (#2305) 2024-01-02 19:23:15 -08:00
77af974b40 [FIX] Support non-zero CUDA devices in custom kernels (#1959) 2024-01-02 19:09:59 -08:00
4934d49274 Support GPT-NeoX Models without attention biases (#2301) 2023-12-30 11:42:04 -05:00
358c328d69 [BUGFIX] Fix communication test (#2285) 2023-12-27 17:18:11 -05:00
4aaafdd289 [BUGFIX] Fix the path of test prompts (#2273) 2023-12-26 10:37:21 -08:00
66b108d142 [BUGFIX] Fix API server test (#2270) 2023-12-26 10:37:06 -08:00
e0ff920001 [BUGFIX] Do not return ignored sentences twice in async llm engine (#2258) 2023-12-26 13:41:09 +08:00
face83c7ec [Docs] Add "About" Heading to README.md (#2260) 2023-12-25 16:37:07 -08:00
1db83e31a2 [Docs] Update installation instructions to include CUDA 11.8 xFormers (#2246) 2023-12-22 23:20:02 -08:00
a1b9cb2a34 [BugFix] Fix recovery logic for sequence group (#2186) 2023-12-20 21:52:37 -08:00
3a4fd5ca59 Disable Ray usage stats collection (#2206) 2023-12-20 21:52:08 -08:00
c17daa9f89 [Docs] Fix broken links (#2222) 2023-12-20 12:43:42 -08:00
bd29cf3d3a Remove Sampler copy stream (#2209) 2023-12-20 00:04:33 -08:00
31bff69151 Make _prepare_sample non-blocking and use pinned memory for input buffers (#2207) 2023-12-19 16:52:46 -08:00
ba4f826738 [BugFix] Fix weight loading for Mixtral with TP (#2208) 2023-12-19 16:16:11 -08:00
de60a3fb93 Added DeciLM-7b and DeciLM-7b-instruct (#2062) 2023-12-19 02:29:33 -08:00
21d5daa4ac Add warning on CUDA graph memory usage (#2182) 2023-12-18 18:16:17 -08:00
290e015c6c Update Help Text for --gpu-memory-utilization Argument (#2183) 2023-12-18 11:33:24 -08:00
1b7c791d60 [ROCm] Fixes for GPTQ on ROCm (#2180) 2023-12-18 10:41:04 -08:00
bbe4466fd9 [Minor] Fix typo (#2166)
Co-authored-by: John-Saxon <zhang.xiangxuan@oushu.com>
2023-12-17 23:28:49 -08:00
08133c4d1a Add SSL arguments to API servers (#2109) 2023-12-18 10:56:23 +08:00
76a7983b23 [BugFix] Fix RoPE kernel on long sequences(#2164) 2023-12-17 17:09:10 -08:00
8041b7305e [BugFix] Raise error when max_model_len is larger than KV cache (#2163) 2023-12-17 17:08:23 -08:00
3ec8c25cd0 [Docs] Update documentation for gpu-memory-utilization option (#2162) 2023-12-17 10:51:57 -08:00
671af2b1c0 Bump up to v0.2.6 (#2157) 2023-12-17 10:34:56 -08:00
6f41f0e377 Disable CUDA graph for SqueezeLLM (#2161) 2023-12-17 10:24:25 -08:00
2c9b638065 [Minor] Fix a typo in .pt weight support (#2160) 2023-12-17 10:12:44 -08:00
a7347d9a6d Make sampler less blocking (#1889) 2023-12-17 23:03:49 +08:00
f8c688d746 [Minor] Add Phi 2 to supported models (#2159) 2023-12-17 02:54:57 -08:00
c9fadda543 [Minor] Fix xformers version (#2158) 2023-12-17 02:28:02 -08:00
30fb0956df [Minor] Add more detailed explanation on quantization argument (#2145) 2023-12-17 01:56:16 -08:00
3a765bd5e1 Temporarily enforce eager mode for GPTQ models (#2154) 2023-12-17 01:51:12 -08:00
26c52a5ea6 [Docs] Add CUDA graph support to docs (#2148) 2023-12-17 01:49:20 -08:00
c3372e87be Remove dependency on CuPy (#2152) 2023-12-17 01:49:07 -08:00
b0a1d667b0 Pin PyTorch & xformers versions (#2155) 2023-12-17 01:46:54 -08:00
e1d5402238 Fix all-reduce memory usage (#2151) 2023-12-17 01:44:45 -08:00
3d1cfbfc74 [Minor] Delete Llama tokenizer warnings (#2146) 2023-12-16 22:05:18 -08:00
37ca558103 Optimize model execution with CUDA graph (#1926)
Co-authored-by: Chen Shen <scv119@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-12-16 21:12:08 -08:00
Roy
eed74a558f Simplify weight loading logic (#2133) 2023-12-16 12:41:23 -08:00
2acd76f346 [ROCm] Temporarily remove GPTQ ROCm support (#2138) 2023-12-15 17:13:58 -08:00
b81a6a6bb3 [Docs] Add supported quantization methods to docs (#2135) 2023-12-15 13:29:22 -08:00
0fbfc4b81b Add GPTQ support (#916) 2023-12-15 03:04:22 -08:00
c06170cc8e Add a flag to include stop string in output text (#1976) 2023-12-15 00:45:58 -08:00
614856da25 Avoid multiple redefinition (#1817) 2023-12-14 09:35:58 -08:00
05bdf4eaf3 Fix Dockerfile.rocm (#2101)
Co-authored-by: miloice <jeffaw99@hotmail.com>
2023-12-14 00:45:58 -08:00
6774bd50b0 Fix typing in AsyncLLMEngine & add toml to requirements-dev (#2100) 2023-12-14 00:19:41 -08:00
31c1f3255e Bump up to v0.2.5 (#2095) 2023-12-13 23:56:15 -08:00
21d93c140d Optimize Mixtral with expert parallelism (#2090) 2023-12-13 23:55:07 -08:00
f1c8520146 [BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00
096827c284 [Docs] Add notes on ROCm-supported models (#2087) 2023-12-13 09:45:34 -08:00
6565d9e33e Update installation instruction for vLLM + CUDA 11.8 (#2086) 2023-12-13 09:25:59 -08:00
f375ec8440 [ROCm] Upgrade xformers version for ROCm & update doc (#2079)
Co-authored-by: miloice <jeffaw99@hotmail.com>
2023-12-13 00:56:05 -08:00
518369d78c Implement lazy model loader (#2044) 2023-12-12 22:21:45 -08:00
30bad5c492 Fix peak memory profiling (#2031) 2023-12-12 22:01:53 -08:00
3fefe271ec Update Dockerfile to build Megablocks (#2042) 2023-12-12 17:34:17 -08:00
6428f1d051 Support MPT with GQA (#1938)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-12-12 10:16:05 -08:00
7e1b21daac Remove einops from requirements (#2049) 2023-12-12 09:34:09 -08:00
cb3f30c600 Upgrade transformers version to 4.36.0 (#2046) 2023-12-11 18:39:14 -08:00
f3e024bece [CI/CD] Upgrade PyTorch version to v2.1.1 (#2045) 2023-12-11 17:48:11 -08:00
31d2ab4aff Remove python 3.10 requirement (#2040) 2023-12-11 12:26:42 -08:00
eb17212858 Update Dockerfile to support Mixtral (#2027) 2023-12-11 11:59:08 -08:00
4dd4b5c538 Bump up to v0.2.4 (#2034) 2023-12-11 11:49:39 -08:00
6120e5aaea Fix import error msg for megablocks (#2038) 2023-12-11 11:40:56 -08:00
Ram
2eaa81b236 Update README.md to add megablocks requirement for mixtral (#2033) 2023-12-11 11:37:34 -08:00
81ce2a4b26 [Minor] Fix type annotation in Mixtral (#2036) 2023-12-11 11:32:39 -08:00
5dd80d3777 Fix latency benchmark script (#2035) 2023-12-11 11:19:08 -08:00
beeee69bc9 Revert adding Megablocks (#2030) 2023-12-11 10:49:00 -08:00
Ram
9bf28d0b69 Update requirements.txt for mixtral (#2029) 2023-12-11 10:39:29 -08:00
c0ce15dfb2 Update run_on_sky.rst (#2025)
sharable -> shareable
2023-12-11 10:32:58 -08:00
b9bcdc7158 Change the load format to pt for Mixtral (#2028) 2023-12-11 10:32:17 -08:00
4ff0203987 Minor fixes for Mixtral (#2015) 2023-12-11 09:16:15 -08:00
b5f882cc98 Mixtral 8x7B support (#2011)
Co-authored-by: Pierre Stock <p@mistral.ai>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-12-11 01:09:15 -08:00
2e8fc0d4c3 Fix completion API echo and logprob combo (#1992) 2023-12-10 13:20:30 -08:00
wbn
dacaf5a400 Replace head_mapping params with num_kv_heads to attention kernel. (#1997)
Co-authored-by: wangguoya <wangguoya@baidu.com>
Co-authored-by: Yang Zhao <zhaoyangstar@foxmail.com>
2023-12-10 10:12:53 -08:00
24cde76a15 [Minor] Add comment on skipping rope caches (#2004) 2023-12-10 10:04:12 -08:00
1aa1361510 Fix OpenAI server completion_tokens referenced before assignment (#1996) 2023-12-09 21:01:21 -08:00
fe470ae5ad [Minor] Fix code style for baichuan (#2003) 2023-12-09 19:24:29 -08:00
3a8c2381f7 Fix for KeyError on Loading LLaMA (#1978) 2023-12-09 15:59:57 -08:00
c85b80c2b6 [Docker] Add cuda arch list as build option (#1950) 2023-12-08 09:53:47 -08:00
2b981012a6 Fix Baichuan2-7B-Chat (#1987) 2023-12-08 09:38:36 -08:00
6ccc0bfffb Merge EmbeddedLLM/vllm-rocm into vLLM main (#1836)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Amir Balwel <amoooori04@gmail.com>
Co-authored-by: root <kuanfu.liu@akirakan.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com>
Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com>
2023-12-07 23:16:52 -08:00
c8e7eb1eb3 fix typo in getenv call (#1972) 2023-12-07 16:04:41 -08:00
24f60a54f4 [Docker] Adding number of nvcc_threads during build as envar (#1893) 2023-12-07 11:00:32 -08:00
42c02f5892 Fix quickstart.rst typo jinja (#1964) 2023-12-07 08:34:44 -08:00
ebede26ebf Make InternLM follow rope_scaling in config.json (#1956)
Co-authored-by: lijie8 <lijie8@sensetime.com>
2023-12-07 08:32:08 -08:00
d940ce497e Fix typo in adding_model.rst (#1947)
adpated -> adapted
2023-12-06 10:04:26 -08:00
05ff90b692 Save pytorch profiler output for latency benchmark (#1871)
* Save profiler output

* Apply feedback from code review
2023-12-05 20:55:55 -08:00
1d9b737e05 Support ChatGLMForConditionalGeneration (#1932)
Co-authored-by: shujunhua1 <shujunhua1@jd.com>
2023-12-05 10:52:48 -08:00
Roy
60dc62dc9e add custom server params (#1868) 2023-12-03 12:59:18 -08:00
0f90effc66 Bump up to v0.2.3 (#1903) 2023-12-03 12:27:47 -08:00
464dd985e3 Fix num_gpus when TP > 1 (#1852) 2023-12-03 12:24:30 -08:00
c07a442854 chore(examples-docs): upgrade to OpenAI V1 (#1785) 2023-12-03 01:11:22 -08:00
cd3aa153a4 Fix broken worker test (#1900) 2023-12-02 22:17:33 -08:00
9b294976a2 Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00
5313c2cb8b Add Production Metrics in Prometheus format (#1890) 2023-12-02 16:37:44 -08:00
5f09cbdb63 Fix broken sampler tests (#1896)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-12-02 16:06:17 -08:00
4cefa9b49b [Docs] Update the AWQ documentation to highlight performance issue (#1883) 2023-12-02 15:52:47 -08:00
f86bd6190a Fix the typo in SamplingParams' docstring (#1886) 2023-12-01 02:06:36 -08:00
e5452ddfd6 Normalize head weights for Baichuan 2 (#1876) 2023-11-30 20:03:58 -08:00
d06980dfa7 Fix Baichuan tokenizer error (#1874) 2023-11-30 18:35:50 -08:00
66785cc05c Support chat template and echo for chat API (#1756) 2023-11-30 16:43:13 -08:00
05a38612b0 docs: add instruction for langchain (#1162) 2023-11-30 10:57:44 -08:00
Roy
d27f4bae39 Fix rope cache key error (#1867) 2023-11-30 08:29:28 -08:00
8d8c2f6ffe Support max-model-len argument for throughput benchmark (#1858) 2023-11-30 08:10:24 -08:00
51d3cb951d Remove max_num_seqs in latency benchmark script (#1855) 2023-11-30 00:00:32 -08:00
e74b1736a1 Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00
f07c1ceaa5 [FIX] Fix docker build error (#1831) (#1832)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-11-29 23:06:50 -08:00
63b2206ad0 Avoid multiple instantiations of the RoPE class (#1828) 2023-11-29 23:06:27 -08:00
27feead2f8 Refactor Worker & InputMetadata (#1843) 2023-11-29 22:16:37 -08:00
c782195662 Disable Logs Requests should Disable Logging of requests. (#1779)
Co-authored-by: Michael McCulloch <mjm.gitlab@fastmail.com>
2023-11-29 21:50:02 -08:00
0f621c2c7d [Docs] Add information about using shared memory in docker (#1845) 2023-11-29 18:33:56 -08:00
a9e4574261 Refactor Attention (#1840) 2023-11-29 15:37:31 -08:00
0229c386c5 Better integration with Ray Serve (#1821)
Co-authored-by: FlorianJoncour <florian@zetta-sys.com>
2023-11-29 13:25:43 -08:00
a7b3e33078 [Fix] Fix RoPE in ChatGLM-32K (#1841) 2023-11-29 13:01:19 -08:00
e19a64c7ef [FIX] Fix formatting error in main branch (#1822) 2023-11-28 16:56:43 -08:00
1cb4ad8de9 [FIX] Fix formatting error 2023-11-29 00:40:19 +00:00
6ed068a71a Use the type BlockTable (#1791) 2023-11-28 16:34:05 -08:00
708e6c18b0 [FIX] Fix class naming (#1803) 2023-11-28 14:08:01 -08:00
b943890484 Fix OPT param names (#1819) 2023-11-28 11:22:44 -08:00
a1125ad4df Correct comments in parallel_state.py (#1818) 2023-11-28 10:19:35 -08:00
a8b150c595 Init model on GPU to reduce CPU memory footprint (#1796) 2023-11-27 11:18:26 -08:00
665cbcec4b Added echo function to OpenAI API server. (#1504) 2023-11-26 21:29:17 -08:00
7c600440f7 Fix model docstrings (#1764) 2023-11-23 23:04:44 -08:00
e0c6f556e8 [Build] Avoid building too many extensions (#1624) 2023-11-23 16:31:19 -08:00
de23687d16 Fix repetition penalty aligned with huggingface (#1577) 2023-11-22 14:41:44 -08:00
4cea74c73b Set top_p=0 and top_k=-1 in greedy sampling (#1748) 2023-11-22 12:51:09 -08:00
a921d8be9d [DOCS] Add engine args documentation (#1741) 2023-11-22 12:31:27 -08:00
094f716bf2 Add stop_token_ids in SamplingParams.__repr__ (#1745) 2023-11-21 20:13:53 -08:00
7d761fe3c1 [FIX] Fix the case when input_is_parallel=False for ScaledActivation (#1737) 2023-11-20 23:56:48 -08:00
cf35d8f3d7 [BugFix] Fix TP support for AWQ (#1731) 2023-11-20 21:42:45 -08:00
4bb6b67188 fix RAM OOM when load large models in tensor parallel mode. (#1395)
Co-authored-by: ran_lin <rlin@thoughtworks.com>
2023-11-20 19:02:42 -08:00
819b18e7ba Rewrite torch.repeat_interleave to remove cpu synchronization (#1599) 2023-11-20 17:46:32 -08:00
19849db573 [Fix] Fix bugs in scheduler (#1727) 2023-11-20 16:10:50 -08:00
3d4ceb292c Fix hanging in the scheduler caused by long prompts (#1534) 2023-11-20 16:06:49 -08:00
f5a37c6c6c [BugFix] Fix a bug in loading safetensors (#1732) 2023-11-20 15:51:18 -08:00
32c927b53f [FIX] Update the doc link in README.md (#1730) 2023-11-20 12:46:24 -08:00
5ffc0d13a2 Migrate linter from pylint to ruff (#1665) 2023-11-20 11:58:01 -08:00
112627e8b2 [Docs] Fix the code block's format in deploying_with_docker page (#1722) 2023-11-20 01:22:39 -08:00
37c1e3c218 Documentation about official docker image (#1709) 2023-11-19 20:56:26 -08:00
06e9ebebd5 Add instructions to install vLLM+cu118 (#1717) 2023-11-18 23:48:58 -08:00
c5f7740d89 Bump up to v0.2.2 (#1689) 2023-11-18 21:57:07 -08:00
be66d9b125 Fix warning msg on quantization (#1715) 2023-11-18 21:49:55 -08:00
e1054247ba [Optimization] Implement fused add rmsnorm (#1667) 2023-11-18 18:18:02 -08:00
8d17774f92 Add AWQ support for all models (#1714) 2023-11-18 17:56:47 -08:00
e946260cf3 use get_tensor in safe_open (#1696) 2023-11-18 16:45:18 -08:00
edb305584b Support download models from www.modelscope.cn (#1588) 2023-11-17 20:38:31 -08:00
bb00f66e19 Use quantization_config in hf config (#1695) 2023-11-17 16:23:49 -08:00
Roy
e87557b069 Support Min P Sampler (#1642) 2023-11-17 16:20:49 -08:00
dcc543a298 [Minor] Fix comment (#1704) 2023-11-17 09:42:49 -08:00
0fc280b06c Update the adding-model doc according to the new refactor (#1692) 2023-11-16 18:46:26 -08:00
20d0699d49 [Fix] Fix comm test (#1691) 2023-11-16 16:28:39 -08:00
686f5e3210 Return usage for openai streaming requests (#1663) 2023-11-16 15:28:36 -08:00
415d109527 [Fix] Update Supported Models List (#1690) 2023-11-16 14:47:26 -08:00
521b35f799 Support Microsoft Phi 1.5 (#1664) 2023-11-16 14:28:39 -08:00
cb08cd0d75 [Minor] Fix duplication of ignored seq group in engine step (#1666) 2023-11-16 13:11:41 -08:00
2a2c135b41 Fix loading error when safetensors contains empty tensor (#1687) 2023-11-16 10:38:10 -08:00
65ea2ddf17 feat(config): support parsing torch.dtype (#1641)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
2023-11-16 01:31:06 -08:00
b514d3c496 Revert MptConfig to MPTConfig (#1668) 2023-11-16 01:19:39 -08:00
7076fa1c9f TP/quantization/weight loading refactor part 2 - Refactor quantized linear logic and extend quantization support to all models (#1622)
Refactor the tensor parallelism, quantization, and weight-loading codes.

Summary of the new features enabled by this PR:
- **All models** are able to be quantized with AWQ and SqueezeLLM, and [soon GPTQ](https://github.com/vllm-project/vllm/pull/1580).
- Model loading code became much simpler.
- Support model parallelism for all MQA/GQA models when the number of key/value heads is smaller than the tensor parallel size.
2023-11-15 22:50:41 -08:00
660a7fcfa4 Add DeepSpeed MII backend to benchmark script (#1649) 2023-11-14 12:35:30 -08:00
054072bee5 [Minor] Move RoPE selection logic to get_rope (#1633) 2023-11-12 16:04:50 -08:00
eb825c1e74 Fix #1474 - AssertionError:assert param_slice.shape == loaded_weight.shape (#1631) 2023-11-12 15:53:12 -08:00
1b290ace4f Run default _AsyncLLMEngine._run_workers_async in threadpool (#1628) 2023-11-11 14:50:44 -08:00
Sin
0d578228ca config parser: add ChatGLM2 seq_length to _get_and_verify_max_len (#1617) 2023-11-09 19:29:51 -08:00
aebfcb262a Dockerfile: Upgrade Cuda to 12.1 (#1609) 2023-11-09 11:49:02 -08:00
ab9e8488d5 Add Yi model to quantization support (#1600) 2023-11-09 11:47:14 -08:00
fd58b73a40 Build CUDA11.8 wheels for release (#1596) 2023-11-09 03:52:29 -08:00
8efe23f150 Fix input_metadata.selected_token_indices in worker prepare_inputs (#1546) 2023-11-08 14:19:12 -08:00
06458a0b42 Upgrade to CUDA 12 (#1527)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-11-08 14:17:49 -08:00
1a2bbc9301 ChatGLM Support (#1261) 2023-11-06 16:09:33 -08:00
Roy
e7f579eb97 Support Yi model (#1567) 2023-11-06 15:26:03 -08:00
8516999495 Add Quantization and AutoAWQ to docs (#1235) 2023-11-04 22:43:39 -07:00
9f669a9a7c Support YaRN models (#1264)
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Viktor Ferenczi <viktor@ferenczi.eu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-11-03 14:12:48 -07:00
555bdcc5a3 Added logits processor API to sampling params (#1469) 2023-11-03 14:12:15 -07:00
54ca1ba71d docs: add description (#1553) 2023-11-03 09:14:52 -07:00
9738b84a08 Force paged attention v2 for long contexts (#1510) 2023-11-01 16:24:32 -07:00
1fe0990023 Remove MPTConfig (#1529) 2023-11-01 15:29:05 -07:00
7e90a2d117 Add /health Endpoint for both Servers (#1540) 2023-11-01 10:29:44 -07:00
5687d584fe [BugFix] Set engine_use_ray=True when TP>1 (#1531) 2023-11-01 02:14:18 -07:00
cf8849f2d6 Add MptForCausalLM key in model_loader (#1526) 2023-10-31 15:46:53 -07:00
e575df33b1 [Small] Formatter only checks lints in changed files (#1528) 2023-10-31 15:39:38 -07:00
0ce8647dc5 Fix integer overflows in attention & cache ops (#1514) 2023-10-31 15:19:30 -07:00
9cabcb7645 Add Dockerfile (#1350) 2023-10-31 12:36:47 -07:00
7b895c5976 [Fix] Fix duplicated logging messages (#1524) 2023-10-31 09:04:47 -07:00
7013a80170 Add support for spaces_between_special_tokens 2023-10-30 16:52:56 -07:00
79a30912b8 Add py.typed so consumers of vLLM can get type checking (#1509)
* Add py.typed so consumers of vLLM can get type checking

* Update py.typed

---------
Co-authored-by: aarnphm <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-30 14:50:47 -07:00
2f3d36a8a1 Fix logging so we actually get info level entries in the log. (#1494) 2023-10-30 10:02:21 -07:00
ac8d36f3e5 Refactor LLMEngine demo script for clarity and modularity (#1413)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-30 09:14:37 -07:00
15f5632365 Delay GPU->CPU sync in sampling (#1337) 2023-10-30 09:01:34 -07:00
aa9af07cac Fix bias in InternLM (#1501) 2023-10-29 16:24:18 -07:00
69be658bba Support repetition_penalty (#1424) 2023-10-29 10:02:41 -07:00
beac8dd461 fix: don't skip first special token. (#1497) 2023-10-29 04:26:36 -07:00
28b47d1e49 Add rope_scaling to Aquila model (#1457) 2023-10-29 04:25:21 -07:00
1f24755bf8 Support SqueezeLLM (#1326)
Co-authored-by: squeeze-ai-lab <squeezeailab.bair@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-10-21 23:14:59 -07:00
bf31d3606a Pin pydantic dependency versions (#1429) 2023-10-21 11:18:58 -07:00
d189170b6c remove useless statements (#1408) 2023-10-20 08:52:07 -07:00
f61dc8072f Fix type hints (#1427) 2023-10-20 08:50:47 -07:00
f8a1e39fae [BugFix] Define __eq__ in SequenceGroupOutputs (#1389) 2023-10-17 01:09:44 -07:00
a132435204 Fix typo (#1383) 2023-10-16 21:53:37 -07:00
9524867701 Add Mistral 7B to test_models (#1366) 2023-10-16 17:49:54 -07:00
c1376e0f82 Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00
651c614aa4 Bump up the version to v0.2.1 (#1355) 2023-10-16 12:58:57 -07:00
d3a5bd9fb7 Fix sampler test (#1379) 2023-10-16 12:57:26 -07:00
e8ef4c0820 Fix PyTorch index URL in workflow (#1378) 2023-10-16 12:37:56 -07:00
348897af31 Fix PyTorch version to 2.0.1 in workflow (#1377) 2023-10-16 11:27:17 -07:00
9d9072a069 Implement prompt logprobs & Batched topk for computing logprobs (#1328)
Co-authored-by: Yunmo Chen <16273544+wanmok@users.noreply.github.com>
2023-10-16 10:56:50 -07:00
928de46888 Implement PagedAttention V2 (#1348) 2023-10-16 00:59:57 -07:00
29678cd213 Minor fix on AWQ kernel launch (#1356) 2023-10-15 21:53:56 -07:00
d0740dff1b Fix error message on TORCH_CUDA_ARCH_LIST (#1239)
Co-authored-by: Yunfeng Bai <yunfeng.bai@scale.com>
2023-10-14 14:47:43 -07:00
de89472897 Fix the issue for AquilaChat2-* models (#1339) 2023-10-13 11:51:29 -07:00
e7c8555d06 Bump up transformers version & Remove MistralConfig (#1254) 2023-10-13 10:05:26 -07:00
ec3b5ce9cc Improve detokenization performance (#1338) 2023-10-13 09:59:07 -07:00
6368e777a8 Add Aquila2 to README (#1331)
Signed-off-by: ldwang <ftgreat@gmail.com>
Co-authored-by: ldwang <ftgreat@gmail.com>
2023-10-12 12:11:16 -07:00
875afe38ab Add blacklist in model checkpoint (#1325) 2023-10-12 01:05:37 -07:00
ee8217e5be Add Mistral to quantization model list (#1278) 2023-10-11 00:26:24 -07:00
980dd4a2c4 Fix overflow in awq kernel (#1295)
Co-authored-by: 楚天翔 <tianxiang.ctx@alibaba-inc.com>
2023-10-11 00:19:53 -07:00
8285736840 workaround of AWQ for Turing GPUs (#1252) 2023-10-10 19:48:16 -07:00
91fce82c6f change the timing of sorting logits (#1309) 2023-10-10 19:37:42 -07:00
ac5cf86aa6 Fix __repr__ of SequenceOutputs (#1311) 2023-10-10 09:58:28 -07:00
6a6119554c lock torch version to 2.0.1 (#1290) 2023-10-10 09:21:57 -07:00
b95ee898fe [Minor] Fix comment in mistral.py (#1303) 2023-10-09 19:44:37 -07:00
9eed4d1f3e Update README.md (#1292) 2023-10-08 23:15:50 -07:00
6b5296aa3a [FIX] Explain why the finished_reason of ignored sequences are length (#1289) 2023-10-08 15:22:38 -07:00
ee92b58b3a Move bfloat16 check to worker (#1259) 2023-10-07 22:10:44 -07:00
09ff7f106a API server support ipv4 / ipv6 dualstack (#1288)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-07 15:15:54 -07:00
acbed3ef40 Use monotonic time where appropriate (#1249) 2023-10-02 19:22:05 -07:00
66d18a7fb0 add support for tokenizer revision (#1163)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-10-02 19:19:46 -07:00
ba0bfd40e2 TP/quantization/weight loading refactor part 1 - Simplify parallel linear logic (#1181) 2023-10-02 15:36:09 -07:00
84e4e37d14 [Minor] Fix type annotations (#1238) 2023-10-02 15:28:31 -07:00
a60b353005 support sharding llama2-70b on more than 8 GPUs (#1209)
Co-authored-by: JiCheng <247153481@qq.com>
2023-10-02 15:26:33 -07:00
ebe4d1db3a Fix boundary check in paged attention kernel (#1241) 2023-10-01 11:35:06 -07:00
b5a10eb0ef Added dtype arg to benchmarks (#1228) 2023-09-30 21:04:03 -07:00
0967102c6d fixing typo in tiiuae/falcon-rw-7b model name (#1226) 2023-09-29 13:40:25 -07:00
e2fb71ec9f Bump up the version to v0.2.0 (#1212) 2023-09-28 15:30:38 -07:00
f936657eb6 Provide default max model length (#1224) 2023-09-28 14:44:02 -07:00
6f88f762bf Fix OOM in attention kernel test (#1223) 2023-09-28 14:33:24 -07:00
202351d5bf Add Mistral to supported model list (#1221) 2023-09-28 14:33:04 -07:00
2e8e49fce3 [Fix] Remove false assertion (#1222) 2023-09-28 10:52:38 -07:00
a8e98aee0c Fix Mistral model (#1220) 2023-09-28 10:44:05 -07:00
bb1ba58f06 [Mistral] Mistral-7B-v0.1 support (#1196)
Co-authored-by: timlacroix <t@mistral.ai>
2023-09-28 10:41:03 -07:00
7bedab5748 Add rope_scaling to Qwen (#1210) 2023-09-28 00:49:23 -07:00
20f7cc4cde Add skip_special_tokens sampling params (#1186) 2023-09-27 19:21:42 -07:00
649aa730c5 Use standard extras for uvicorn (#1166) 2023-09-27 17:41:36 -07:00
a19bc5c628 Automatically configure max_num_batched_tokens (#1198) 2023-09-27 16:34:00 -07:00
28e616c4e3 fix qwen-14b model (#1173) 2023-09-27 16:33:16 -07:00
30e775281d fix typo (#1184)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-09-27 16:22:45 -07:00
21877b0d75 Support Longchat and RoPE scaling (#555)
Co-authored-by: Wing Lian <wing.lian@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2023-09-27 03:36:02 -07:00
cf5cb1e33e Allocate more shared memory to attention kernel (#1154) 2023-09-26 22:27:13 -07:00
03ffd0a022 Add comments on RoPE initialization (#1176) 2023-09-26 10:48:33 -07:00
a425bd9a9a [Setup] Enable TORCH_CUDA_ARCH_LIST for selecting target GPUs (#1074) 2023-09-26 10:21:08 -07:00
bbbf86565f Align max_tokens behavior with openai (#852) 2023-09-23 18:10:13 -07:00
9f6be8692e Fix config for Falcon (#1164) 2023-09-23 17:38:43 -07:00
f187877945 [FIX] Simplify sampler logic (#1156) 2023-09-23 17:21:56 -07:00
947b794146 [Sampler] Vectorized sampling (simplified) (#1048)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2023-09-22 17:48:04 -07:00
8d926e91f1 Announce the First vLLM Meetup (#1148) 2023-09-22 11:37:14 -07:00
4ee52bb169 Docs: Fix broken link to openai example (#1145)
Link to `openai_client.py` is no longer valid - updated to `openai_completion_client.py`
2023-09-22 11:36:09 -07:00
7d7e3b78a3 Use --ipc=host in docker run for distributed inference (#1125) 2023-09-21 18:26:47 -07:00
f98b745a81 feat: support stop_token_ids parameter. (#1097) 2023-09-21 15:34:02 -07:00
Roy
2d1e86f1b1 clean api code, remove redundant background task. (#1102) 2023-09-21 13:25:05 -07:00
1ac4ccf73c Add float16 and float32 (#1115) 2023-09-21 00:52:47 -07:00
2ac4d5e2bf Replace DtypeTensor (#1123) 2023-09-21 00:51:47 -07:00
3302f0aef3 rope_theta and max_position_embeddings from config (#1096)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: wnma3mz <wnma3mz@gmail.com>
2023-09-20 13:35:11 -07:00
6f2dd6c37e Add documentation to Triton server tutorial (#983) 2023-09-20 10:32:40 -07:00
bc0644574c Add gpu_memory_utilization and swap_space to LLM (#1090) 2023-09-19 22:16:04 -07:00
400b8289f7 Add pyarrow to dependencies & Print warning on Ray import error (#1094) 2023-09-18 22:36:17 -07:00
c1026311b5 [Community] Add vLLM Discord server (#1086) 2023-09-18 12:23:35 -07:00
2b1c116b5a Add minimum capability requirement for AWQ (#1064) 2023-09-18 12:02:01 -07:00
cc796b1358 Convert before transpose (#1073) 2023-09-18 11:51:48 -07:00
f029ef94d7 Fix get_max_num_running_seqs for waiting and swapped seq groups (#1068) 2023-09-18 11:49:40 -07:00
Roy
95592fa00a align llm_engine and async_engine. (#1081) 2023-09-18 11:49:10 -07:00
fbe66e1d0b added support for quantize on LLM module (#1080) 2023-09-18 11:04:21 -07:00
90979c38f8 [FIX] Don't initialize parameter by default (#1067) 2023-09-17 17:15:38 -07:00
e21d7687a9 Fix hanging when prompt exceeds limit (#1029) 2023-09-17 01:48:56 -07:00
ff36139ffc Remove AsyncLLMEngine busy loop, shield background task (#1059) 2023-09-17 00:29:08 -07:00
e3e79e9e8a Implement AWQ quantization support for LLaMA (#1032)
Co-authored-by: Robert Irvine <robert@seamlessml.com>
Co-authored-by: root <rirv938@gmail.com>
Co-authored-by: Casper <casperbh.96@gmail.com>
Co-authored-by: julian-q <julianhquevedo@gmail.com>
2023-09-16 00:03:37 -07:00
b9fe4616f9 Abort when coroutine is cancelled (#1020) 2023-09-14 17:40:18 -07:00
64ca424e75 Fix warning message on LLaMA FastTokenizer (#1037) 2023-09-14 17:33:32 -07:00
b5f93d0631 Only fail if logit_bias has actual values (#1045) 2023-09-14 17:33:01 -07:00
a58936966f Add pandas to requirements.txt (#1047)
* Add pandas to requirements.txt

* Minor
2023-09-14 17:31:38 -07:00
dd54a4b026 Fix detokenization leaving special tokens (#1044)
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
2023-09-14 16:37:03 -07:00
eda1a7cad3 Announce paper release (#1036) 2023-09-13 17:38:13 -07:00
f04908cae7 [FIX] Minor bug fixes (#1035)
* [FIX] Minor bug fixes

* Address review comments
2023-09-13 16:38:12 -07:00
ab019eea75 Add Model Revision Support (#1014)
Co-authored-by: Jasmond Loh <Jasmond.Loh@hotmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-09-13 15:20:02 -07:00
9841d48a10 Use TGI-like incremental detokenization (#984) 2023-09-13 13:38:01 -07:00
3272d7a0b7 Fix typo in README.md (#1033) 2023-09-13 12:55:23 -07:00
0bb1e885a0 Make max_model_len configurable (#972) 2023-09-12 16:29:19 -07:00
d6545ad22e add option to shorten prompt print in log (#991)
Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-09-12 15:10:14 -07:00
90eb3f43ca Bump up the version to v0.1.7 (#1013) 2023-09-11 00:54:30 -07:00
e67b4f2c2a Use FP32 in RoPE initialization (#1004)
Co-authored-by: One <imone@tuta.io>
2023-09-11 00:26:35 -07:00
d6770d1f23 Update setup.py (#1006) 2023-09-10 23:42:45 -07:00
b9cecc2635 [Docs] Update installation page (#1005) 2023-09-10 14:23:31 -07:00
898285c9bf fix: CUDA error when inferencing with Falcon-40B base model (#992) 2023-09-10 01:39:02 -07:00
a62de9ecfd Fix wrong dtype in PagedAttentionWithALiBi bias (#996)
---------

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
2023-09-09 14:58:35 -07:00
4042d192f5 fix "tansformers_module" ModuleNotFoundError when load model with trust_remote_code=True (#871) 2023-09-08 17:21:30 -07:00
1117aa1411 Bump up the version to v0.1.6 (#989) 2023-09-08 00:07:46 -07:00
080438477f Start background task in AsyncLLMEngine.generate (#988)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-09-08 00:03:39 -07:00
4b5bcf8906 faster startup of vLLM (#982)
* update

---------

Co-authored-by: Robert Irvine <robert@seamlessml.com>
2023-09-08 14:48:54 +09:00
852ef5b4f5 Bump up the version to v0.1.5 (#944) 2023-09-07 16:15:31 -07:00
db09d4ad83 [FIX] Fix Alibi implementation in PagedAttention kernel (#945)
* [FIX] Fix Alibi implementation in PagedAttention kernel

* Fix test_attention

* Fix

---------

Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Oliver-ss <yuansongwx@outlook.com>
2023-09-07 15:53:14 -07:00
c957c741d9 Enable safetensors loading for all models (#974) 2023-09-07 15:49:52 -07:00
c07ece5ca4 Make AsyncLLMEngine more robust & fix batched abort (#969)
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Avnish Narayan <38871737+avnishn@users.noreply.github.com>
2023-09-07 13:43:45 -07:00
7a9c20c715 Bum up transformers version (#976) 2023-09-07 13:15:53 -07:00
005ba458b5 Set torch default dtype in a context manager (#971)
Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
2023-09-07 15:39:37 +09:00
320a622ec4 [BugFix] Implement RoPE for GPT-J (#941) 2023-09-06 11:54:33 +09:00
c9927c1a6a Use queue for finished requests (#957) 2023-09-05 19:27:23 -07:00
fbd80ad409 Clean up kernel unit tests (#938) 2023-09-05 16:57:38 -07:00
22379d5513 fix: typo (#948) 2023-09-04 23:22:30 -07:00
1696725879 Initialize AsyncLLMEngine bg loop correctly (#943) 2023-09-04 17:41:22 -07:00
002800f081 Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00
e15932bb60 Only emit warning about internal tokenizer if it isn't being used (#939) 2023-09-05 00:50:55 +09:00
ce741ba3e4 Refactor AsyncLLMEngine (#880) 2023-09-03 21:43:43 -07:00
bf87484efa [BugFix] Fix NaN errors in paged attention kernel (#936) 2023-09-04 09:20:06 +09:00
8ce9c50d40 Avoid compiling kernels for double data type (#933) 2023-09-02 14:59:47 +09:00
32b6816e55 Add tests for models (#922) 2023-09-01 11:19:43 +09:00
c128d69856 Fix README.md Link (#927) 2023-08-31 17:18:34 -07:00
55b28b1eee [Docs] Minor fixes in supported models (#920)
* Minor fix in supported models

* Add another small fix for Aquila model

---------

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-08-31 16:28:39 -07:00
e11222333f fix: bug fix when penalties are negative (#913)
Co-authored-by: dongyong-lee <dongyong.lee@navercorp.com>
2023-09-01 00:37:17 +09:00
28873a2799 Improve _prune_hidden_states micro-benchmark (#707) 2023-08-31 13:28:43 +09:00
0080d8329d Add acknowledgement to a16z grant 2023-08-30 02:26:47 -07:00
0d93f15694 Accelerate LLaMA model loading (#234) 2023-08-30 01:00:13 -07:00
becd7a56f1 Enable request body OpenAPI spec for OpenAI endpoints (#865) 2023-08-29 21:54:08 -07:00
75471386de use flash-attn via xformers (#877) 2023-08-29 21:52:13 -07:00
d2b2eed67c [Fix] Fix a condition for ignored sequences (#867) 2023-08-27 23:00:56 -07:00
4b6f069b6f Add support for CodeLlama (#854) 2023-08-25 12:44:07 -07:00
791d79de32 Bump up the version to v0.1.4 (#846) 2023-08-25 12:28:00 +09:00
94d2f59895 Set replacement=True in torch.multinomial (#858) 2023-08-25 12:22:01 +09:00
75c0ca9d43 Clean up code (#844) 2023-08-23 16:44:15 -07:00
2a4ec90854 Fix for breaking changes in xformers 0.0.21 (#834) 2023-08-23 17:44:21 +09:00
85ebcda94d Fix typo of Aquila in README.md (#836) 2023-08-22 20:48:36 -07:00
d64bf1646c Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00
a41c20435e Add compute capability 8.9 to default targets (#829) 2023-08-23 07:28:38 +09:00
eedac9dba0 fix: revert code to avoid no attribute problem (#827) 2023-08-22 11:55:16 -07:00
14f9c72bfd Update Supported Model List (#825) 2023-08-22 11:51:44 -07:00
ad5f2fe34c Add support for aquila (#663)
* add aquila

Signed-off-by: ftgreat <ftgreat@163.com>

* fix some bug

Signed-off-by: shunxing1234 <xw747777271@gmail.com>

* delete pdb

Signed-off-by: shunxing1234 <xw747777271@gmail.com>

* fix bugs

Signed-off-by: shunxing1234 <xw747777271@gmail.com>

* fix bugs

Signed-off-by: shunxing1234 <xw747777271@gmail.com>

* delete whitespace

Signed-off-by: shunxing1234 <xw747777271@gmail.com>

* format

* fix order

---------

Signed-off-by: ftgreat <ftgreat@163.com>
Signed-off-by: shunxing1234 <xw747777271@gmail.com>
Co-authored-by: ftgreat <ftgreat@163.com>
2023-08-22 00:13:36 -07:00
4f8584756d Fix mqa is false case in gpt_bigcode (#806) 2023-08-21 22:22:06 -07:00
65fc1c3127 set default coompute capability according to cuda version (#773) 2023-08-21 16:05:44 -07:00
c393af6cd7 [Feature | CI] Added a github action to build wheels (#746) 2023-08-21 16:59:15 +09:00
0c04ce3234 Fix typo in sampling_params.py (#788) 2023-08-18 10:12:46 +09:00
73b3de79ea explicitly del state (#784) 2023-08-17 12:56:04 -07:00
d1744376ae Align with huggingface Top K sampling (#753) 2023-08-15 16:44:33 -07:00
805de738f6 Fix typo in tokenizer.py (#750)
conjuction -> conjunction
2023-08-14 22:26:36 -07:00
1b151ed181 Fix baichuan doc style (#748) 2023-08-13 20:57:31 -07:00
e06f504a76 Supports tokens and arrays of tokens as inputs to the OpenAI completion API (#715) 2023-08-11 12:14:34 -07:00
WRH
462ae5220a [Fix] unwantted bias in InternLM Model (#740) 2023-08-11 11:40:37 -07:00
66c54aa9c3 Check the max prompt length for the OpenAI completions API (#472) 2023-08-08 17:43:49 -07:00
735ecfff61 add internlm model (#528) 2023-08-08 16:35:06 -07:00
a57d13cc96 add QWen-7b (#685)
Co-authored-by: wq.chu <wq.chu@tianrang-inc.com>
2023-08-08 13:50:38 -07:00
79af7e96a0 [OPTIMIZATION] Optimizes the single_query_cached_kv_attention kernel (#420) 2023-08-04 10:57:29 -07:00
621980bdc0 fix: incorrect bigcode attention heads num (#676) 2023-08-04 10:35:22 -07:00
aa84c92ef6 Bump up version to 0.1.3 (#657) 2023-08-02 16:46:53 -07:00
f7389f4763 [Doc] Add Baichuan 13B to supported models (#656) 2023-08-02 16:45:12 -07:00
55fe8a81ec Refactor scheduler (#658) 2023-08-02 16:42:01 -07:00
e8ddc08ec8 [BUG FIX] upgrade fschat version to 0.2.23 (#650)
Co-authored-by: hao.yu <hao.yu@cn-c017.server.mila.quebec>
2023-08-02 14:05:59 -07:00
1b0bd0fe8a Add Falcon support (new) (#592) 2023-08-02 14:04:39 -07:00
20044cab7a Fix log message in scheduler (#652) 2023-08-02 13:35:10 -07:00
64f23c2900 fix baichuan for different position embedding for 7b and 13b models (#643) 2023-08-01 22:22:51 -07:00
d4c7755ca8 fix biachuan-7b tp (#598)
Co-authored-by: wq.chu <wq.chu@tianrang-inc.com>
2023-08-01 15:41:36 -07:00
aa39e42c5a fix doc (#622) 2023-07-31 13:11:57 -07:00
953f28cf9a fix ModuleNotFoundError (#599)
Co-authored-by: fangli <fangli@tencent.com>
2023-07-29 20:52:41 -07:00
c0d00f5be6 [Fix] fix import error of RayWorker (#604) (#605) 2023-07-27 23:37:40 -07:00
58a072be15 [Fix] Add model sequence length into model config (#575) 2023-07-25 23:46:30 -07:00
82ad323dee [Fix] Add chat completion Example and simplify dependencies (#576) 2023-07-25 23:45:48 -07:00
df5dd3c68e Add Baichuan-7B to README (#494) 2023-07-25 15:25:12 -07:00
2d867b55fa fixed tensor parallel is not defined (#564) 2023-07-25 14:16:51 -07:00
d7a1c6d614 Fix paged attention testing. (#495)
Signed-off-by: Tao Peng <jiankeng.pt@alibaba-inc.com>
2023-07-24 21:01:56 -07:00
7d5a155e4a [Fix] Fix GPTBigcoder for distributed execution (#503) 2023-07-24 18:36:33 -07:00
1dde34e0f8 GPTJConfig has no attribute rotary. (#532) 2023-07-24 11:29:30 -07:00
6fc2a38b11 Add support for LLaMA-2 (#505) 2023-07-20 11:38:27 -07:00
c487a221ee Fix bad assert in initialize_cluster if PG already exists (#526) 2023-07-19 23:17:12 -07:00
9925c17940 Ray placement group support (#397) 2023-07-19 22:49:31 -07:00
8c4b2592fb fix: enable trust-remote-code in api server & benchmark. (#509) 2023-07-19 17:06:15 -07:00
WRH
cf21a9bd5c support trust_remote_code in benchmark (#518) 2023-07-19 17:02:40 -07:00
16c3e295a8 fix(ray_utils): ignore re-init error (#465) 2023-07-19 17:01:19 -07:00
bda41c70dd hotfix attn alibi wo head mapping (#496)
Co-authored-by: oliveryuan <oliveryuan@basemind.com>
2023-07-18 11:31:48 -07:00
453bafb96f Merge pull request #498 from MoeedDar/main
Fixed old name reference for max_seq_len
2023-07-18 09:22:56 -07:00
328d231c17 Fixed old name reference for max_seq_len 2023-07-18 16:47:59 +01:00
b4b195b360 fix max seq len (#489) 2023-07-17 23:20:20 -07:00
20b0d88d16 Add support for baichuan (#365) 2023-07-17 13:50:55 -07:00
2bdea7ac11 [Fix] Fix the condition of max_seq_len (#477) 2023-07-17 00:33:48 -04:00
58df2883cb [Doc] Add doc for running vLLM on the cloud (#426)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-07-16 13:37:14 -07:00
6d7d95a70a Offload port selection to OS (#467) 2023-07-15 23:11:02 -07:00
96853af5a8 Optimize MQA Kernel (#452) 2023-07-14 20:06:40 -04:00
dbed69058c Fix the KeyError when loading bloom-based models (#441) 2023-07-13 21:58:09 -07:00
7b6ae94059 add vocab padding for LLama(Support WizardLM) (#411) 2023-07-13 23:56:22 -04:00
c6dfc3cdbe Fix handling of special tokens in decoding. (#418) 2023-07-12 11:14:56 -04:00
51be365143 fix: freeze pydantic to v1 (#429) 2023-07-12 11:10:55 -04:00
c894836108 [Model] Add support for GPT-J (#226)
Co-authored-by: woWoosuk Kwon <woosuk.kwon@berkeley.edu>
2023-07-08 17:55:16 -07:00
75beba29b5 Don't try to load training_args.bin (#373) 2023-07-08 15:26:28 -07:00
ddfdf470ae Add trust_remote_code arg to get_config (#405) 2023-07-08 15:24:17 -07:00
b6fbb9a565 Sort the outputs before return (#402) 2023-07-08 14:48:18 -07:00
2179e4f4c5 avoid python list copy in sequence initialization (#401) 2023-07-08 12:42:08 -07:00
a945fcc2ae Add trust-remote-code flag to handle remote tokenizers (#364) 2023-07-07 11:04:58 -07:00
be54f8e5c4 [Fix] Change /generate response-type to json for non-streaming (#374) 2023-07-06 18:15:17 -07:00
b396cb4998 fix: only response [DONE] once when streaming response. (#378) 2023-07-06 18:08:40 -07:00
1c395b4eaa Bump up the version (#300) 2023-07-04 21:41:53 -07:00
3d64cf019e [Server] use fastchat.model.model_adapter.get_conversation_template method to get model template (#357) 2023-07-04 21:39:59 -07:00
98fe8cb542 [Server] Add option to specify chat template for chat endpoint (#345) 2023-07-03 23:01:56 -07:00
ffa6d2f9f9 [Docs] Fix typo (#346) 2023-07-03 16:51:47 -07:00
404422f42e [Model] Add support for MPT (#334) 2023-07-03 16:47:53 -07:00
7717d0838b Fix an endless loop issue when engine_step throws a RuntimeError (#339) 2023-07-03 15:22:28 -07:00
42e0c1df78 [Quality] Add CI for formatting (#343) 2023-07-03 14:50:56 -07:00
e41f06702c Add support for BLOOM (#331) 2023-07-03 13:12:35 -07:00
d6fa1be3a8 [Quality] Add code formatter and linter (#326) 2023-07-03 11:31:55 -07:00
0ffded812a [Fix] Better error message for batched prompts (#342) 2023-07-03 09:27:31 -07:00
0bd2a573a5 Allow send list of str for the Prompt on openai demo endpoint /v1/completions (#323)
* allow str or List[str] for prompt

* Update vllm/entrypoints/openai/api_server.py

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>

---------

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2023-07-03 09:17:50 -07:00
49b26e2cec feat: add ChatCompletion endpoint in OpenAI demo server. (#330) 2023-07-02 22:54:33 -07:00
dafd924c1f Raise error for long prompt (#273) 2023-06-30 18:48:49 -07:00
598dc4b79a [Fix] Weight loading for GPTBigCode (#313) 2023-06-29 22:14:17 -07:00
85de093472 [Fix] Do not pin memory when in WSL (#312) 2023-06-29 15:00:21 -07:00
f72297562f Add news for the vllm+skypilot example (#314) 2023-06-29 12:32:37 -07:00
9d27b09d12 Update README.md (#306) 2023-06-29 06:52:15 -07:00
998d9d1509 [Tokenizer] Add tokenizer mode (#298) 2023-06-28 14:19:22 -07:00
425040d4c1 remove floats == 0 comparison (#285) 2023-06-28 14:11:51 -07:00
4338cc4750 [Tokenizer] Add an option to specify tokenizer (#284) 2023-06-28 09:46:58 -07:00
bdd6b4c8bc Add LLM.set_tokenizer (#283) 2023-06-28 00:28:29 -07:00
2b7d3aca2e Update setup.py (#282)
Co-authored-by: neubig <neubig@gmail.com>
2023-06-27 14:34:23 -07:00
4026a049d3 expand coverage of gpt2 model loading (#271) 2023-06-27 06:27:41 -07:00
43710e8d09 [Fix] Fix default port number in benchmark scripts (#265) 2023-06-26 13:15:35 -07:00
526df28fb2 [BugFix] Fix a bug in counting running sequences (#266) 2023-06-26 13:09:02 -07:00
2cf1a333b6 [Doc] Documentation for distributed inference (#261) 2023-06-26 11:34:23 -07:00
0b7db411b5 [Bug] Fix the OOM condition for CPU cache (#260) 2023-06-26 11:16:13 -07:00
471a7a4566 Compatible with Decapoda Research llama hf version (#251) 2023-06-26 09:23:57 -07:00
6214dd6ce9 Update README.md (#236) 2023-06-25 16:58:06 -07:00
0603379863 fix wrong using getattr to get dict value (#232) 2023-06-24 22:00:24 -07:00
665c48963b [Docs] Add GPTBigCode to supported models (#213) 2023-06-22 15:05:11 -07:00
298695b766 GPTBigCode (StarCoder, SantaCoder Support) (#209) 2023-06-23 01:49:27 +08:00
83658c8ace Bump up version to 0.1.1 (#204) 2023-06-22 15:33:32 +08:00
1d24ccb96c [Fix] Better error message when there is OOM during cache initialization (#203) 2023-06-22 15:30:06 +08:00
14f0b39cda [Bugfix] Fix a bug in RequestOutput.finished (#202) 2023-06-22 00:17:24 -07:00
2e0d314384 fix-ray (#193) 2023-06-22 00:21:41 +08:00
67d96c29fb Use slow tokenizer for open llama models (#168) 2023-06-20 14:19:47 +08:00
033f5c78f5 Remove e.g. in README (#167) 2023-06-20 14:00:28 +08:00
794e578de0 [Minor] Fix URLs (#166) 2023-06-19 22:57:14 -07:00
caddfc14c1 [Minor] Fix icons in doc (#165) 2023-06-19 20:35:38 -07:00
fc72e39de3 Change image urls (#164) 2023-06-20 11:15:15 +08:00
b7e62d3454 Fix repo & documentation URLs (#163) 2023-06-19 20:03:40 -07:00
364536acd1 [Docs] Minor fix (#162) 2023-06-19 19:58:23 -07:00
0b32a987dd Add and list supported models in README (#161) 2023-06-20 10:57:46 +08:00
570fb2e9cc [PyPI] Fix package info in setup.py (#158) 2023-06-19 18:05:01 -07:00
a255885f83 Add logo and polish readme (#156) 2023-06-19 16:31:13 +08:00
5822ede66e Add performance figures for dark mode (#160) 2023-06-18 23:46:24 -07:00
0370afa2e5 Remove benchmark_async_llm_server.py (#155) 2023-06-19 11:12:37 +08:00
7e2a913c64 [Minor] Fix CompletionOutput.__repr__ (#157) 2023-06-18 19:58:25 -07:00
3f92038b99 Add comments on swap space (#154) 2023-06-18 11:39:35 -07:00
dcda03b4cb Write README and front page of doc (#147) 2023-06-18 03:19:38 -07:00
bf5f121c02 Reduce GPU memory utilization to make sure OOM doesn't happen (#153) 2023-06-18 17:33:50 +08:00
bec7b2dc26 Add quickstart guide (#148) 2023-06-18 01:26:12 +08:00
0b98ba15c7 Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00
e5464ee484 Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00
bab8f3dd0d [Minor] Fix benchmark_throughput.py (#151) 2023-06-16 21:00:52 -07:00
eedb46bf03 Rename servers and change port numbers to reduce confusion (#149) 2023-06-17 00:13:02 +08:00
311490a720 Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00
da5ddcd544 Remove redundant code in ColumnParallelLinear (#146) 2023-06-10 21:25:11 -07:00
5020e1e80c Non-streaming simple fastapi server (#144) 2023-06-10 10:43:07 -07:00
4298374265 Add docstrings for LLMServer and related classes and examples (#142) 2023-06-07 18:25:20 +08:00
e38074b1e6 Support FP32 (#141) 2023-06-07 00:40:21 -07:00
376725ce74 [PyPI] Packaging for PyPI distribution (#140) 2023-06-05 20:03:14 -07:00
456941cfe4 [Docs] Write the Adding a New Model section (#138) 2023-06-05 20:01:26 -07:00
1a956e136b Fix various issues of async servers (#135) 2023-06-05 23:44:50 +08:00
8274ca23ac Add docstrings for LLM (#137) 2023-06-04 12:52:41 -07:00
62ec38ea41 Document supported models (#127) 2023-06-02 22:35:17 -07:00
0eda2e0953 Add .readthedocs.yaml (#136) 2023-06-02 22:27:44 -07:00
211318d44a Add throughput benchmarking script (#133) 2023-05-28 03:20:05 -07:00
337871c6fd Enable LLaMA fast tokenizer (#132) 2023-05-28 02:51:42 -07:00
56b7f0efa4 Add a doc for installation (#128) 2023-05-27 01:13:06 -07:00
d721168449 Improve setup script & Add a guard for bfloat16 kernels (#130) 2023-05-27 00:59:32 -07:00
4a151dd453 Add activation registry (#126) 2023-05-25 00:09:07 -07:00
057daef778 OpenAI Compatible Frontend (#116) 2023-05-23 21:39:50 -07:00
e86717833d Incrementally decode output tokens (#121) 2023-05-23 20:46:32 -07:00
aedba6d5ec Print warnings/errors for large swap space (#123) 2023-05-23 18:22:26 -07:00
a283ec2eec Add contributing guideline and mypy config (#122) 2023-05-23 17:58:51 -07:00
3f942acfe1 Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00
19d2899439 Add initial sphinx docs (#120) 2023-05-22 17:02:44 -07:00
655a5e48df Introduce LLM class for offline inference (#115) 2023-05-21 17:04:18 -07:00
f746ced08d Implement stop strings and best_of (#114) 2023-05-21 11:18:00 -07:00
c3442c1f6f Refactor system architecture (#109) 2023-05-20 13:06:59 -07:00
7297fa6f7c Remove unused parts in Megatron-LM code and add copyright notice (#110) 2023-05-20 09:11:34 -06:00
b7955ef17b Fix timeout error in the FastAPI frontend (#34) 2023-05-19 14:00:46 -06:00
f756799b84 Use runtime profiling to replace manual memory analyzers (#81) 2023-05-19 11:35:44 -06:00
825d8892b5 Use pytest format for unit tests (#107) 2023-05-17 17:11:23 -07:00
b322fd1607 Add docstrings to some modules and classes (#100) 2023-05-14 22:32:38 -07:00
667ba3995c Add copyright headers to source files adapted from FT (#104) 2023-05-14 22:19:19 -07:00
707ec647bb Add copyright headers for HF models (#103) 2023-05-14 21:54:32 -07:00
89988ec8c2 Add Apache-2.0 license (#102) 2023-05-14 18:05:19 -07:00
6208d622ca Minor code cleaning for SamplingParams (#99) 2023-05-12 18:07:09 -07:00
42f1042e1c Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00
55f8b0a5de Implement presence and frequency penalties (#95) 2023-05-10 23:39:12 -07:00
9f88db35da Support top-k sampling (#94) 2023-05-10 12:51:36 -07:00
ae356774ab Avoid sorting waiting queue & Minor code cleaning (#93) 2023-05-10 01:57:07 -07:00
e331957784 Log system stats (#90) 2023-05-10 01:06:53 -07:00
8d66a7b6d7 Rename variables and methods (#91) 2023-05-10 00:58:31 -07:00
ce26e57fd3 Update sample prompts in simple_server.py (#89) 2023-05-09 16:47:39 -07:00
85eb631839 Use slow tokenizer for LLaMA (#84) 2023-05-09 16:03:44 -07:00
add055e151 Enhance model loader (#83) 2023-05-09 15:46:42 -07:00
7c041ab578 Refactor system architecture (#82) 2023-05-09 15:30:12 -07:00
8917782af6 Add a system logger (#85) 2023-05-08 23:03:35 -07:00
7addca5935 Specify python package dependencies in requirements.txt (#78) 2023-05-07 16:30:43 -07:00
c84e924287 [Minor] Fix a dtype bug (#79) 2023-05-06 02:12:12 -07:00
c9d5b6d4a8 Replace FlashAttention with xformers (#70) 2023-05-05 02:01:08 -07:00
189ae23133 Use dtype from model config & Add Dolly V2 (#63) 2023-05-04 03:05:37 -07:00
e548c1488a Add support for GPT-2 (#60) 2023-05-04 02:59:56 -07:00
130d5fd8c7 Fix a bug in attention kernel (#68) 2023-05-04 02:56:09 -07:00
e070829ae8 Support bfloat16 data type (#54) 2023-05-03 14:09:44 -07:00
436e523bf1 Refactor attention kernels (#53) 2023-05-03 13:40:13 -07:00
27f1410d06 New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00
4858f3bb45 Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00
a96d63c21d Add support for GPT-NeoX (Pythia) (#50) 2023-04-28 00:32:10 -07:00
1029 changed files with 187304 additions and 10622 deletions

View File

@ -0,0 +1,36 @@
import os
import zipfile
MAX_SIZE_MB = 250
def print_top_10_largest_files(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
def check_wheel_size(directory):
for root, _, files in os.walk(directory):
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb} MB).")
return 0
if __name__ == "__main__":
import sys
sys.exit(check_wheel_size(sys.argv[1]))

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.671
- name: "exact_match,flexible-extract"
value: 0.664
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.892
- name: "exact_match,flexible-extract"
value: 0.892
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.755
- name: "exact_match,flexible-extract"
value: 0.755
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.728
- name: "exact_match,flexible-extract"
value: 0.728
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.756
- name: "exact_match,flexible-extract"
value: 0.752
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.409
- name: "exact_match,flexible-extract"
value: 0.406
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.252
- name: "exact_match,flexible-extract"
value: 0.252
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.86
- name: "exact_match,flexible-extract"
value: 0.86
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.624
- name: "exact_match,flexible-extract"
value: 0.624
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.616
- name: "exact_match,flexible-extract"
value: 0.632
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.593
- name: "exact_match,flexible-extract"
value: 0.588
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.595
- name: "exact_match,flexible-extract"
value: 0.582
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.792
- name: "exact_match,flexible-extract"
value: 0.824
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,5 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

View File

@ -0,0 +1,10 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -0,0 +1,46 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}
while getopts "m:b:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -0,0 +1,51 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -0,0 +1,59 @@
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@ -0,0 +1,55 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""
import os
from pathlib import Path
import lm_eval
import numpy
import yaml
RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")
return results
def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)

View File

@ -0,0 +1,152 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `nightly-benchmarks` label.
## Performance benchmark details
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
Here is an example of one test inside `latency-tests.json`:
```json
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
]
```
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
]
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -0,0 +1,61 @@
steps:
- label: "Wait for container to be ready"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
- image: badouralix/curl-jq
command:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
# - label: "H100"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# command:
# - bash
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
# mount-buildkite-agent: true
# propagate-environment: true
# ipc: host
# gpus: all
# environment:
# - VLLM_USAGE_SOURCE
# - HF_TOKEN

View File

@ -0,0 +1,45 @@
# Nightly benchmark
The main goal of this benchmarking is two-fold:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
## Docker images
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@ -0,0 +1,120 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
<<: *common_container_settings
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
- wait
- label: "Plot"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- wait

View File

@ -0,0 +1,380 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
ensure_sharegpt_downloaded() {
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
if [ ! -f "$FILE" ]; then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
else
echo "$FILE already exists."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
kill_gpu_processes() {
# kill all processes on GPU.
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
if [ -z "$pids" ]; then
echo "No GPU processes found."
else
for pid in $pids; do
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
echo "All GPU processes have been killed."
fi
# waiting for GPU processes to be fully killed
# loop while nvidia-smi returns any processes
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
sleep 1
echo "Waiting for GPU processes to be killed"
done
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
if command -v buildkite-agent >/dev/null 2>&1; then
BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -f /workspace/buildkite-agent ]; then
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
latency_test_file=$1
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
latency_command="python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
'{
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
kill_gpu_processes
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
throughput_command="python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
echo "In serving-test.json, test_name must start with \"serving_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname."
continue
fi
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
server_pid=$!
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
check_gpus
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn of the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
# prepare for benchmarking
cd benchmarks || exit 1
ensure_sharegpt_downloaded
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,76 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@ -0,0 +1,192 @@
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"avg_latency": "Mean latency (ms)",
# "P10": "P10 (s)",
# "P25": "P25 (s)",
"P50": "Median latency (ms)",
# "P75": "P75 (s)",
# "P90": "P90 (s)",
"P99": "P99 latency (ms)",
}
# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "num_requests": "# of req.",
# "total_num_tokens": "Total # of tokens",
# "elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
# "tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "completed": "# of req.",
"request_throughput": "Tput (req/s)",
# "input_throughput": "Input Tput (tok/s)",
# "output_throughput": "Output Tput (tok/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
# "mean_tpot_ms": "Mean TPOT (ms)",
# "median_tpot_ms": "Median",
# "p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
}
def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
def results_to_json(latency, throughput, serving):
return json.dumps({
'latency': latency.to_dict(),
'throughput': throughput.to_dict(),
'serving': serving.to_dict()
})
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# get different percentiles
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
latency_results.append(raw_result)
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
throughput_results.append(raw_result)
continue
print(f"Skipping {test_file}")
latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results,
serving_results)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(
latency_column_mapping.keys())].rename(
columns=latency_column_mapping)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
if not throughput_results.empty:
throughput_results = throughput_results[list(
throughput_results_column_mapping.keys())].rename(
columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results,
throughput_results,
serving_results)
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/tests/descriptions.md")
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json)
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
results = latency_results.to_dict(
orient='records') + throughput_results.to_dict(
orient='records') + serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -0,0 +1,26 @@
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@ -0,0 +1,6 @@
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@ -0,0 +1,102 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@ -0,0 +1,40 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@ -0,0 +1,135 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@ -0,0 +1,218 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,216 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,214 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,221 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,76 @@
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "Successful req.",
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"engine": "Engine",
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
serving_md_table_with_headers = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
# document benchmarking results in markdown
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write('\n')
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -0,0 +1,17 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi
echo "Waiting for image to be available..."
retries=$((retries + 1))
sleep 5
done
exit 1

View File

@ -0,0 +1,67 @@
## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
### Latency benchmarking results
{latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
### Serving benchmarking results
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
import json
import pandas as pd
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@ -0,0 +1,32 @@
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
},
{
"test_name": "latency_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
}
]

View File

@ -0,0 +1,116 @@
[
{
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
}
]

View File

@ -0,0 +1,80 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -0,0 +1,35 @@
[
{
"test_name": "throughput_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -0,0 +1,19 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"

View File

@ -0,0 +1,84 @@
# This script runs test inside the corresponding ROCm docker container.
set -ex
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${@}"

View File

@ -0,0 +1,78 @@
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
set -o pipefail
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer meta-llama/Llama-2-7b-chat-hf \
--save-result \
2>&1 | tee benchmark_serving.txt
bench_serving_exit_code=$?
kill $server_pid
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
exit $bench_latency_exit_code
fi
if [ $bench_throughput_exit_code -ne 0 ]; then
exit $bench_throughput_exit_code
fi
if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"

View File

@ -0,0 +1,40 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest Pillow protobuf
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"

105
.buildkite/run-multi-node-test.sh Executable file
View File

@ -0,0 +1,105 @@
#!/bin/bash
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
# wait for the cluster to be ready
sleep 10
# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes

View File

@ -0,0 +1,51 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

14
.buildkite/run-openvino-test.sh Executable file
View File

@ -0,0 +1,14 @@
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .
# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py

View File

@ -0,0 +1,16 @@
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py

View File

@ -0,0 +1,14 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

View File

@ -0,0 +1,268 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
steps:
- label: Async Engine, Inputs, Utils, Worker Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Metrics, Tracing Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai" # Tracing
- pytest -v -s tracing
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
#mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
mirror_hardwares: [amd]
fast_check: true
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
mirror_hardwares: [amd]
fast_check: true
commands:
- pytest -v -s core
- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
commands:
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pipeline_parallel.py
- label: Engine Test
mirror_hardwares: [amd]
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Entrypoints Test
fast_check: true
mirror_hardwares: [amd]
commands:
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
# - label: Kernels Test %N
# #mirror_hardwares: [amd]
# commands:
# - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
# - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- pytest -v -s models -m vlm
- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
- label: Samplers Test
#mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
- label: Utils Test
commands:
- pytest -v -s test_utils.py
- pytest -v -s test_embedded_commit.py
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
#mirror_hardwares: [amd]
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
# - label: LoRA Test %N
# #mirror_hardwares: [amd]
# command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
# parallelism: 4
# - label: LoRA Long Context (Distributed)
# #mirror_hardwares: [amd]
# num_gpus: 4
# # This test runs llama 13B, so it is required to run on 4 GPUs.
# commands:
# # FIXIT: find out which code initialize cuda before running the test
# # before the fix, we need to use spawn to test it
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -v -s -x lora/test_long_context.py
- label: Tensorizer Test
#mirror_hardwares: [amd]
fast_check: true
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- label: Metrics Test
mirror_hardwares: [amd]
command: pytest -v -s metrics
- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: LM Eval Large Models
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py

26
.clang-format Normal file
View File

@ -0,0 +1,26 @@
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80
# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left
# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false
# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash
IncludeCategories:
- Regex: '^<'
Priority: 4
- Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
Priority: 3
- Regex: '^"(qoda|\.\.)/'
Priority: 2
- Regex: '.*'
Priority: 1

1
.dockerignore Normal file
View File

@ -0,0 +1 @@
vllm/*.so

2
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1,2 @@
github: [vllm-project]
open_collective: [vllm]

View File

@ -0,0 +1,22 @@
name: 📚 Documentation
description: Report an issue related to https://docs.vllm.ai/
title: "[Doc]: "
labels: ["documentation"]
body:
- type: textarea
attributes:
label: 📚 The doc issue
description: >
A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
validations:
required: true
- type: textarea
attributes:
label: Suggest a potential alternative/fix
description: >
Tell us how we could improve the documentation in this regard.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,40 @@
name: 🛠️ Installation
description: Report an issue here when you hit errors during installation.
title: "[Installation]: "
labels: ["installation"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Your current environment
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
```
validations:
required: true
- type: textarea
attributes:
label: How you are installing vllm
description: |
Paste the full command you are trying to execute.
value: |
```sh
pip install -vvv vllm
```
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

38
.github/ISSUE_TEMPLATE/300-usage.yml vendored Normal file
View File

@ -0,0 +1,38 @@
name: 💻 Usage
description: Raise an issue here if you don't know how to use vllm.
title: "[Usage]: "
labels: ["usage"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Your current environment
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
```
validations:
required: true
- type: textarea
attributes:
label: How would you like to use vllm
description: |
A detailed description of how you want to use vllm.
value: |
I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,86 @@
name: 🐛 Bug report
description: Raise an issue here if you find a bug.
title: "[Bug]: "
labels: ["bug"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Your current environment
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
```
validations:
required: true
- type: textarea
attributes:
label: 🐛 Describe the bug
description: |
Please provide a clear and concise description of what the bug is.
If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
```python
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
```python
# Sample code to reproduce the problem
```
```
The error message you got, with the full traceback.
```
validations:
required: true
- type: markdown
attributes:
value: >
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉!

View File

@ -0,0 +1,31 @@
name: 🚀 Feature request
description: Submit a proposal/request for a new vllm feature
title: "[Feature]: "
labels: ["feature request"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: 🚀 The feature, motivation and pitch
description: >
A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
validations:
required: true
- type: textarea
attributes:
label: Alternatives
description: >
A description of any alternative solutions or features you've considered, if any.
- type: textarea
attributes:
label: Additional context
description: >
Add any other context or screenshots about the feature request.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,33 @@
name: 🤗 Support request for a new model from huggingface
description: Submit a proposal/request for a new model from huggingface
title: "[New Model]: "
labels: ["new model"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.
description: >
A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
validations:
required: true
- type: textarea
attributes:
label: The closest model vllm already supports.
description: >
Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
- type: textarea
attributes:
label: What's your difficulty of supporting the model you want?
description: >
For example, any new operators or new architecture?
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,52 @@
name: ⚡ Discussion on the performance of vllm
description: Submit a proposal/discussion about the performance of vllm
title: "[Performance]: "
labels: ["performance"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Proposal to improve performance
description: >
How do you plan to improve vllm's performance?
validations:
required: false
- type: textarea
attributes:
label: Report of performance regression
description: >
Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
validations:
required: false
- type: textarea
attributes:
label: Misc discussion on performance
description: >
Anything about the performance.
validations:
required: false
- type: textarea
attributes:
label: Your current environment (if you think it is necessary)
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
```
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

49
.github/ISSUE_TEMPLATE/750-RFC.yml vendored Normal file
View File

@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]
body:
- type: markdown
attributes:
value: >
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
attributes:
label: Motivation.
description: >
The motivation of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Proposed Change.
description: >
The proposed change of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Feedback Period.
description: >
The feedback period of the RFC. Usually at least one week.
validations:
required: false
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC.
validations:
required: false
- type: textarea
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,21 @@
name: 🎲 Misc/random discussions that do not fit into the above categories.
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
title: "[Misc]: "
labels: ["misc"]
body:
- type: markdown
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
attributes:
label: Anything you want to discuss about vllm.
description: >
Anything you want to discuss about vllm.
validations:
required: true
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

1
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@ -0,0 +1 @@
blank_issues_enabled: false

64
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,64 @@
FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
---
<details>
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
<summary><b> PR Checklist (Click to Expand) </b></summary>
<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
<h3>PR Title and Classification</h3>
<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
<ul>
<li><code>[Bugfix]</code> for bug fixes.</li>
<li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
<li><code>[Doc]</code> for documentation fixes and improvements.</li>
<li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
<li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
<li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
<li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
<li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
<li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
</ul>
<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
<h3>Code Quality</h3>
<p>The PR need to meet the following code quality standards:</p>
<ul>
<li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
<li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
<li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
<li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
</ul>
<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
<h3>What to Expect for the Reviews</h3>
<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
<ul>
<li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
<li> After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
<li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
<li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
</li>
</ul>
<h3>Thank You</h3>
<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
</details>

View File

@ -0,0 +1,21 @@
name: Add label on auto-merge enabled
on:
pull_request_target:
types:
- auto_merge_enabled
jobs:
add-label-on-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

36
.github/workflows/clang-format.yml vendored Normal file
View File

@ -0,0 +1,36 @@
name: clang-format
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
clang-format:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install clang-format==18.1.5
- name: Running clang-format
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror

48
.github/workflows/mypy.yaml vendored Normal file
View File

@ -0,0 +1,48 @@
name: mypy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.9.0
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/core --follow-imports skip
mypy vllm/distributed --follow-imports skip
mypy vllm/engine --follow-imports skip
mypy vllm/entrypoints --follow-imports skip
mypy vllm/executor --follow-imports skip
mypy vllm/lora --follow-imports skip
mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip

110
.github/workflows/publish.yml vendored Normal file
View File

@ -0,0 +1,110 @@
# This workflow will upload a Python Package to Release asset
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
name: Create Release
on:
push:
tags:
- v*
# Needed to create release and upload assets
permissions:
contents: write
jobs:
release:
# Retrieve tag and create release
name: Create Release
runs-on: ubuntu-latest
outputs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Extract branch info
shell: bash
run: |
echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
- name: Create Release
id: create_release
uses: "actions/github-script@v6"
env:
RELEASE_TAG: ${{ env.release_tag }}
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
script: |
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
wheel:
name: Build Wheel
runs-on: ${{ matrix.os }}
needs: release
strategy:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
- name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run: |
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
echo "asset_name=${asset_name}" >> $GITHUB_ENV
- name: Upload Release Asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
# uses: pypa/gh-action-pypi-publish@release/v1.8
# with:
# repository-url: https://test.pypi.org/legacy/
# password: ${{ secrets.PYPI_API_TOKEN }}
# skip-existing: true

21
.github/workflows/reminder_comment.yml vendored Normal file
View File

@ -0,0 +1,21 @@
name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -0,0 +1,23 @@
name: Remove ready Label on notready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
steps:
- name: Remove ready label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
name: 'ready'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

37
.github/workflows/ruff.yml vendored Normal file
View File

@ -0,0 +1,37 @@
name: ruff
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
- name: Run isort
run: |
isort . --check-only

19
.github/workflows/scripts/build.sh vendored Normal file
View File

@ -0,0 +1,19 @@
#!/bin/bash
python_executable=python$1
cuda_home=/usr/local/cuda-$2
# Update paths
PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist

View File

@ -0,0 +1,20 @@
// Uses Github's API to create the release and wait for result.
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
module.exports = async (github, context, core) => {
try {
const response = await github.rest.repos.createRelease({
draft: false,
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
prerelease: true,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});
core.setOutput('upload_url', response.data.upload_url);
} catch (error) {
core.setFailed(error.message);
}
}

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version=$(echo $1 | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-")
# Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
sudo apt clean
# Test nvcc
PATH=/usr/local/cuda-$1/bin:${PATH}
nvcc --version
# Log gcc, g++, c++ versions
gcc --version
g++ --version
c++ --version

56
.github/workflows/scripts/env.sh vendored Normal file
View File

@ -0,0 +1,56 @@
#!/bin/bash
# This file installs common linux environment tools
export LANG C.UTF-8
# python_version=$1
sudo apt-get update && \
sudo apt-get install -y --no-install-recommends \
software-properties-common \
sudo apt-get install -y --no-install-recommends \
build-essential \
apt-utils \
ca-certificates \
wget \
git \
vim \
libssl-dev \
curl \
unzip \
unrar \
cmake \
net-tools \
sudo \
autotools-dev \
rsync \
jq \
openssh-server \
tmux \
screen \
htop \
pdsh \
openssh-client \
lshw \
dmidecode \
util-linux \
automake \
autoconf \
libtool \
net-tools \
pciutils \
libpci-dev \
libaio-dev \
libcap2 \
libtinfo5 \
fakeroot \
devscripts \
debhelper \
nfs-common
# Remove github bloat files to free up disk space
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf "/usr/share/dotnet"

View File

@ -0,0 +1,15 @@
#!/bin/bash
python_executable=python$1
pytorch_version=$2
cuda_version=$3
# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
# Print version information
$python_executable --version
$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"

31
.github/workflows/yapf.yml vendored Normal file
View File

@ -0,0 +1,31 @@
name: yapf
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .

198
.gitignore vendored
View File

@ -1,10 +1,192 @@
**/*.pyc
**/__pycache__/
*.egg-info/
*.eggs/
*.so
build/
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# VSCode
.vscode/
# DS Store
.DS_Store
# Results
*.csv
# Python pickle files
*.pkl
*.png
**/log.txt
# Sphinx documentation
_build/
# vim swap files
*.swo
*.swp
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h
# Benchmark dataset
*.json

22
.readthedocs.yaml Normal file
View File

@ -0,0 +1,22 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
- pdf
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt

1
.yapfignore Normal file
View File

@ -0,0 +1 @@
collect_env.py

269
CMakeLists.txt Normal file
View File

@ -0,0 +1,269 @@
cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
#
# Supported/expected torch versions for CUDA/ROCm.
#
# Currently, having an incorrect pytorch version results in a warning
# rather than an error.
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
#
# Try to find python package with an executable that exactly matches
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
#
if (VLLM_PYTHON_EXECUTABLE)
find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
else()
message(FATAL_ERROR
"Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
" before running cmake configure.")
endif()
#
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
message(FATAL_ERROR "nvcc not found")
endif()
#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
# so there is no need to do this explicitly with check_language/enable_language,
# etc.
#
find_package(Torch REQUIRED)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
add_dependencies(default _core_C)
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
return()
endif()
return()
endif()
#
# Set up GPU language and check the torch version and warn if it isn't
# what is expected.
#
if (NOT HIP_FOUND AND CUDA_FOUND)
set(VLLM_GPU_LANG "CUDA")
if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
"expected for CUDA build, saw ${Torch_VERSION} instead.")
endif()
elseif(HIP_FOUND)
set(VLLM_GPU_LANG "HIP")
# Importing torch recognizes and sets up some HIP/ROCm configuration but does
# not let cmake recognize .hip files. In order to get cmake to understand the
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
#
# Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`.
# The final set of arches is stored in `VLLM_GPU_FLAGS`.
#
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
#
# Set nvcc parallelism.
#
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
#
# Define other extension targets
#
#
# _C extension
#
set(VLLM_EXT_SRC
"csrc/cache_kernels.cu"
"csrc/attention/attention_kernels.cu"
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
)
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
define_gpu_extension_target(
_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
#
# _moe_C extension
#
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu")
define_gpu_extension_target(
_moe_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
endif()

56
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,56 @@
# Contributing to vLLM
Thank you for your interest in contributing to vLLM!
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:
- Identify and report any issues or bugs.
- Request or add a new model.
- Suggest or implement new features.
However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
## Setup for development
### Build from source
```bash
pip install -e . # This may take several minutes.
```
### Testing
```bash
pip install -r requirements-dev.txt
# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
pytest tests/
```
**Note:** Currently, the repository does not pass the mypy tests.
## Contributing Guidelines
### Issue Reporting
If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.
### Pull Requests & Code Reviews
Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
### Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make vLLM a great tool for everyone!

232
Dockerfile Normal file
View File

@ -0,0 +1,232 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y git curl sudo
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
COPY requirements-mamba.txt requirements-mamba.txt
RUN python3 -m pip install packaging
RUN python3 -m pip install -r requirements-mamba.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG PYTHON_VERSION=3.10
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ "$CUDA_VERSION" = "11.8.0" ]; then \
export SCCACHE_BUCKET=vllm-build-sccache-2; \
else \
export SCCACHE_BUCKET=vllm-build-sccache; \
fi \
&& export SCCACHE_REGION=us-west-2 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
#################### EXTENSION Build IMAGE ####################
#################### DEV IMAGE ####################
FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### MAMBA Build IMAGE ####################
FROM dev as mamba-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
WORKDIR /usr/src/mamba
COPY requirements-mamba.txt requirements-mamba.txt
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel -r requirements-mamba.txt \
--no-build-isolation --no-deps --no-cache-dir
#################### MAMBA Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim curl libibverbs-dev
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test
ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################

41
Dockerfile.cpu Normal file
View File

@ -0,0 +1,41 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

36
Dockerfile.neuron Normal file
View File

@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..
CMD ["/bin/bash"]

29
Dockerfile.openvino Normal file
View File

@ -0,0 +1,29 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace
# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
CMD ["/bin/bash"]

22
Dockerfile.ppc64le Normal file
View File

@ -0,0 +1,22 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
# These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]

181
Dockerfile.rocm Normal file
View File

@ -0,0 +1,181 @@
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"
# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240726 \
torchvision==0.20.0.dev20240726 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
&& python3 -m pip wheel . --wheel-dir=/install
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop
# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
CMD ["/bin/bash"]

23
Dockerfile.tpu Normal file
View File

@ -0,0 +1,23 @@
ARG NIGHTLY_DATE="20240726"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install NumPy 1 instead of NumPy 2.
RUN pip install "numpy<2"
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Fix FastAPI dependence
RUN pip install "starlette<0.38.0"
# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"]

22
Dockerfile.xpu Normal file
View File

@ -0,0 +1,22 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-xpu.txt
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"]

Some files were not shown because too many files have changed in this diff Show More