Compare commits

...

1069 Commits

Author SHA1 Message Date
617fb893d5 add compile 2024-07-26 19:29:36 -07:00
55712941e5 [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) 2024-07-27 02:27:44 +00:00
981b0d5673 [Frontend] Factor out code for running uvicorn (#6828) 2024-07-27 09:58:25 +08:00
d09b94ca58 [TPU] Support collective communications in XLA devices (#6813) 2024-07-27 01:45:57 +00:00
bb5494676f enforce eager mode with bnb quantization temporarily (#6846) 2024-07-27 01:32:20 +00:00
b5f49ee55b Update README.md (#6847) 2024-07-27 00:26:45 +00:00
150a1ffbfd [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) 2024-07-26 14:39:10 -07:00
281977bd6e [Doc] Add Nemotron to supported model docs (#6843) 2024-07-26 17:32:44 -04:00
3bbb4936dc [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-26 13:50:10 -07:00
aa4867791e [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) 2024-07-26 19:39:49 +00:00
71734f1bf2 [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) 2024-07-26 12:28:32 -07:00
50704f52c4 [Bugfix][Kernel] Promote another index to int64_t (#6838) 2024-07-26 18:41:04 +00:00
07278c37dd [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) 2024-07-26 14:33:42 -04:00
85ad7e2d01 [doc][debugging] add known issues for hangs (#6816) 2024-07-25 21:48:05 -07:00
89a84b0bb7 [Core] Use array to speedup padding (#6779) 2024-07-25 21:31:31 -07:00
084a01fd35 [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. (#6770) 2024-07-25 21:25:35 -07:00
062a1d0fab Fix ReplicatedLinear weight loading (#6793) 2024-07-25 19:24:58 -07:00
2eb9f4ff26 [ci] Mark tensorizer as soft fail and separate from grouped test (#6810)
[ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-25 18:08:33 -07:00
443c7cf4cf [ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00
1adddb14bf [Core] Fix ray forward_dag error mssg (#6792) 2024-07-25 16:53:25 -07:00
b7215de2c5 [Docs] Publish 5th meetup slides (#6799) 2024-07-25 16:47:55 -07:00
f3ff63c3f4 [doc][distributed] improve multinode serving doc (#6804) 2024-07-25 15:38:32 -07:00
cd7edc4e87 [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) 2024-07-25 15:05:09 -07:00
6a1e25b151 [Doc] Add documentations for nightly benchmarks (#6412) 2024-07-25 11:57:16 -07:00
95db75de64 [Bugfix] Add synchronize to prevent possible data race (#6788)
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-07-25 10:40:01 -07:00
65b1f121c8 [Bugfix] Fix kv_cache_dtype=fp8 without scales for FP8 checkpoints (#6761) 2024-07-25 09:46:15 -07:00
889da130e7 [ Misc ] fp8-marlin channelwise via compressed-tensors (#6524)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-07-25 09:46:04 -07:00
b75e314fff [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-25 09:42:49 -07:00
316a41ac1d [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) 2024-07-24 22:48:07 -07:00
0310029a2f [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) 2024-07-24 22:34:11 -07:00
309aaef825 [Bugfix] Fix decode tokens w. CUDA graph (#6757) 2024-07-24 22:33:56 -07:00
9e169a4c61 [Model] Adding support for MiniCPM-V (#4087) 2024-07-24 20:59:30 -07:00
5689e256ba [Frontend] Represent tokens with identifiable strings (#6626) 2024-07-25 09:51:00 +08:00
740374d456 [core][distributed] fix zmq hang (#6759) 2024-07-24 17:37:12 -07:00
d88c458f44 [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) 2024-07-24 14:32:57 -07:00
421e218b37 [Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 13:22:16 -07:00
5448f67635 [Core] Tweaks to model runner/input builder developer APIs (#6712) 2024-07-24 12:17:12 -07:00
0e63494cf3 Add fp8 support to reshape_and_cache_flash (#6667) 2024-07-24 18:36:52 +00:00
ee812580f7 [Frontend] split run_server into build_server and run_server (#6740) 2024-07-24 10:36:04 -07:00
40468b13fa [Bugfix] Miscalculated latency lead to time_to_first_token_seconds inaccurate. (#6686) 2024-07-24 08:58:42 -07:00
2cf0df3381 [Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00
545146349c Adding f-string to validation error which is missing (#6748) 2024-07-24 08:55:53 -07:00
f4f8a9d892 [Bugfix]fix modelscope compatible issue (#6730) 2024-07-24 05:04:46 -07:00
b570811706 [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) 2024-07-24 05:01:14 -07:00
ccc4a73257 [Docs][ROCm] Detailed instructions to build from source (#6680) 2024-07-24 01:07:23 -07:00
0a740a11ba [Bugfix] Fix token padding for chameleon (#6724) 2024-07-24 01:05:09 -07:00
c882a7f5b3 [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) 2024-07-24 07:34:22 +00:00
5e8ca973eb [Bugfix] fix flashinfer cudagraph capture for PP (#6708) 2024-07-24 01:49:44 +00:00
87525fab92 [bitsandbytes]: support read bnb pre-quantized model (#5753)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-23 23:45:09 +00:00
2f808e69ab [Bugfix] StatLoggers: cache spec decode metrics when they get collected. (#6645)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-23 23:05:05 +00:00
01c16ede6b [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) 2024-07-23 22:45:12 +00:00
72fc704803 [build] relax wheel size limit (#6704) 2024-07-23 14:03:49 -07:00
1bedf210e3 Bump transformers version for Llama 3.1 hotfix and patch Chameleon (#6690) 2024-07-23 13:47:48 -07:00
507ef787d8 [Model] Pipeline Parallel Support for DeepSeek v2 (#6519)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-23 12:22:09 -07:00
58f53034ad [Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652) 2024-07-23 11:41:55 -07:00
0eb0757bef [Misc] Add ignored layers for fp8 quantization (#6657) 2024-07-23 14:04:04 -04:00
38c4b7e863 Bump version to 0.5.3.post1 (#6696) 2024-07-23 10:08:59 -07:00
a112a84aad [BugFix] Fix RoPE error in Llama 3.1 (#6693) 2024-07-23 09:46:05 -07:00
461089a21a [Bugfix] Fix a log error in chunked prefill (#6694) 2024-07-23 09:27:58 -07:00
71950af726 [doc][distributed] fix doc argument order (#6691) 2024-07-23 08:55:33 -07:00
cb1362a889 [Docs] Announce llama3.1 support (#6688) 2024-07-23 08:18:15 -07:00
bb2fc08072 Bump version to v0.5.3 (#6674) 2024-07-23 00:00:08 -07:00
3eda4ec780 support ignore patterns in model loader (#6673) 2024-07-22 23:59:42 -07:00
22fa2e35cb [VLM][Model] Support image input for Chameleon (#6633) 2024-07-22 23:50:48 -07:00
c5201240a4 [misc] only tqdm for first rank (#6672) 2024-07-22 21:57:27 -07:00
97234be0ec [Misc] Manage HTTP connections in one place (#6600) 2024-07-22 21:32:02 -07:00
c051bfe4eb [doc][distributed] doc for setting up multi-node environment (#6529)
[doc][distributed] add more doc for setting up multi-node environment (#6529)
2024-07-22 21:22:09 -07:00
9e0b558a09 [Misc] Support FP8 kv cache scales from compressed-tensors (#6528) 2024-07-23 04:11:50 +00:00
e519ae097a add tqdm when loading checkpoint shards (#6569)
Co-authored-by: tianyi.zhao <tianyi.zhao@transwarp.io>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-07-22 20:48:01 -07:00
7c2749a4fd [misc] add start loading models for users information (#6670) 2024-07-22 20:08:02 -07:00
729171ae58 [Misc] Enable chunked prefill by default for long context models (#6666) 2024-07-22 20:03:13 -07:00
c5e8330997 [Bugfix] Fix null modules_to_not_convert in FBGEMM Fp8 quantization (#6665) 2024-07-22 19:25:05 -07:00
e0c15758b8 [Core] Modulize prepare input and attention metadata builder (#6596) 2024-07-23 00:45:24 +00:00
bdf5fd1386 [Misc] Remove deprecation warning for beam search (#6659) 2024-07-23 00:21:58 +00:00
5a96ee52a3 [ci][build] add back vim in docker (#6661) 2024-07-22 16:26:29 -07:00
42c7f66a38 [Core] Support dynamically loading Lora adapter from HuggingFace (#6234)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-22 15:42:40 -07:00
69d5ae38dc [ci] Use different sccache bucket for CUDA 11.8 wheel build (#6656)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-22 14:20:41 -07:00
fea59c7712 [Bugfix][Kernel] Use int64_t for indices in fp8 quant kernels (#6649) 2024-07-22 14:08:30 -06:00
739b61a348 [Frontend] Refactor prompt processing (#4028)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-22 10:13:53 -07:00
89c1c6a196 [Bugfix] Fix vocab_size field access in llava_next.py (#6624) 2024-07-22 05:02:51 +00:00
42de2cefcb [Misc] Add a wrapper for torch.inference_mode (#6618) 2024-07-21 18:43:11 -07:00
c9eef37f32 [Model] Initial Support for Chameleon (#5770) 2024-07-21 17:37:51 -07:00
396d92d5e0 [Kernel][Core] Add AWQ support to the Marlin kernel (#6612) 2024-07-21 19:41:42 -04:00
25e778aa16 [Model] Refactor and decouple phi3v image embedding (#6621) 2024-07-21 16:07:58 -07:00
b6df37f943 [Misc] Remove abused noqa (#6619) 2024-07-21 23:47:04 +08:00
14f91fe67c [Spec Decode] Disable Log Prob serialization to CPU for spec decoding for both draft and target models. (#6485) 2024-07-20 23:58:58 -07:00
d7f4178dd9 [Frontend] Move chat utils (#6602)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-21 08:38:17 +08:00
082ecd80d5 [ Bugfix ] Fix AutoFP8 fp8 marlin (#6609) 2024-07-20 17:25:56 -06:00
f952bbc8ff [Misc] Fix input_scale typing in w8a8_utils.py (#6579) 2024-07-20 23:11:13 +00:00
9364f74eee [ Kernel ] Enable fp8-marlin for fbgemm-fp8 models (#6606) 2024-07-20 18:50:10 +00:00
06d6c5fe9f [Bugfix][CI/Build][Hardware][AMD] Fix AMD tests, add HF cache, update CK FA, add partially supported model notes (#6543) 2024-07-20 09:39:07 -07:00
683e3cb9c4 [ Misc ] fbgemm checkpoints (#6559) 2024-07-20 09:36:57 -07:00
9042d68362 [Misc] Consolidate and optimize logic for building padded tensors (#6541) 2024-07-20 04:17:24 +00:00
3f8d42c81f Pipeline Parallel: Guard for KeyErrors at request abort (#6587)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-19 19:18:19 -07:00
7bd82002ae [Core] Allow specifying custom Executor (#6557) 2024-07-20 01:25:06 +00:00
2e26564259 [ Kernel ] FP8 Dynamic Per Token Quant - Add scale_ub (#6593)
Co-authored-by: Varun Sundar Rabindranth <varun@neuralmagic.com>
2024-07-19 18:15:26 -07:00
e81522e879 [build] add ib in image for out-of-the-box infiniband support (#6599)
[build] add ib so that multi-node support with infiniband can be supported out-of-the-box (#6599)
2024-07-19 17:16:57 -07:00
45ceb85a0c [Docs] Update PP docs (#6598) 2024-07-19 16:38:21 -07:00
4cc24f01b1 [ Kernel ] Enable Dynamic Per Token fp8 (#6547) 2024-07-19 23:08:15 +00:00
07eb6f19f3 [bugfix][distributed] fix multi-node bug for shared memory (#6597) 2024-07-19 15:34:34 -07:00
f0bbfaf917 [Bugfix] [SpecDecode] AsyncMetricsCollector: update time since last collection (#6578) 2024-07-19 14:01:03 -07:00
30efe41532 [Docs] Update docs for wheel location (#6580) 2024-07-19 12:14:11 -07:00
9ed82e7074 [Misc] Small perf improvements (#6520) 2024-07-19 12:10:56 -07:00
51f8aa90ad [Bugfix][Frontend] remove duplicate init logger (#6581) 2024-07-19 10:16:27 -07:00
a5314e8698 [Model] RowParallelLinear: pass bias to quant_method.apply (#6327)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-19 07:15:22 -06:00
a921e86392 [BUGFIX] Raise an error for no draft token case when draft_tp>1 (#6369) 2024-07-19 06:01:09 -07:00
6366efc67b [Bugfix][Frontend] Fix missing /metrics endpoint (#6463) 2024-07-19 03:55:13 +00:00
dbe5588554 [ Misc ] non-uniform quantization via compressed-tensors for Llama (#6515) 2024-07-18 22:39:18 -04:00
d4201e06d5 [Bugfix] Make spec. decode respect per-request seed. (#6034)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-07-18 19:22:08 -07:00
b5672a112c [Core] Multiprocessing Pipeline Parallel support (#6130)
Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-18 19:15:52 -07:00
c5df56f88b Add support for a rope extension method (#6553) 2024-07-19 01:53:03 +00:00
1689219ebf [CI/Build] Build on Ubuntu 20.04 instead of 22.04 (#6517) 2024-07-18 17:29:25 -07:00
4ffffccb7e [Kernel] Implement fallback for FP8 channelwise using torch._scaled_mm (#6552) 2024-07-18 23:52:22 +00:00
f53b8f0d05 [ci][test] add correctness test for cpu offloading (#6549) 2024-07-18 23:41:06 +00:00
2d4733ba2d Fix PR comment bot (#6554)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-18 14:48:29 -07:00
15c6a079b1 [Model] Support Mistral-Nemo (#6548) 2024-07-18 20:31:50 +00:00
ecdb462c24 [ci] Reword Github bot comment (#6534) 2024-07-18 08:01:45 -07:00
58ca663224 [ Misc ] Improve Min Capability Checking in compressed-tensors (#6522) 2024-07-18 14:39:12 +00:00
4634c8728b [TPU] Refactor TPU worker & model runner (#6506) 2024-07-18 01:34:16 -07:00
c8a7d51c49 [Bugfix] Update flashinfer.py with PagedAttention forwards - Fixes Gemma2 OpenAI Server Crash (#6501) 2024-07-18 07:47:13 +00:00
e2fbaee725 [BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-18 15:13:30 +08:00
8a74c68bd1 [Misc] Minor patch for draft model runner (#6523) 2024-07-18 06:06:21 +00:00
61e592747c [Core] Introduce SPMD worker execution using Ray accelerated DAG (#6032)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Stephanie Wang <swang@cs.berkeley.edu>
2024-07-17 22:27:09 -07:00
d25877dd9b [BugFix] Avoid secondary error in ShmRingBuffer destructor (#6530) 2024-07-17 22:24:43 -07:00
1c27d25fb5 [core][model] yet another cpu offload implementation (#6496)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-17 20:54:35 -07:00
18fecc3559 [ Kernel ] Fp8 Channelwise Weight Support (#6487) 2024-07-18 03:18:13 +00:00
b5af8c223c [Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00
b5241e41d9 [ Kernel ] FP8 Dynamic-Per-Token Quant Kernel (#6511)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-07-18 01:38:35 +00:00
e76466dde2 [Core] draft_model_runner: Implement prepare_inputs on GPU for advance_step (#6338) 2024-07-17 14:30:28 -07:00
5f0b9933e6 [Bugfix] Fix Ray Metrics API usage (#6354) 2024-07-17 19:40:10 +00:00
a38524f338 [DOC] - Add docker image to Cerebrium Integration (#6510) 2024-07-17 10:22:53 -07:00
2fa4623d9e [Core] Refactor _prepare_model_input_tensors - take 2 (#6164) 2024-07-17 09:37:16 -07:00
a9a2e74d21 [Misc] Use torch.Tensor for type annotation (#6505) 2024-07-17 13:01:10 +00:00
e09ce759aa [TPU] Remove multi-modal args in TPU backend (#6504) 2024-07-17 04:02:53 -07:00
5fa6e9876e [Bugfix] Fix for multinode crash on 4 PP (#6495)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-17 08:25:10 +00:00
5bf35a91e4 [Doc][CI/Build] Update docs and tests to use vllm serve (#6431) 2024-07-17 07:43:21 +00:00
a19e8d3726 [Misc][Speculative decoding] Typos and typing fixes (#6467)
Co-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com>
2024-07-17 07:17:07 +00:00
10383887e0 [ROCm] Cleanup Dockerfile and remove outdated patch (#6482) 2024-07-16 22:47:02 -07:00
1d094fd7c0 [Distributed][PP] only create embedding & lm head when necessary (#6455)
original title: [Distributed][Model] Rank-based Component Creation for Pipeline Parallelism Memory Optimization
2024-07-16 19:20:26 -07:00
ce37be7ba0 [misc][distributed] add seed to dummy weights (#6491) 2024-07-16 19:16:34 -07:00
7f62077af5 [misc][distributed] improve tests (#6488) 2024-07-16 17:35:52 -07:00
09c2eb85dd [ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
978aed5300 [Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081) 2024-07-16 15:31:32 -07:00
160e1d8c99 [Misc] Log spec decode metrics (#6454) 2024-07-16 20:37:10 +00:00
94162beb9f [Doc] Fix the lora adapter path in server startup script (#6230) 2024-07-16 10:11:04 -07:00
c467dff24f [Hardware][TPU] Support MoE with Pallas GMM kernel (#6457) 2024-07-16 09:56:28 -07:00
9f4ccec761 [doc][misc] remind to cancel debugging environment variables (#6481)
[doc][misc] remind users to cancel debugging environment variables after debugging (#6481)
2024-07-16 09:45:30 -07:00
38ef94888a [CI/Build] Remove "boardwalk" image asset (#6460) 2024-07-16 08:59:36 -07:00
2bb0489cb3 [Core] Use numpy to speed up padded token processing (#6442) 2024-07-16 08:13:25 -07:00
7508a3dc34 [Misc] Fix typos in spec. decode metrics logging. (#6470)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-16 13:55:15 +00:00
7a3d2a5b95 [Frontend] Support for chat completions input in the tokenize endpoint (#5923) 2024-07-16 20:18:09 +08:00
d97011512e [CI/Build] vLLM cache directory for images (#6444) 2024-07-15 23:12:25 -07:00
37d776606f [Docs] Announce 5th meetup (#6458) 2024-07-15 21:04:58 -07:00
Joe
d92b3c5cde [Bugfix][CI/Build] Test prompt adapters in openai entrypoint tests (#6419) 2024-07-15 18:54:15 -07:00
9ad32dacd9 [BugFix][Model] Jamba - Handle aborted requests, Add tests and fix cleanup bug (#6425)
Co-authored-by: Mor Zusman <morz@ai21.com>
2024-07-16 01:32:55 +00:00
d6f3b3d5c4 Pin sphinx-argparse version (#6453)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-16 01:26:11 +00:00
4552e37b55 [CI/Build][TPU] Add TPU CI test (#6277)
Co-authored-by: kevin <kevin@anyscale.com>
2024-07-15 14:31:16 -07:00
ec9933f4a5 [Misc] Add CustomOp Interface to UnquantizedFusedMoEMethod (#6289) 2024-07-15 19:02:14 +00:00
3dee97b05f [Docs] Add Google Cloud to sponsor list (#6450) 2024-07-15 11:58:10 -07:00
4cf256ae7f [misc][distributed] fix pp missing layer condition (#6446) 2024-07-15 10:32:35 -07:00
64fdc08c72 bump version to v0.5.2 (#6433) 2024-07-15 17:27:40 +00:00
4ef95b0f06 [Bugfix] use float32 precision in samplers/test_logprobs.py for comparing with HF (#6409)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-15 13:14:49 -04:00
eaec4b9153 [Bugfix] Add custom Triton cache manager to resolve MoE MP issue (#6140)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
2024-07-15 10:12:47 -07:00
a63a4c6341 [Misc] Use 0.0.9 version for flashinfer (#6447)
Co-authored-by: Pernekhan Utemuratov <pernekhan@deepinfra.com>
2024-07-15 10:10:26 -07:00
c8fd97f26d [Kernel] Use CUTLASS kernels for the FP8 layers with Bias (#6270) 2024-07-15 13:05:52 -04:00
94b82e8c18 [doc][distributed] add suggestion for distributed inference (#6418) 2024-07-15 09:45:51 -07:00
6ae1597ddf [VLM] Minor space optimization for ClipVisionModel (#6436) 2024-07-15 17:29:51 +08:00
22e79ee8f3 [doc][misc] doc update (#6439) 2024-07-14 23:33:25 -07:00
de19916314 [Bugfix] Convert image to RGB by default (#6430) 2024-07-15 05:39:15 +00:00
69672f116c [core][distributed] simplify code to support pipeline parallel (#6406) 2024-07-14 21:20:51 -07:00
44874a0bf9 [Doc] add env docs for flashinfer backend (#6437) 2024-07-14 21:16:51 -07:00
b47008b4d2 [BugFix] BatchResponseData body should be optional (#6345)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-15 04:06:09 +00:00
9bfece89fd Add FUNDING.yml (#6435) 2024-07-14 20:36:16 -07:00
32c9d7f765 Report usage for beam search (#6404) 2024-07-14 19:37:35 -07:00
ccb20db8bd [Bugfix] Benchmark serving script used global parameter 'args' in function 'sample_random_requests' (#6428) 2024-07-14 19:27:01 -07:00
a754dc2cb9 [CI/Build] Cross python wheel (#6394) 2024-07-14 18:54:46 -07:00
61e85dbad8 [Doc] xpu backend requires running setvars.sh (#6393) 2024-07-14 17:10:11 -07:00
dbfe254eda [Feature] vLLM CLI (#5090)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-07-14 15:36:43 -07:00
73030b7dae [ Misc ] Enable Quantizing All Layers of DeekSeekv2 (#6423) 2024-07-14 21:38:42 +00:00
ccd3c04571 [ci][build] fix commit id (#6420)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-07-14 22:16:21 +08:00
9dad5cc859 [Kernel] Turn off CUTLASS scaled_mm for Ada Lovelace (#6384) 2024-07-14 13:37:19 +00:00
6ef3bf912c Remove unnecessary trailing period in spec_decode.rst (#6405) 2024-07-14 07:58:09 +00:00
540c0368b1 [Model] Initialize Fuyu-8B support (#3924)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-14 05:27:14 +00:00
fb6af8bc08 [ Misc ] Apply MoE Refactor to Deepseekv2 To Support Fp8 (#6417) 2024-07-13 20:03:58 -07:00
eeceadaecc [Misc] Add deprecation warning for beam search (#6402) 2024-07-13 11:52:22 -07:00
babf52dade [ Misc ] More Cleanup of Marlin (#6359)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-13 10:21:37 +00:00
9da4aad44b Updating LM Format Enforcer version to v10.3 (#6411) 2024-07-13 10:09:12 +00:00
41708e5034 [ci] try to add multi-node tests (#6280)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-12 21:51:48 -07:00
d80aef3776 [Docs] Clean up latest news (#6401) 2024-07-12 19:36:53 -07:00
e1684a766a [Bugfix] Fix hard-coded value of x in context_attention_fwd (#6373)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-12 18:30:54 -07:00
a27f87da34 [Doc] Fix Typo in Doc (#6392)
Co-authored-by: Saliya Ekanayake <esaliya@d-matrix.ai>
2024-07-13 00:48:23 +00:00
16ff6bd58c [ci] Fix wording for GH bot (#6398)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 16:34:37 -07:00
f8f9ff57ee [Bugfix][TPU] Fix megacore setting for v5e-litepod (#6397) 2024-07-12 15:59:47 -07:00
6bc9710f6e Fix release pipeline's dir permission (#6391) 2024-07-12 15:52:43 -07:00
111fc6e7ec [Misc] Add generated git commit hash as vllm.__commit__ (#6386) 2024-07-12 22:52:15 +00:00
75f64d8b94 [Bugfix] Fix illegal memory access in FP8 MoE kernel (#6382) 2024-07-12 21:33:33 +00:00
21b2dcedab Fix release pipeline's -e flag (#6390) 2024-07-12 14:08:04 -07:00
07b35af86d Fix interpolation in release pipeline (#6389) 2024-07-12 14:03:39 -07:00
bb1a784b05 Fix release-pipeline.yaml (#6388) 2024-07-12 14:00:57 -07:00
d719ba24c5 Build some nightly wheels by default (#6380) 2024-07-12 13:56:59 -07:00
aa48e502fb [MISC] Upgrade dependency to PyTorch 2.3.1 (#5327) 2024-07-12 12:04:26 -07:00
4dbebd03cc [ci] Add GHA workflows to enable full CI run (#6381)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 11:36:26 -07:00
b75bce1008 [ci] Add grouped tests & mark tests to run by default for fastcheck pipeline (#6365)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-12 09:58:38 -07:00
b039cbbce3 [Misc] add fixture to guided processor tests (#6341) 2024-07-12 09:55:39 -07:00
f9d25c2519 [Build/CI] Checking/Waiting for the GPU's clean state (#6379) 2024-07-12 09:42:24 -07:00
024ad87cdc [Bugfix] Fix dtype mismatch in PaliGemma (#6367) 2024-07-12 08:22:18 -07:00
aea19f0989 [ Misc ] Support Models With Bias in compressed-tensors integration (#6356) 2024-07-12 11:11:29 -04:00
f7160d946a [Misc][Bugfix] Update transformers for tokenizer issue (#6364) 2024-07-12 08:40:07 +00:00
6047187cd8 [ Misc ] Remove separate bias add (#6353) 2024-07-12 05:06:09 +00:00
b6c16cf8ff [ROCm][AMD] unify CUDA_VISIBLE_DEVICES usage in cuda/rocm (#6352) 2024-07-11 21:30:46 -07:00
d26a8b3f1f [CI/Build] (2/2) Switching AMD CI to store images in Docker Hub (#6350) 2024-07-11 21:26:26 -07:00
d59eb98489 [Model][Phi3-Small] Remove scipy from blocksparse_attention (#6343) 2024-07-12 10:47:17 +08:00
adf32e0a0f [Bugfix] Fix usage stats logging exception warning with OpenVINO (#6349) 2024-07-12 10:47:00 +08:00
2b0fb53481 [distributed][misc] be consistent with pytorch for libcudart.so (#6346)
[distributed][misc] keep consistent with how pytorch finds libcudart.so (#6346)
2024-07-11 19:35:17 -07:00
d6ab528997 [Misc] Remove flashinfer warning, add flashinfer tests to CI (#6351) 2024-07-12 01:32:06 +00:00
7ed6a4f0e1 [ BugFix ] Prompt Logprobs Detokenization (#6223)
Co-authored-by: Zifei Tong <zifeitong@gmail.com>
2024-07-11 22:02:29 +00:00
a4feba929b [CI/Build] Add nightly benchmarking for tgi, tensorrt-llm and lmdeploy (#5362) 2024-07-11 13:28:38 -07:00
2d23b42d92 [doc] update pipeline parallel in readme (#6347) 2024-07-11 11:38:40 -07:00
1df43de9bb [bug fix] Fix llava next feature size calculation. (#6339)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-07-11 17:21:10 +00:00
52b7fcb35a Benchmark: add H100 suite (#6047) 2024-07-11 09:17:07 -07:00
b675069d74 [ Misc ] Refactor Marlin Python Utilities (#6082)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-07-11 15:40:11 +00:00
55f692b46e [BugFix] get_and_reset only when scheduler outputs are not empty (#6266) 2024-07-11 07:40:20 -07:00
8a1415cf77 [Bugfix] GPTBigCodeForCausalLM: Remove lm_head from supported_lora_modules. (#6326)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-11 07:05:59 -07:00
546b101fa0 [BugFix]: fix engine timeout due to request abort (#6255)
Signed-off-by: yatta zhang <ytzhang01@foxmail.com>
Signed-off-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
Co-authored-by: zhangyuntao.dev <zhangyuntao.dev@bytedance.com>
2024-07-11 06:46:31 -07:00
3963a5335b [Misc] refactor(config): clean up unused code (#6320) 2024-07-11 09:39:07 +00:00
c4774eb841 [Bugfix] Fix snapshot download in serving benchmark (#6318) 2024-07-11 07:04:05 +00:00
fc17110bbe [BugFix]: set outlines pkg version (#6262) 2024-07-11 04:37:11 +00:00
439c84581a [Doc] Update description of vLLM support for CPUs (#6003) 2024-07-10 21:15:29 -07:00
99ded1e1c4 [Doc] Remove comments incorrectly copied from another project (#6286) 2024-07-10 17:05:26 -07:00
997df46a32 [Bugfix][Neuron] Fix soft prompt method error in NeuronExecutor (#6313) 2024-07-10 16:39:02 -07:00
ae151d73be [Speculative Decoding] Enabling bonus token in speculative decoding for KV cache based models (#5765) 2024-07-10 16:02:47 -07:00
44cc76610d [Bugfix] Fix OpenVINOExecutor abstractmethod error (#6296)
Signed-off-by: sangjune.park <sangjune.park@navercorp.com>
2024-07-10 10:03:32 -07:00
b422d4961a [CI/Build] Enable mypy typing for remaining folders (#6268) 2024-07-10 22:15:55 +08:00
c38eba3046 [Bugfix] MLPSpeculator: Use ParallelLMHead in tie_weights=False case. (#6303)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-10 09:04:07 -04:00
e72ae80b06 [Bugfix] Support 2D input shape in MoE layer (#6287) 2024-07-10 09:03:16 -04:00
8a924d2248 [Doc] Guide for adding multi-modal plugins (#6205) 2024-07-10 14:55:34 +08:00
5ed3505d82 [Bugfix][TPU] Add prompt adapter methods to TPUExecutor (#6279) 2024-07-09 19:30:56 -07:00
da78caecfa [core][distributed] zmq fallback for broadcasting large objects (#6183)
[core][distributed] add zmq fallback for broadcasting large objects (#6183)
2024-07-09 18:49:11 -07:00
2416b26e11 [Speculative Decoding] Medusa Implementation with Top-1 proposer (#4978) 2024-07-09 18:34:02 -07:00
d3a245138a [Bugfix]fix and needs_scalar_to_array logic check (#6238)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-07-09 23:43:24 +00:00
673dd4cae9 [Docs] Docs update for Pipeline Parallel (#6222)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-09 16:24:58 -07:00
4d6ada947c [CORE] Adding support for insertion of soft-tuned prompts (#4645)
Co-authored-by: Swapnil Parekh <swapnilp@ibm.com>
Co-authored-by: Joe G <joseph.granados@h2o.ai>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-07-09 13:26:36 -07:00
a0550cbc80 Add support for multi-node on CI (#5955)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-09 12:56:56 -07:00
08c5bdecae [Bugfix][TPU] Fix outlines installation in TPU Dockerfile (#6256) 2024-07-09 02:56:06 -07:00
5d5b4c5fe5 [Bugfix][TPU] Add missing None to model input (#6245) 2024-07-09 00:21:37 -07:00
70c232f85a [core][distributed] fix ray worker rank assignment (#6235) 2024-07-08 21:31:44 -07:00
a3c9435d93 [hardware][cuda] use device id under CUDA_VISIBLE_DEVICES for get_device_capability (#6216) 2024-07-08 20:02:15 -07:00
4f0e0ea131 Add FlashInfer to default Dockerfile (#6172) 2024-07-08 13:38:03 -07:00
ddc369fba1 [Bugfix] Mamba cache Cuda Graph padding (#6214) 2024-07-08 11:25:51 -07:00
185ad31f37 [Bugfix] use diskcache in outlines _get_guide #5436 (#6203) 2024-07-08 11:23:24 -07:00
543aa48573 [Kernel] Correctly invoke prefill & decode kernels for cross-attention (towards eventual encoder/decoder model support) (#4888)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-08 17:12:15 +00:00
f7a8fa39d8 [Kernel] reloading fused_moe config on the last chunk (#6210) 2024-07-08 08:00:38 -07:00
717f4bcea0 Feature/add benchmark testing (#5947)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-08 07:52:06 +00:00
16620f439d do not exclude object field in CompletionStreamResponse (#6196) 2024-07-08 10:32:57 +08:00
3b08fe2b13 [misc][frontend] log all available endpoints (#6195)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-07-07 15:11:12 -07:00
abfe705a02 [ Misc ] Support Fp8 via llm-compressor (#6110)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-07-07 20:42:11 +00:00
333306a252 add benchmark for fix length input and output (#5857)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-07 07:42:13 +00:00
6206dcb29e [Model] Add PaliGemma (#5189)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-07-07 09:25:50 +08:00
9389380015 [Doc] Move guide for multimodal model and other improvements (#6168) 2024-07-06 17:18:59 +08:00
175c43eca4 [Doc] Reorganize Supported Models by Type (#6167) 2024-07-06 05:59:36 +00:00
bc96d5c330 Move release wheel env var to Dockerfile instead (#6163) 2024-07-05 17:19:53 -07:00
f0250620dd Fix release wheel build env var (#6162) 2024-07-05 16:24:31 -07:00
2de490d60f Update wheel builds to strip debug (#6161) 2024-07-05 14:51:25 -07:00
79d406e918 [Docs] Fix readthedocs for tag build (#6158) 2024-07-05 12:44:40 -07:00
abad5746a7 bump version to v0.5.1 (#6157) 2024-07-05 12:04:51 -07:00
e58294ddf2 [Bugfix] Add verbose error if scipy is missing for blocksparse attention (#5695) 2024-07-05 10:41:01 -07:00
f1e15da6fe [Frontend] Continuous usage stats in OpenAI completion API (#5742) 2024-07-05 10:37:09 -07:00
0097bb1829 [Bugfix] Use templated datasource in grafana.json to allow automatic imports (#6136)
Signed-off-by: Christian Rohmann <christian.rohmann@inovex.de>
2024-07-05 09:49:47 -07:00
ea4b570483 [VLM] Cleanup validation and update docs (#6149) 2024-07-05 05:49:38 +00:00
a41357e941 [VLM] Improve consistency between feature size calculation and dummy data for profiling (#6146) 2024-07-05 09:29:47 +08:00
ae96ef8fbd [VLM] Calculate maximum number of multi-modal tokens by model (#6121) 2024-07-04 16:37:23 -07:00
69ec3ca14c [Kernel][Model] logits_soft_cap for Gemma2 with flashinfer (#6051)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-04 16:35:51 -07:00
81d7a50f24 [Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008)
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
2024-07-04 15:22:12 -07:00
27902d42be [misc][doc] try to add warning for latest html (#5979) 2024-07-04 09:57:09 -07:00
56b325e977 [ROCm][AMD][Model]Adding alibi slopes support in ROCm triton flash attention and naive flash attention (#6043)
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
2024-07-03 22:19:38 -07:00
3dd507083f [CI/Build] Cleanup VLM tests (#6107) 2024-07-03 18:58:18 -07:00
0ed646b7aa [Distributed][Core] Support Py39 and Py38 for PP (#6120)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-03 17:52:29 -07:00
1dab9bc8a9 [Bugfix] set OMP_NUM_THREADS to 1 by default for multiprocessing (#6109)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-07-03 16:56:59 -07:00
3de6e6a30e [core][distributed] support n layers % pp size != 0 (#6115) 2024-07-03 16:40:31 -07:00
966fe72141 [doc][misc] bump up py version in installation doc (#6119) 2024-07-03 15:52:04 -07:00
62963d129e [ Misc ] Clean Up CompressedTensorsW8A8 (#6113) 2024-07-03 22:50:08 +00:00
d9e98f42e4 [vlm] Remove vision language config. (#6089)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-03 22:14:16 +00:00
3c6325f0fc [core][distributed] custom allreduce when pp size > 1 (#6117) 2024-07-03 14:41:32 -07:00
47f0954af0 [Kernel] Expand FP8 support to Ampere GPUs using FP8 Marlin (#5975) 2024-07-03 17:38:00 +00:00
7cd2ebb025 [Bugfix] Fix compute_logits in Jamba (#6093) 2024-07-03 00:32:35 -07:00
f1c78138aa [Doc] Fix Mock Import (#6094) 2024-07-03 00:13:56 -07:00
3a86b54fb0 [VLM][Frontend] Proper Image Prompt Formatting from OpenAI API (#6091)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-02 23:41:23 -07:00
f666207161 [misc][distributed] error on invalid state (#6092) 2024-07-02 23:37:29 -07:00
d830656a97 [BugFix] Avoid unnecessary Ray import warnings (#6079) 2024-07-03 14:09:40 +08:00
d18bab3587 [CI] Fix base url doesn't strip "/" (#6087) 2024-07-02 21:31:25 -07:00
9831aec49f [Core] Dynamic image size support for VLMs (#5276)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: ywang96 <ywang@roblox.com>
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-07-02 20:34:00 -07:00
482045ee77 [hardware][misc] introduce platform abstraction (#6080) 2024-07-02 20:12:22 -07:00
9d6a8daa87 [Model] Jamba support (#4115)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: Erez Schwartz <erezs@ai21.com>
Co-authored-by: Mor Zusman <morz@ai21.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Co-authored-by: Tomer Asida <tomera@ai21.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-02 23:11:29 +00:00
ee93f4f92a [CORE] Quantized lm-head Framework (#4442)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: ZX <zx@lbx.dev>
2024-07-02 22:25:17 +00:00
7c008c51a9 [ Misc ] Refactor MoE to isolate Fp8 From Mixtral (#5970)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-02 21:54:35 +00:00
4d26d806e1 Update conftest.py (#6076) 2024-07-02 20:14:22 +00:00
c5832d2ae9 [Core] Pipeline Parallel Support (#4412)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-07-02 10:58:08 -07:00
15aba081f3 [Speculative Decoding] MLPSpeculator Tensor Parallel support (1/2) (#6050)
Co-authored-by: Sirej Dua <sirej.dua@databricks.com>
Co-authored-by: Sirej Dua <Sirej Dua>
2024-07-02 07:20:29 -07:00
31354e563f [Doc] Reinstate doc dependencies (#6061) 2024-07-02 10:53:16 +00:00
98d6682cd1 [VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-02 07:57:09 +00:00
2c37540aa6 [Frontend] Add template related params to request (#5709) 2024-07-01 23:01:57 -07:00
3476ed0809 [Core] Optimize block_manager_v2 vs block_manager_v1 (to make V2 default) (#5602) 2024-07-01 20:10:37 -07:00
54600709b6 [Model] Changes to MLPSpeculator to support tie_weights and input_scale (#5965)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Joshua Rosenkranz <jmrosenk@us.ibm.com>
2024-07-01 16:40:02 -07:00
e373853e12 [Frontend] Relax api url assertion for openai benchmarking (#6046) 2024-07-01 23:39:10 +00:00
c87ebc3ef9 [BugFix] Ensure worker model loop is always stopped at the right time (#5987) 2024-07-01 16:17:58 -07:00
c4059ea54f [Bugfix] Add explicit end_forward calls to flashinfer (#6044) 2024-07-01 23:08:58 +00:00
8e0817c262 [Bugfix][Doc] Fix Doc Formatting (#6048) 2024-07-01 15:09:11 -07:00
83bdcb6ac3 add FAQ doc under 'serving' (#5946) 2024-07-01 14:11:36 -07:00
12a59959ed [Bugfix] adding chunking mechanism to fused_moe to handle large inputs (#6029) 2024-07-01 21:08:29 +00:00
dec6fc6f3b [Bugfix] Use RayActorError for older versions of Ray in RayTokenizerGroupPool (#6039) 2024-07-01 20:12:40 +00:00
8893130b63 [doc][misc] further lower visibility of simple api server (#6041)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-07-01 10:50:56 -07:00
bb60326836 [Misc] update benchmark backend for scalellm (#6018) 2024-07-01 10:20:33 -07:00
4050d646e5 [doc][misc] remove deprecated api server in doc (#6037) 2024-07-01 12:52:43 -04:00
d76084c12f [ CI ] Re-enable Large Model LM Eval (#6031) 2024-07-01 12:40:45 -04:00
80ca1e6a3a [Speculative Decoding 2/2 ] Integrate typical acceptance sampler into Spec Decode Worker (#5348) 2024-07-01 00:33:05 -07:00
614aa51203 [misc][cuda] use nvml to avoid accidentally cuda initialization (#6007) 2024-06-30 20:07:34 -07:00
af9ad46fca [ Misc ] Refactor w8a8 to use process_weights_after_load (Simplify Weight Loading) (#5940)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-30 23:06:27 +00:00
7836fdcc11 [Misc] Fix get_min_capability (#5971) 2024-06-30 20:15:16 +00:00
deacb7ec44 [ CI ] Temporarily Disable Large LM-Eval Tests (#6005)
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic>
2024-06-30 11:56:56 -07:00
f5e73c9f1b [Lora] Use safetensor keys instead of adapter_config.json to find unexpected modules. (#5909)
Co-authored-by: sang <sangcho@anyscale.com>
2024-06-30 17:11:15 +00:00
c6c240aa0a [Frontend]: Support base64 embedding (#5935)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-06-30 23:53:00 +08:00
2be6955a3f [ci][distributed] fix device count call
[ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991)
2024-06-30 08:06:13 +00:00
9d47f64eb6 [CI/Build] [3/3] Reorganize entrypoints tests (#5966) 2024-06-30 12:58:49 +08:00
cff6a1fec1 [CI/Build] Reuse code for checking output consistency (#5988) 2024-06-30 11:44:25 +08:00
bcc6a09b63 [CI/Build] Temporarily Remove Phi3-Vision from TP Test (#5989) 2024-06-30 09:18:31 +08:00
9def10664e [Bugfix][CI/Build][Hardware][AMD] Install matching torchvision to fix AMD tests (#5949) 2024-06-29 12:47:58 -07:00
75aa1442db [ CI/Build ] LM Eval Harness Based CI Testing (#5838)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-29 13:04:30 -04:00
99397da534 [CI/Build] Add TP test for vision models (#5892) 2024-06-29 15:45:54 +00:00
8dbfcd35bf [ CI/Build ] Added E2E Test For Compressed Tensors (#5839)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-29 21:12:58 +08:00
f7dac83d95 [Kernel] Raise an exception in MoE kernel if the batch size is larger then 65k (#5939) 2024-06-29 21:04:20 +08:00
7c01f70641 [Core] Optimize SequenceStatus.is_finished by switching to IntEnum (#5974) 2024-06-29 12:47:53 +00:00
51e971d39e [Bugfix] Support eos_token_id from config.json (#5954) 2024-06-29 11:19:02 +00:00
329df38f1a [Misc] Update Phi-3-Vision Example (#5981)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-06-29 14:34:29 +08:00
580353da93 [Bugfix] Fix precisions in Gemma 1 (#5913) 2024-06-29 03:10:21 +00:00
ba4994443a [Kernel] Add punica dimensions for Granite 3b and 8b (#5930)
Signed-off-by: Joe Runde <joe@joerun.de>
2024-06-29 10:48:25 +08:00
906a19cdb0 [Misc] Extend vLLM Metrics logging API (#5925)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-29 10:36:06 +08:00
c4bca740e8 [Bugfix] fix missing last itl in openai completions benchmark (#5926) 2024-06-29 10:34:42 +08:00
7f83f40dee [Bugfix][TPU] Fix pad slot id (#5977) 2024-06-28 18:55:17 -07:00
54814fd85b [Bugfix][TPU] Fix TPU sampler output (#5978) 2024-06-28 18:14:16 -07:00
7041de4384 [Kernel] Flashinfer for prefill & decode, with Cudagraph support for decode (#4628)
Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>, bong-furiosa <bongwon.jang@furiosa.ai>
2024-06-28 15:28:49 -07:00
6a62cb82cc [Bugfix] Fix Engine Failing After Invalid Request - AsyncEngineDeadError (#5963)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 17:46:30 -04:00
5d2a1a9cf0 Unmark more files as executable (#5962) 2024-06-28 17:34:56 -04:00
4bf35ed9ae [Bugfix] Only add Attention.kv_scale if kv cache quantization is enabled (#5936) 2024-06-28 21:12:40 +00:00
be0b3af9e0 Support Deepseek-V2 (#4650)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
2024-06-28 13:24:57 -07:00
2cd402e169 [ Bugfix ] Enabling Loading Models With Fused QKV/MLP on Disk with FP8 (#5921)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 18:43:49 +00:00
b185230744 [ Misc ] Remove fp8_shard_indexer from Col/Row Parallel Linear (Simplify Weight Loading) (#5928)
Co-authored-by: Robert Shaw <rshaw@neuralmagic>
2024-06-28 13:49:57 -04:00
6a2d659d28 [Bugfix] Fix compute datatype for cutlass 3.x epilogues (#5931) 2024-06-28 17:10:34 +00:00
b2c620230a [Spec Decode] Introduce DraftModelRunner (#5799) 2024-06-28 09:17:51 -07:00
b90d8cd832 [Distributed] Make it clear that % should not be in tensor dict keys. (#5927)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-06-28 15:20:22 +00:00
3b752a6555 [CI/Build] [2/3] Reorganize entrypoints tests (#5904) 2024-06-28 07:59:18 -07:00
ec1ad0046c [Bugfix] Better error message for MLPSpeculator when num_speculative_tokens is set too high (#5894)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-28 07:42:17 -07:00
57f09a419c [Hardware][Intel] OpenVINO vLLM backend (#5379) 2024-06-28 13:50:16 +00:00
5932634409 Unmark fused_moe config json file as executable (#5960) 2024-06-28 06:36:12 -07:00
5cbe8d155c [Core] Registry for processing model inputs (#5214)
Co-authored-by: ywang96 <ywang@roblox.com>
2024-06-28 12:09:56 +00:00
0d0e3a42ac [Bugfix][Hardware][Intel CPU] Fix unpassed multi_modal_kwargs for CPU runner (#5956) 2024-06-28 12:03:41 +00:00
74d55c065b [VLM][BugFix] Make sure that multi_modal_kwargs can broadcast properly with ring buffer. (#5905)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-28 07:29:13 +00:00
f136da15e1 [Hardware][TPU] Optimize KV cache swapping (#5878) 2024-06-27 21:12:13 -07:00
c3dde367f1 [Kernel][ROCm][AMD] fused_moe Triton configs v2 for mi300X (#5932) 2024-06-27 13:41:08 -07:00
64e8d2a783 [core][misc] remove logical block (#5882) 2024-06-27 13:34:55 -07:00
79c92c7c8a [Model] Add Gemma 2 (#5908) 2024-06-27 13:33:56 -07:00
736ed38849 [CI/Build] Fix Args for _get_logits_warper in Sampler Test (#5922) 2024-06-27 11:43:04 -07:00
365791ff81 [BugFix] Fix min_tokens behaviour for multiple eos tokens (#5849) 2024-06-27 11:31:11 -07:00
691e29ecf3 [BugFix] Fix MLPSpeculator handling of num_speculative_tokens (#5876) 2024-06-27 10:59:33 -07:00
3fd02bda51 [doc][misc] add note for Kubernetes users (#5916) 2024-06-27 10:07:07 -07:00
98cf2ed678 [Model][Bugfix] Implicit model flags and reenable Phi-3-Vision (#5896) 2024-06-27 09:08:10 -07:00
e9d32d077d [CI/Build] [1/3] Reorganize entrypoints tests (#5526) 2024-06-27 12:43:17 +00:00
2061f0b8a7 [Bugfix] Fix img_sizes Parsing in Phi3-Vision (#5888) 2024-06-27 08:29:24 +00:00
96354d6a29 [Model] Add base class for LoRA-supported models (#5018) 2024-06-27 16:03:04 +08:00
d12af207d2 [VLM][Bugfix] Make sure that multi_modal_kwargs is broadcasted properly (#5880)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
2024-06-27 15:15:24 +08:00
6eabc6cb0e [Doc] Add note about context length in Phi-3-Vision example (#5887) 2024-06-26 23:20:01 -07:00
2110557dab [BugFix] Fix cuda graph for MLPSpeculator (#5875)
Co-authored-by: Abhinav Goyal <abhinav.goyal@flipkart.com>
2024-06-27 04:12:10 +00:00
b9e84259e9 [Misc] Add example for LLaVA-NeXT (#5879) 2024-06-26 17:57:16 -07:00
294104c3f9 [doc] update usage of env var to avoid conflict (#5873) 2024-06-26 17:57:12 -04:00
38a1674abb Support CPU inference with VSX PowerPC ISA (#5652) 2024-06-26 21:53:04 +00:00
f5c8628fdc [Bugfix][TPU] Fix CPU cache allocation (#5869) 2024-06-26 13:42:40 -07:00
cbc53b6b8d [Hardware][TPU] Support parallel sampling & Swapping (#5855) 2024-06-26 11:07:49 -07:00
c54269d967 [Frontend] Add tokenize/detokenize endpoints (#5054) 2024-06-26 16:54:22 +00:00
5bfd1bbc98 [Kernel] Adding bias epilogue support for cutlass_scaled_mm (#5560)
Co-authored-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-06-26 15:16:00 +00:00
6984c02a27 [CI/Build] Refactor image test assets (#5821) 2024-06-26 01:02:34 -07:00
3439c5a8e3 [Bugfix][TPU] Fix KV cache size calculation (#5860) 2024-06-26 00:58:23 -07:00
6806998bf9 [Bugfix] Fix embedding to support 2D inputs (#5829) 2024-06-26 00:15:22 -07:00
515080ad2f [bugfix][distributed] fix shm broadcast when the queue size is full (#5801) 2024-06-25 21:56:02 -07:00
3aa7b6cf66 [Misc][Doc] Add Example of using OpenAI Server with VLM (#5832) 2024-06-25 20:34:25 -07:00
dda4811591 [Core] Refactor Worker and ModelRunner to consolidate control plane communication (#5408)
Signed-off-by: Stephanie Wang <swang@cs.berkeley.edu>
Signed-off-by: Stephanie <swang@anyscale.com>
Co-authored-by: Stephanie <swang@anyscale.com>
2024-06-25 20:30:03 -07:00
82079729cc [Bugfix] Fix assertion in NeuronExecutor (#5841) 2024-06-25 19:52:10 -07:00
c2a8ac75e0 [CI/Build] Add E2E tests for MLPSpeculator (#5791)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-26 00:04:08 +00:00
f178e56c68 [Hardware][TPU] Raise errors for unsupported sampling params (#5850) 2024-06-25 16:58:23 -07:00
dd793d1de5 [Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422) 2024-06-25 15:56:15 -07:00
bc34937d68 [Hardware][TPU] Refactor TPU backend (#5831) 2024-06-25 15:25:52 -07:00
dd248f7675 [Misc] Update w4a16 compressed-tensors support to include w8a16 (#5794) 2024-06-25 19:23:35 +00:00
d9b34baedd [CI/Build] Add unit testing for FlexibleArgumentParser (#5798) 2024-06-25 12:18:03 -07:00
c18ebfdd71 [doc][distributed] add both gloo and nccl tests (#5834) 2024-06-25 15:10:28 -04:00
67882dbb44 [Core] Add fault tolerance for RayTokenizerGroupPool (#5748) 2024-06-25 10:15:10 -07:00
7b99314301 [Misc] Remove useless code in cpu_worker (#5824) 2024-06-25 09:41:36 -07:00
2ce5d6688b [Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414) 2024-06-25 09:56:06 +00:00
f23871e9ee [Doc] Add notice about breaking changes to VLMs (#5818) 2024-06-25 01:25:03 -07:00
e9de9dd551 [ci] Remove aws template (#5757)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-24 21:09:02 -07:00
ba991d5c84 [Bugfix] Fix FlexibleArgumentParser replaces _ with - for actual args (#5795) 2024-06-24 17:01:19 -06:00
1744cc99ba [Doc] Add Phi-3-medium to list of supported models (#5788) 2024-06-24 10:48:55 -07:00
e72dc6cb35 [Doc] Add "Suggest edit" button to doc pages (#5789) 2024-06-24 10:26:17 -07:00
c246212952 [doc][faq] add warning to download models for every nodes (#5783) 2024-06-24 15:37:42 +08:00
edd5fe5fa2 [Bugfix] Add phi3v resize for dynamic shape and fix torchvision requirement (#5772) 2024-06-24 12:11:53 +08:00
5d4d90536f [Distributed] Add send and recv helpers (#5719) 2024-06-23 14:42:28 -07:00
6c916ac8a8 [BugFix] [Kernel] Add Cutlass2x fallback kernels (#5744)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-23 21:07:11 +00:00
832ea88fcb [core][distributed] improve shared memory broadcast (#5754) 2024-06-22 10:00:43 -07:00
8c00f9c15d [Docs][TPU] Add installation tip for TPU (#5761) 2024-06-21 23:09:40 -07:00
0cbc1d2b4f [Bugfix] Fix pin_lora error in TPU executor (#5760) 2024-06-21 22:25:14 -07:00
ff9ddbceee [Misc] Remove #4789 workaround left in vllm/entrypoints/openai/run_batch.py (#5756) 2024-06-22 03:33:12 +00:00
9c62db07ed [Model] Support Qwen-VL and Qwen-VL-Chat models with text-only inputs (#5710)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-22 02:07:08 +00:00
cf90ae0123 [CI][Hardware][Intel GPU] add Intel GPU(XPU) ci pipeline (#5616) 2024-06-21 17:09:34 -07:00
f5dda63eb5 [LoRA] Add support for pinning lora adapters in the LRU cache (#5603) 2024-06-21 15:42:46 -07:00
7187507301 [ci][test] fix ca test in main (#5746) 2024-06-21 14:04:26 -07:00
f1e72cc19a [BugFix] exclude version 1.15.0 for modelscope (#5668) 2024-06-21 13:15:48 -06:00
5b15bde539 [Doc] Documentation on supported hardware for quantization methods (#5745) 2024-06-21 12:44:29 -04:00
bd620b01fb [Kernel][CPU] Add Quick gelu to CPU (#5717) 2024-06-21 06:39:40 +00:00
d9a252bc8e [Core][Distributed] add shm broadcast (#5399)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-06-21 05:12:35 +00:00
67005a07bc [Bugfix] Add fully sharded layer for QKVParallelLinearWithLora (#5665)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-21 04:46:28 +00:00
c35e4a3dd7 [BugFix] Fix test_phi3v.py (#5725) 2024-06-21 04:45:34 +00:00
1f5674218f [Kernel] Add punica dimension for Qwen2 LoRA (#5441) 2024-06-20 17:55:41 -07:00
b12518d3cf [Model] MLPSpeculator speculative decoding support (#4947)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>

Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: Davis Wertheimer <Davis.Wertheimer@ibm.com>
2024-06-20 20:23:12 -04:00
6c5b7af152 [distributed][misc] use fork by default for mp (#5669) 2024-06-20 17:06:34 -07:00
8065a7e220 [Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 17:00:13 -06:00
3f3b6b2150 [Bugfix] Fix the CUDA version check for FP8 support in the CUTLASS kernels (#5715) 2024-06-20 18:36:10 +00:00
a7dcc62086 [Kernel] Update Cutlass int8 kernel configs for SM80 (#5275)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-20 13:33:21 +00:00
ad137cd111 [Model] Port over CLIPVisionModel for VLMs (#5591) 2024-06-20 11:52:09 +00:00
111af1fa2c [Kernel] Update Cutlass int8 kernel configs for SM90 (#5514)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-20 06:37:08 +00:00
1b2eaac316 [Bugfix][Doc] FIx Duplicate Explicit Target Name Errors (#5703) 2024-06-19 23:10:47 -07:00
3730a1c832 [Misc] Improve conftest (#5681) 2024-06-19 19:09:21 -07:00
949e49a685 [ci] Limit num gpus if specified for A100 (#5694)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-19 16:30:03 -07:00
4a30d7e3cc [Misc] Add per channel support for static activation quantization; update w8a8 schemes to share base classes (#5650) 2024-06-19 18:06:44 -04:00
e83db9e7e3 [Doc] Update docker references (#5614)
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
2024-06-19 15:01:45 -07:00
78687504f7 [Bugfix] AsyncLLMEngine hangs with asyncio.run (#5654) 2024-06-19 13:57:12 -07:00
d571ca0108 [ci][distributed] add tests for custom allreduce (#5689) 2024-06-19 20:16:04 +00:00
afed90a034 [Frontend][Bugfix] Fix preemption_mode -> preemption-mode for CLI arg in arg_utils.py (#5688) 2024-06-19 14:41:42 -04:00
3ee5c4bca5 [ci] Add A100 queue into AWS CI template (#5648)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-19 08:42:13 -06:00
e9c2732b97 [CI/Build] Add tqdm to dependencies (#5680) 2024-06-19 08:37:33 -06:00
d8714530d1 [Misc]Add param max-model-len in benchmark_latency.py (#5629) 2024-06-19 18:19:08 +08:00
7d46c8d378 [Bugfix] Fix sampling_params passed incorrectly in Phi3v example (#5684) 2024-06-19 17:58:32 +08:00
da971ec7a5 [Model] Add FP8 kv cache for Qwen2 (#5656) 2024-06-19 09:38:26 +00:00
3eea74889f [misc][distributed] use 127.0.0.1 for single-node (#5619) 2024-06-19 08:05:00 +00:00
f758aed0e8 [Bugfix][CI/Build][AMD][ROCm]Fixed the cmake build bug which generate garbage on certain devices (#5641) 2024-06-18 23:21:29 -07:00
e5150f2c28 [Bugfix] Added test for sampling repetition penalty bug. (#5659)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-19 06:03:55 +00:00
59a1eb59c9 [Bugfix] Fix Phi-3 Long RoPE scaling implementation (#5628) 2024-06-19 01:46:38 +00:00
6820724e51 [Bugfix] Fix w8a8 benchmarks for int8 case (#5643) 2024-06-19 00:33:25 +00:00
b23ce92032 [Bugfix] Fix CUDA version check for mma warning suppression (#5642) 2024-06-18 23:48:49 +00:00
2bd231a7b7 [Doc] Added cerebrium as Integration option (#5553) 2024-06-18 15:56:59 -07:00
8a173382c8 [Bugfix] Fix for inconsistent behaviour related to sampling and repetition penalties (#5639)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-18 14:18:37 -07:00
07feecde1a [Model] LoRA support added for command-r (#5178) 2024-06-18 11:01:21 -07:00
19091efc44 [ci] Setup Release pipeline and build release wheels with cache (#5610)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-18 11:00:36 -07:00
95db455e7f [Misc] Add channel-wise quantization support for w8a8 dynamic per token activation quantization (#5542) 2024-06-18 12:45:05 -04:00
7879f24dcc [Misc] Add OpenTelemetry support (#4687)
This PR adds basic support for OpenTelemetry distributed tracing.
It includes changes to enable tracing functionality and improve monitoring capabilities.

I've also added a markdown with print-screens to guide users how to use this feature. You can find it here
2024-06-19 01:17:03 +09:00
13db4369d9 [ci] Deprecate original CI template (#5624)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-18 14:26:20 +00:00
4ad7b53e59 [CI/Build][Misc] Update Pytest Marker for VLMs (#5623) 2024-06-18 13:10:04 +00:00
f0cc0e68e3 [Misc] Remove import from transformers logging (#5625) 2024-06-18 12:12:19 +00:00
db5ec52ad7 [bugfix][distributed] improve p2p capability test (#5612)
[bugfix][distributed] do not error if two processes do not agree on p2p capability (#5612)
2024-06-18 07:21:05 +00:00
114d7270ff [CI] Avoid naming different metrics with the same name in performance benchmark (#5615) 2024-06-17 21:37:18 -07:00
32c86e494a [Misc] Fix typo (#5618) 2024-06-17 20:58:30 -07:00
8eadcf0b90 [misc][typo] fix typo (#5620) 2024-06-17 20:54:57 -07:00
5002175e80 [Kernel] Add punica dimensions for Granite 13b (#5559)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-06-18 03:54:11 +00:00
daef218b55 [Model] Initialize Phi-3-vision support (#4986) 2024-06-17 19:34:33 -07:00
fa9e385229 [Speculative Decoding 1/2 ] Add typical acceptance sampling as one of the sampling techniques in the verifier (#5131) 2024-06-17 21:29:09 -05:00
26e1188e51 [Fix] Use utf-8 encoding in entrypoints/openai/run_batch.py (#5606) 2024-06-17 23:16:10 +00:00
a3e8a05d4c [Bugfix] Fix KV head calculation for MPT models when using GQA (#5142) 2024-06-17 15:26:41 -07:00
e441bad674 [Optimization] use a pool to reuse LogicalTokenBlock.token_ids (#5584) 2024-06-17 22:08:05 +00:00
1b44aaf4e3 [bugfix][distributed] fix 16 gpus local rank arrangement (#5604) 2024-06-17 21:35:04 +00:00
9e4e6fe207 [CI] the readability of benchmarking and prepare for dashboard (#5571)
[CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571)
2024-06-17 11:41:08 -07:00
ab66536dbf [CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) 2024-06-17 14:36:10 -04:00
728c4c8a06 [Hardware][Intel GPU] Add Intel GPU(XPU) inference backend (#3814)
Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Abhilash Majumder <abhilash.majumder@intel.com>
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-06-17 11:01:25 -07:00
1f12122b17 [Misc] use AutoTokenizer for benchmark serving when vLLM not installed (#5588) 2024-06-17 09:40:35 -07:00
890d8d960b [Kernel] compressed-tensors marlin 24 support (#5435) 2024-06-17 12:32:48 -04:00
9e74d9d003 Correct alignment in the seq_len diagram. (#5592)
Co-authored-by: Liqian Chen <liqian.chen@deeplang.ai>
2024-06-17 12:05:33 -04:00
9333fb8eb9 [Model] Rename Phi3 rope scaling type (#5595) 2024-06-17 12:04:14 -04:00
e2b85cf86a Fix w8a8 benchmark and add Llama-3-8B (#5562) 2024-06-17 06:48:06 +00:00
845a3f26f9 [Doc] add debugging tips for crash and multi-node debugging (#5581) 2024-06-17 10:08:01 +08:00
f07d513320 [build][misc] limit numpy version (#5582) 2024-06-16 16:07:01 -07:00
4a6769053a [CI][BugFix] Flip is_quant_method_supported condition (#5577) 2024-06-16 14:07:34 +00:00
f31c1f90e3 Add basic correctness 2 GPU tests to 4 GPU pipeline (#5518) 2024-06-16 07:48:02 +00:00
3ce2c050dd [Fix] Correct OpenAI batch response format (#5554) 2024-06-15 16:57:54 -07:00
1c0afa13c5 [BugFix] Don't start a Ray cluster when not using Ray (#5570) 2024-06-15 16:30:51 -07:00
d919ecc771 add gptq_marlin test for bug report https://github.com/vllm-project/vllm/issues/5088 (#5145) 2024-06-15 13:38:16 -04:00
e691918e3b [misc] Do not allow to use lora with chunked prefill. (#5538)
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-06-15 14:59:36 +00:00
81fbb3655f [CI/Build] Test both text and token IDs in batched OpenAI Completions API (#5568) 2024-06-15 07:29:42 -04:00
0e9164b40a [mypy] Enable type checking for test directory (#5017) 2024-06-15 04:45:31 +00:00
1b8a0d71cf [Core][Bugfix]: fix prefix caching for blockv2 (#5364)
Signed-off-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-06-14 17:23:56 -07:00
bd7efe95d0 Add ccache to amd (#5555) 2024-06-14 17:18:22 -07:00
f5bb85b435 [Core][Distributed] improve p2p cache generation (#5528) 2024-06-14 14:47:45 -07:00
28c145eb57 [Bugfix] Fix typo in Pallas backend (#5558) 2024-06-14 14:40:09 -07:00
e2afb03c92 [Bugfix] Enable loading FP8 checkpoints for gpt_bigcode models (#5460)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-06-14 20:28:11 +00:00
6e2527a7cb [Doc] Update documentation on Tensorizer (#5471) 2024-06-14 11:27:57 -07:00
cdab68dcdb [Docs] Add ZhenFund as a Sponsor (#5548) 2024-06-14 11:17:21 -07:00
d1c3d7d139 [misc][distributed] fix benign error in is_in_the_same_node (#5512) 2024-06-14 10:59:28 -07:00
77490c6f2f [Core] Remove duplicate processing in async engine (#5525) 2024-06-14 10:04:42 -07:00
48f589e18b [mis] fix flaky test of test_cuda_device_count_stateless (#5546) 2024-06-14 10:02:23 -07:00
348616ac4b [Kernel] Suppress mma.sp warning on CUDA 12.5 and later (#5401) 2024-06-14 10:02:00 -07:00
15985680e2 [ Misc ] Rs/compressed tensors cleanup (#5432)
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
2024-06-14 10:01:46 -07:00
d74674bbd9 [Misc] Fix arg names (#5524) 2024-06-14 09:47:44 -07:00
703475f6c2 [Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (#5516) 2024-06-14 09:30:15 -07:00
d47af2bc02 [CI/Build] Disable LLaVA-NeXT CPU test (#5529) 2024-06-14 09:27:30 -07:00
319ad7f1d3 [CI/Build][Misc] Add CI that benchmarks vllm performance on those PRs with perf-benchmarks label (#5073)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-06-13 22:36:20 -07:00
0f0d8bc065 bump version to v0.5.0.post1 (#5522) 2024-06-13 19:42:06 -07:00
55d6361b13 [Misc] Fix arg names in quantizer script (#5507) 2024-06-13 19:02:53 -07:00
cd9c0d65d9 [Hardware][Intel] Support CPU inference with AVX2 ISA (#5452) 2024-06-13 17:22:24 -06:00
50eed24d25 Add cuda_device_count_stateless (#5473) 2024-06-13 16:06:49 -07:00
e38042d4af [Kernel] Disable CUTLASS kernels for fp8 (#5505) 2024-06-13 13:38:05 -07:00
33e3b37242 [CI/Build] Disable test_fp8.py (#5508) 2024-06-13 13:37:48 -07:00
1696efe6c9 [misc] fix format.sh (#5511) 2024-06-13 12:09:16 -07:00
6b0511a57b Revert "[Core] Remove unnecessary copies in flash attn backend" (#5478) 2024-06-13 11:22:50 -07:00
a8fda4f661 Seperate dev requirements into lint and test (#5474) 2024-06-13 11:22:41 -07:00
30299a41fa [MISC] Remove FP8 warning (#5472)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
2024-06-13 11:22:30 -07:00
85657b5607 [Kernel] Factor out epilogues from cutlass kernels (#5391)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: zifeitong <zifei.tong@parasail.io>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 11:22:19 -07:00
0ce7b952f8 [Doc] Update LLaVA docs (#5437)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-13 11:22:07 -07:00
39873476f8 [CI/Build] Simplify OpenAI server setup in tests (#5100) 2024-06-13 11:21:53 -07:00
03dccc886e [Misc] Add vLLM version getter to utils (#5098) 2024-06-13 11:21:39 -07:00
a65634d3ae [Docs] Add 4th meetup slides (#5509) 2024-06-13 10:18:26 -07:00
80aa7e91fc [Hardware][Intel] Optimize CPU backend and add more performance tips (#4971)
Co-authored-by: Jianan Gu <jianan.gu@intel.com>
2024-06-13 09:33:14 -07:00
bd43973522 [Kernel] Tune Qwen2MoE kernel configurations with tp2,4 (#5497)
Tune Qwen2-57B-A14B configs based on #4921

Throughput Performance
command: python benchmarks/benchmark_throughput.py --model=Qwen/Qwen2-57B-A14B-Instruct --input-len 1000 --output-len 50 -tp 2

A100 GPU

benchmark	no config	w/ PR
tp=2	10.53 requests/s, 11058.17 tokens/s	12.47 requests/s, 13088.57 tokens/s
tp=4	17.77 requests/s, 18662.95 tokens/s	20.20 requests/s, 21212.32 tokens/s
2024-06-13 09:01:10 -07:00
23ec72fa03 [CI/Build][REDO] Add is_quant_method_supported to control quantization test configurations (#5466) 2024-06-13 15:18:08 +00:00
c2637a613b [Kernel] w4a16 support for compressed-tensors (#5385)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-13 10:19:56 -04:00
88407532e7 [Bugfix]if the content is started with ":"(response of ping), client should i… (#5303)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 20:16:41 -07:00
916d219d62 [ci] Use sccache to build images (#5419)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 17:58:12 -07:00
ea3890a5f0 [Core][Distributed] code deduplication in tp&pp with coordinator(#5293)
[Core][Distributed] add coordinator to reduce code duplication in tp and pp (#5293)
2024-06-12 17:27:08 -07:00
2135cacb45 [Bugfix] Fix wrong multi_modal_input format for CPU runner (#5451) 2024-06-12 16:20:18 -07:00
7d19de2e9c [Frontend] Add "input speed" to tqdm postfix alongside output speed (#5425) 2024-06-12 18:42:12 -04:00
94a07bbdd8 [Bugfix] Fix typo in scheduler.py (requeset -> request) (#5470) 2024-06-12 21:59:44 +00:00
b8d4dfff9c [Doc] Update debug docs (#5438) 2024-06-12 14:49:31 -07:00
622d45128c [misc] add hint for AttributeError (#5462) 2024-06-12 21:46:35 +00:00
51602eefd3 [Frontend] [Core] Support for sharded tensorized models (#4990)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-12 14:13:52 -07:00
5cc50a531f [Bugfix] TYPE_CHECKING for MultiModalData (#5444) 2024-06-12 14:08:52 -07:00
5985e3427d [Kernel] Vectorized FP8 quantize kernel (#5396)
Inspired by #5146, this PR improves FP8 quantize kernel by vectorizing data transfer to better utilize memory bandwidth. Microbenchmark shows that this improved kernel can achieve 1.0x-1.5x speedup (especially when hidden size is large).

In details, we applied 3 optimizations:

- Use inverted scale so that most divisions are changed to multiplications.
- Unroll the loop by 4 times to improve ILP.
- Use vectorized 4 to transfer data between HBM and SRAM.
2024-06-12 14:07:26 -07:00
8b82a89997 [ci] Add AMD, Neuron, Intel tests for AWS CI and turn off default soft fail for GPU tests (#5464)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-12 14:00:18 -07:00
c3c2903e72 [Bugfix] Add device assertion to TorchSDPA (#5402) 2024-06-12 12:58:53 -07:00
1a8bfd92d5 [Hardware] Initial TPU integration (#5292) 2024-06-12 11:53:03 -07:00
847cdcca1c [CI] Upgrade codespell version. (#5381) 2024-06-12 10:06:14 -07:00
e3c12bf6d2 Revert "[CI/Build] Add is_quant_method_supported to control quantization test configurations" (#5463) 2024-06-12 10:03:24 -07:00
3dd6853bc8 [CI/Build] Add is_quant_method_supported to control quantization test configurations (#5253) 2024-06-12 09:58:02 -07:00
8f89d72090 [Doc] add common case for long waiting time (#5430) 2024-06-11 11:12:13 -07:00
99dac099ab [Core][Doc] Default to multiprocessing for single-node distributed case (#5230)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-06-11 11:10:41 -07:00
c4bd03c7c5 [Core][Distributed] add same-node detection (#5369) 2024-06-11 10:53:59 -07:00
dcbf4286af [Frontend] Customizable RoPE theta (#5197) 2024-06-11 10:42:26 -07:00
00e6a2dc53 [Bugfix] fix lora_dtype value type in arg_utils.py (#5398) 2024-06-11 10:40:23 -07:00
2e02311a1b [Bugfix] Fix MultiprocessingGPUExecutor.check_health when world_size == 1 (#5254) 2024-06-11 10:38:07 -07:00
89ec06c33b [Docs] [Spec decode] Fix docs error in code example (#5427) 2024-06-11 10:31:56 -07:00
9fde251bf0 [Doc] Add an automatic prefix caching section in vllm documentation (#5324)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-06-11 10:24:59 -07:00
4c2ffb28ff [Speculative decoding] Initial spec decode docs (#5400) 2024-06-11 10:15:40 -07:00
246598a6b1 [CI] docfix (#5410)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: ywang96 <ywang@roblox.com>
2024-06-11 01:28:50 -07:00
8bab4959be [Misc] Remove VLLM_BUILD_WITH_NEURON env variable (#5389) 2024-06-11 00:37:56 -07:00
3c4cebf751 [Doc][Typo] Fixing Missing Comma (#5403) 2024-06-11 00:20:28 -07:00
d8f31f2f8b [Doc] add debugging tips (#5409) 2024-06-10 23:21:43 -07:00
640052b069 [Bugfix][Frontend] Cleanup "fix chat logprobs" (#5026) 2024-06-10 22:36:46 -07:00
351d5e7b82 [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (#5312)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-06-11 10:30:31 +08:00
a008629807 [Misc] Various simplifications and typing fixes (#5368) 2024-06-11 10:29:02 +08:00
76477a93b7 [ci] Fix Buildkite agent path (#5392)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 18:58:07 -07:00
77c87beb06 [Doc] Add documentation for FP8 W8A8 (#5388) 2024-06-10 18:55:12 -06:00
114332b88e Bump version to v0.5.0 (#5384) 2024-06-10 15:56:06 -07:00
cb77ad836f [Docs] Alphabetically sort sponsors (#5386) 2024-06-10 15:17:19 -05:00
856c990041 [Docs] Add Docs on Limitations of VLM Support (#5383) 2024-06-10 09:53:50 -07:00
c5602f0baa [ci] Mount buildkite agent on Docker container to upload benchmark results (#5330)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 09:22:34 -07:00
f7f9c5f97b [ci] Use small_cpu_queue for doc build (#5331)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-10 09:21:11 -07:00
2c0d933594 [Bugfix] Fix LLaVA-NeXT (#5380) 2024-06-10 15:38:47 +00:00
774d1035e4 [Feature][Frontend]: Continued stream_options implementation also in CompletionRequest (#5319) 2024-06-10 14:22:09 +00:00
6b29d6fe70 [Model] Initial support for LLaVA-NeXT (#4199)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-06-10 12:47:15 +00:00
0bfa1c4f13 [Misc] Improve error message when LoRA parsing fails (#5194) 2024-06-10 19:38:49 +08:00
c81da5f56d [misc][typo] fix typo (#5372) 2024-06-10 09:51:02 +00:00
68bc81703e [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (#5374) 2024-06-10 09:13:39 +00:00
5884c2b454 [Misc] Update to comply with the new compressed-tensors config (#5350)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-06-10 03:49:46 +00:00
45f92c00cf [Bugfix] Fix KeyError: 1 When Using LoRA adapters (#5164) 2024-06-09 16:23:14 -07:00
5467ac3196 [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00
5d7e3d0176 [mis][ci/test] fix flaky test in test_sharded_state_loader.py (#5361)
[mis][ci/test] fix flaky test in tests/test_sharded_state_loader.py (#5361)
2024-06-09 03:50:14 +00:00
0373e1837e [Core][CUDA Graph] add output buffer for cudagraph (#5074)
[Core][CUDA Graph] add output buffer for cudagraph to reduce memory footprint (#5074)
2024-06-08 19:14:43 -07:00
c09dade2a2 [Misc][Breaking] Change FP8 checkpoint format from act_scale -> input_scale (#5353) 2024-06-08 13:54:05 -04:00
8ea5e44a43 [CI/Test] improve robustness of test (vllm_runner) (#5357)
[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
2024-06-08 08:59:20 +00:00
9fb900f90c [CI/Test] improve robustness of test (hf_runner) (#5347)
[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
2024-06-07 22:31:32 -07:00
c96fc06747 [ROCm][AMD] Use pytorch sdpa math backend to do naive attention (#4965) 2024-06-07 19:13:12 -07:00
b3376e5c76 [Misc] Add args for selecting distributed executor to benchmarks (#5335) 2024-06-08 09:20:16 +08:00
e69ded7d1c [Bug Fix] Fix the support check for FP8 CUTLASS (#5352)
Bug description:
With torch 2.4.0.dev20240603+cu121,
cutlass_fp8_supported outputs False, and the (capability, version) before the comparison is (90, 11111111112)

This PR fixes the support check for FP8 CUTLASS ( cutlass_fp8_supported) which was introduced in https://github.com/vllm-project/vllm/pull/5183.
2024-06-08 00:42:05 +00:00
767c727a81 fix DbrxFusedNormAttention missing cache_config (#5340)
Co-authored-by: team <calvinn.ng@ahrefs.com>
2024-06-07 14:10:21 -07:00
6840a71610 [Misc] Remove unused cuda_utils.h in CPU backend (#5345) 2024-06-07 14:09:13 -07:00
7a9cb294ae [Frontend] Add OpenAI Vision API Support (#5237)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-06-07 11:23:32 -07:00
ca3ea51bde [Kernel] Dynamic Per-Token Activation Quantization (#5037)
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-06-07 09:36:26 -07:00
dc49fb892c Addition of lacked ignored_seq_groups in _schedule_chunked_prefill (#5296) 2024-06-07 13:35:42 +00:00
18a277b52d Remove Ray health check (#4693) 2024-06-07 10:01:56 +00:00
8d75fe48ca [Kernel] Switch fp8 layers to use the CUTLASS kernels (#5183)
Switching from torch._scaled_mm to vLLM's cutlass fp8 kernels when supported as we are seeing 5-15% improvement in e2e performance on neuralmagic/Meta-Llama-3-8B-Instruct-FP8

see https://docs.google.com/spreadsheets/d/1GiAnmzyGHgZ6zL_LDSTm35Bdrt4A8AaFEurDlISYYA4/ for some quick e2e benchmarks and #5144 for comparisons across different GEMM sizes.
2024-06-07 08:42:35 +00:00
388596c914 [Misc][Utils] allow get_open_port to be called for multiple times (#5333) 2024-06-06 22:15:11 -07:00
baa15a9ec3 [Feature][Frontend]: Add support for stream_options in ChatCompletionRequest (#5135) 2024-06-07 03:29:24 +00:00
15063741e3 [Misc] Missing error message for custom ops import (#5282) 2024-06-06 20:17:21 -07:00
ccdc490dda [Core] Change LoRA embedding sharding to support loading methods (#5038) 2024-06-06 19:07:57 -07:00
a31cab7556 [Core] Avoid copying prompt/output tokens if no penalties are used (#5289) 2024-06-06 18:12:00 -07:00
828da0d44e [Frontend] enable passing multiple LoRA adapters at once to generate() (#5300) 2024-06-06 15:48:13 -05:00
abe855d637 [Kernel] Retune Mixtral 8x22b configs for FP8 on H100 (#5294) 2024-06-06 09:29:29 -07:00
4efff036f0 Bugfix: fix broken of download models from modelscope (#5233)
Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
2024-06-06 09:28:10 -07:00
89c920785f [CI/Build] Update vision tests (#5307) 2024-06-06 05:17:18 -05:00
7b0a0dfb22 [Frontend][Core] Update Outlines Integration from FSM to Guide (#4109)
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Breno Faria <breno.faria@intrafind.com>
2024-06-05 16:49:12 -07:00
3a6ae1d33c [CI] Disable flash_attn backend for spec decode (#5286) 2024-06-05 15:49:27 -07:00
8f1729b829 [Docs] Add Ray Summit CFP (#5295) 2024-06-05 15:25:18 -07:00
6a7c7711a2 [Misc] Skip for logits_scale == 1.0 (#5291) 2024-06-05 15:19:02 -07:00
0f83ddd4d7 [Bugfix][Frontend/Core] Don't log exception when AsyncLLMEngine gracefully shuts down. (#5290) 2024-06-05 15:18:12 -07:00
065aff6c16 [Bugfix] Make EngineArgs use named arguments for config construction (#5285) 2024-06-05 15:16:56 -07:00
3d33e372a1 [BugFix] Fix log message about default max model length (#5284) 2024-06-05 14:53:16 -07:00
faf71bcd4b [Speculative Decoding] Add ProposerWorkerBase abstract class (#5252) 2024-06-05 14:53:05 -07:00
f270a39537 [Docs] Add Sequoia as sponsors (#5287) 2024-06-05 18:02:56 +00:00
51a08e7d8f [Kernel] Re-tune Mixtral MoE configurations for FP8 on H100 (#5238) 2024-06-05 10:59:14 -07:00
eb8fcd2666 [BugFix] Apply get_cached_tokenizer to the tokenizer setter of LLM (#5207)
Co-authored-by: qiujiawei9 <qiujiawei9@jd.com>
2024-06-05 10:59:02 -07:00
5563a4dea8 [Model] Correct Mixtral FP8 checkpoint loading (#5231) 2024-06-05 10:58:50 -07:00
ccd4f129e8 [Kernel] Add GPU architecture guards to the CUTLASS w8a8 kernels to reduce binary size (#5157)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-06-05 10:44:15 -07:00
02cc3b51a7 [misc] benchmark_serving.py -- add ITL results and tweak TPOT results (#5263) 2024-06-05 10:17:51 -07:00
d5b1eb081e [CI] Add nightly benchmarks (#5260) 2024-06-05 09:42:08 -07:00
f0a500545f [Frontend] OpenAI API server: Add add_special_tokens to ChatCompletionRequest (default False) (#5278) 2024-06-05 09:32:58 -07:00
c65146e75e [Misc] Fix docstring of get_attn_backend (#5271) 2024-06-05 09:18:59 -07:00
41ca62cf03 [Misc] Add CustomOp interface for device portability (#5255) 2024-06-05 09:18:19 -07:00
974fc9b845 [Bugfix] Fix prompt_logprobs when SamplingParams.detokenize is set to True (#5226) 2024-06-04 19:37:28 -07:00
fee4dcc33a [Misc] update collect env (#5261) 2024-06-04 17:29:09 -05:00
650a4cc55e [Misc] Add transformers version to collect_env.py (#5259) 2024-06-04 12:52:28 -07:00
9ca62d8668 [CI] mark AMD test as softfail to prevent blockage (#5256) 2024-06-04 11:34:53 -07:00
45c35f0d58 [CI/Build] Reducing CPU CI execution time (#5241) 2024-06-04 10:26:40 -07:00
9ba093b4f4 [CI/Build] Simplify model loading for HfRunner (#5251) 2024-06-04 10:09:19 -07:00
27208be66e [Kernel] Add back batch size 1536 and 3072 to MoE tuning (#5242) 2024-06-04 09:58:47 -07:00
87d5abef75 [Bugfix] Fix a bug caused by pip install setuptools>=49.4.0 for CPU backend (#5249) 2024-06-04 09:57:51 -07:00
ec784b2526 [CI/Build] Add inputs tests (#5215) 2024-06-03 21:01:46 -07:00
a58f24e590 [Bugfix] Fix torch.compile() error when using MultiprocessingGPUExecutor (#5229) 2024-06-03 20:55:50 -07:00
f42a006b15 [Bugfix]: During testing, use pytest monkeypatch for safely overriding the env var that indicates the vLLM backend (#5210) 2024-06-03 20:32:57 -07:00
3a434b07ed [Kernel] Enhance MoE benchmarking & tuning script (#4921) 2024-06-03 20:06:59 -07:00
bd0e7802e0 [Bugfix] Add warmup for prefix caching example (#5235) 2024-06-03 19:36:41 -07:00
06b2550cbb [Bugfix] Support prompt_logprobs==0 (#5217) 2024-06-03 17:59:30 -07:00
f775a07e30 [FRONTEND] OpenAI tools support named functions (#5032) 2024-06-03 18:25:29 -05:00
4f0d17c05c New CI template on AWS stack (#5110)
Signed-off-by: kevin <kevin@anyscale.com>
2024-06-03 16:16:43 -07:00
10c38e3e46 [Misc]: Implement CPU/GPU swapping in BlockManagerV2 (#3834) 2024-06-03 13:37:11 -07:00
cafb8e06c5 [CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-03 10:39:50 -07:00
cbb2f59cc8 [Kernel] Pass a device pointer into the quantize kernel for the scales (#5159) 2024-06-03 09:52:30 -07:00
0ab278ca31 [Core] Remove unnecessary copies in flash attn backend (#5138) 2024-06-03 09:39:31 -07:00
7a64d24aad [Core] Support image processor (#4197) 2024-06-02 22:56:41 -07:00
dfbe60dc62 [Misc] Simplify code and fix type annotations in conftest.py (#5118) 2024-06-02 16:05:50 -07:00
a66cf40b20 [Kernel][ROCm][AMD] enable fused topk_softmax kernel for moe layer (#4927)
This PR enables the fused topk_softmax kernel used in moe layer for HIP
2024-06-02 14:13:26 -07:00
f790ad3c50 [Frontend][OpenAI] Support for returning max_model_len on /v1/models response (#4643) 2024-06-02 08:06:13 +00:00
ed59a7ed23 Update test_ignore_eos (#4898) 2024-06-02 02:21:53 +00:00
044793d8df [BugFix] Prevent LLM.encode for non-generation Models (#5184)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-06-01 23:35:41 +00:00
c2d6d2f960 [Bugfix]: Fix issues related to prefix caching example (#5177) (#5180) 2024-06-01 15:53:52 -07:00
8279078e21 [Bugfix] Remove deprecated @abstractproperty (#5174) 2024-06-01 22:40:25 +00:00
b9c0605a8e [Feature][Kernel] Support bitsandbytes quantization and QLoRA (#4776) 2024-06-01 14:51:10 -06:00
37464a0f74 [Bugfix] Fix call to init_logger in openai server (#4765) 2024-06-01 17:18:50 +00:00
c354072828 [Minor] Fix the path typo in loader.py: save_sharded_states.py -> save_sharded_state.py (#5151)
Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
2024-06-01 17:11:22 +00:00
f081c3ce4b [Kernel] Update Cutlass fp8 configs (#5144)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-06-01 08:46:07 +00:00
260d119e86 [Kernel] Refactor CUTLASS kernels to always take scales that reside on the GPU (#5137) 2024-06-01 06:45:32 +00:00
a360ff80bb [CI/Build] CMakeLists: build all extensions' cmake targets at the same time (#5034) 2024-05-31 22:06:45 -06:00
1197e02141 [Build] Guard against older CUDA versions when building CUTLASS 3.x kernels (#5168) 2024-05-31 17:21:38 -07:00
657579113f [Doc] Add checkmark for GPTBigCodeForCausalLM LoRA support (#5171) 2024-05-31 17:20:19 -07:00
e9899fb7a4 [Model] Enable FP8 QKV in MoE and refine kernel tuning script (#5039) 2024-05-31 14:29:19 -07:00
a377f0bd5e [Misc]: optimize eager mode host time (#4196)
Co-authored-by: xuhao <xuhao@cambricon.com>
2024-05-31 13:14:50 +08:00
e9d3aa04f6 Revert "[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5)" (#5149) 2024-05-30 22:00:26 -07:00
a22dea54d3 [Model] Support MAP-NEO model (#5081)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-05-30 19:24:41 -07:00
533c217792 Fix cutlass sm_90a vesrion in CMakeList 2024-05-31 02:13:01 +00:00
6d21fa1cad [Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::ordered_metadata modifier (introduced with PTX 8.5) (#5136) 2024-05-30 21:02:11 -05:00
b35be5403f [Bugfix] Avoid Warnings in SparseML Activation Quantization (#5120) 2024-05-30 17:04:37 -07:00
45a1a69b98 [Build] Disable sm_90a in cu11 (#5141) 2024-05-30 14:37:16 -07:00
87a658c812 Bump version to v0.4.3 (#5046) 2024-05-30 11:13:46 -07:00
429d89720e add doc about serving option on dstack (#3074)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-05-30 10:11:07 -07:00
a9bcc7afb2 [Doc] Use intersphinx and update entrypoints docs (#5125) 2024-05-30 09:59:23 -07:00
d79d9eaaff [Misc] remove duplicate definition of seq_lens_tensor in model_runner.py (#5129) 2024-05-30 06:56:19 -07:00
f758505c73 [CI/Build] increase wheel size limit to 200 MB (#5130) 2024-05-30 06:29:48 -07:00
d910816c73 [Bugfix] Automatically Detect SparseML models (#5119) 2024-05-30 12:58:37 +00:00
87d41c849d [BUGFIX] [FRONTEND] Correct chat logprobs (#5029)
Co-authored-by: Breno Faria <breno.faria@intrafind.com>
2024-05-30 02:52:14 -07:00
e07aff9e52 [CI/Build] Docker cleanup functionality for amd servers (#5112)
Co-authored-by: Alexey Kondratiev <alexey.kondratiev@amd.com>
Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Co-authored-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Co-authored-by: omkarkakarparthi <okakarpa>
2024-05-30 03:27:39 +00:00
5bf185a1c4 [Bugfix] gptq_marlin: Ensure g_idx_sort_indices is not a Parameter (#5108) 2024-05-30 00:30:18 +00:00
4fbcb0f27e [Doc][Build] update after removing vllm-nccl (#5103)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-05-29 23:51:18 +00:00
7c3604fb68 [Bugfix] logprobs is not compatible with the OpenAI spec #4795 (#5031) 2024-05-29 16:13:22 -07:00
b1c255630d [Core] Avoid the need to pass None values to Sequence.inputs (#5099) 2024-05-29 16:05:01 -07:00
eb6c50cdc2 [Bugfix][CI/Build] Fix codespell failing to skip files in git diff (#5097) 2024-05-29 16:02:54 -07:00
eecd864388 [Bugfix][CI/Build] Fix test and improve code for merge_async_iterators (#5096) 2024-05-29 16:02:25 -07:00
ae495c74ea [Doc]Replace deprecated flag in readme (#4526) 2024-05-29 22:26:33 +00:00
4238bc82f2 [Core] Cross-attention KV caching and memory-management (towards eventual encoder/decoder model support) (#4837) 2024-05-29 16:09:13 +00:00
594392d27a [Core][Distributed] improve p2p access check (#4992) 2024-05-29 11:29:07 +00:00
18c1f16d86 [Bugfix] Fix arguments passed to Sequence in stop checker test (#5092) 2024-05-29 07:16:41 +00:00
5bd3c65072 [Core][Optimization] remove vllm-nccl (#5091) 2024-05-29 05:13:52 +00:00
616e600e0b [Misc] add gpu_memory_utilization arg (#5079)
Signed-off-by: pandyamarut <pandyamarut@gmail.com>
2024-05-28 17:16:18 -07:00
dfba529b40 [Bugfix] Remove the last EOS token unless explicitly specified (#5077) 2024-05-28 17:15:35 -07:00
5ae5ed1e60 [Core] Consolidate prompt arguments to LLM engines (#4328)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-05-28 13:29:31 -07:00
290f4ada2b [Docs] Add Dropbox as sponsors (#5089) 2024-05-28 10:29:09 -07:00
dd8de11f0a [Kernel][ROCm][AMD] Add fused_moe Triton configs for MI300X (#4951)
This PR adds Triton kernel configs for the MoE kernel for MI300X
2024-05-28 16:03:23 +00:00
9ba415588a [BugFix] Fix Embedding Models with TP>1 (#5075) 2024-05-28 08:32:42 -07:00
d4f3985907 [Core] Sliding window for block manager v2 (#4545)
Co-authored-by: Ruth Evans <ruthevans@Ruths-MacBook-Pro.local>
2024-05-28 11:07:07 +09:00
890aa93d27 [Model] Add support for falcon-11B (#5069) 2024-05-27 16:41:43 -07:00
fbdb7b3ee2 [Core] Allow AQLM on Pascal (#5058) 2024-05-27 15:26:14 -07:00
1102bef219 [Bugfix / Core] Prefix Caching Guards (merged with main) (#4846)
Co-authored-by: rsnm2 <rshaw@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-27 15:18:17 -07:00
f17a1a8f96 [Misc] Make Serving Benchmark More User-friendly (#5044) 2024-05-25 17:28:16 +00:00
d5a1697772 [Dynamic Spec Decoding] Minor fix for disabling speculative decoding (#5000) 2024-05-25 10:00:14 -07:00
325c119961 [Misc] add logging level env var (#5045) 2024-05-24 23:49:49 -07:00
8e192ff967 [Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3-Small model (#4799)
Co-authored-by: beagleski <yunanzhang@microsoft.com>
Co-authored-by: bapatra <bapatra@microsoft.com>
Co-authored-by: Barun Patra <codedecde@users.noreply.github.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-24 22:00:52 -07:00
e64fde4b01 [Core][Bugfix]: fix prefix caching for blockv2 (#4764)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-24 10:07:09 -07:00
919770957f [Bugfix] Fix Mistral v0.3 Weight Loading (#5005)
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-24 12:28:27 +00:00
6a50f4cafa [Doc] add ccache guide in doc (#5012)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-23 23:21:54 +00:00
e3470f8753 [Core]: Option To Use Prompt Token Ids Inside Logits Processor (#4985)
Co-authored-by: Elisei Smirnov <el.smirnov@innopolis.university>
2024-05-23 22:04:24 +00:00
a1242324c9 [Kernel] Initial Activation Quantization Support (#4525)
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-05-23 21:29:18 +00:00
5eda2ea02a [Core][1/N] Support send/recv in PyNCCL Groups (#4988)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
2024-05-23 09:54:48 -07:00
2ba80bed27 [Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is not defined (#5009) 2024-05-23 09:08:58 -07:00
6066253296 Marlin 24 prefill performance improvement (about 25% better on average) (#4983) 2024-05-23 02:39:27 -04:00
ee3eea0a1b [Misc] Take user preference in attention selector (#4960) 2024-05-23 07:55:56 +09:00
a36de682d4 [Minor] Fix small typo in llama.py: QKVParallelLinear -> QuantizationConfig (#4991) 2024-05-22 22:26:56 +00:00
eb6d3c264d [Core] Eliminate parallel worker per-step task scheduling overhead (#4894) 2024-05-23 06:17:27 +09:00
97b030005c [Model] LoRA gptbigcode implementation (#3949) 2024-05-22 13:58:59 -07:00
a3a73ab069 [Misc] Load FP8 kv-cache scaling factors from checkpoints (#4893)
The 2nd PR for #4532.

This PR supports loading FP8 kv-cache scaling factors from a FP8 checkpoint (with .kv_scale parameter).
2024-05-22 13:28:20 -07:00
8674f9880e [Kernel] Fixup for CUTLASS kernels in CUDA graphs (#4954)
Pass the CUDA stream into the CUTLASS GEMMs, to avoid future issues with CUDA graphs
2024-05-22 14:10:43 +00:00
c74c913bfb [misc] remove comments that were supposed to be removed (#4977) 2024-05-22 09:02:58 -04:00
5f6d10c14c [CI/Build] Enforce style for C++ and CUDA code with clang-format (#4722) 2024-05-22 07:18:41 +00:00
9b9a10d6cb [Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 01:32:35 -04:00
99eff67ba9 [Bugfix][Kernel] Add head size check for attention backend selection (#4944) 2024-05-21 15:33:25 -04:00
14772eeb8e [Bugfix] Fix flag name for max_seq_len_to_capture (#4935)
Signed-off-by: kerthcet <kerthcet@gmail.com>
2024-05-21 09:30:52 -07:00
757b62c495 [CI/Build] Codespell ignore build/ directory (#4945) 2024-05-21 09:06:10 -07:00
e941f88584 [Docs] Add acknowledgment for sponsors (#4925) 2024-05-21 00:17:25 -07:00
f12c3b5b3d [Model] Add Phi-2 LoRA support (#4886) 2024-05-21 14:24:17 +09:00
d130b573a0 [Model] add rope_scaling support for qwen2 (#4930) 2024-05-21 05:22:22 +00:00
65ae8c2c8f [Core] Fix scheduler considering "no LoRA" as "LoRA" (#4897) 2024-05-20 17:48:32 -07:00
c3af44722c [Doc]Add documentation to benchmarking script when running TGI (#4920) 2024-05-20 20:16:57 +00:00
1937e29848 [Core] Sharded State Loader download from HF (#4889) 2024-05-20 11:46:12 -07:00
f0eecee610 [Bugfix] Fix dummy weight for fp8 (#4916)
Allow dummy load format for fp8,
torch.uniform_ doesn't support FP8 at the moment

Co-authored-by: Mor Zusman <morz@ai21.com>
2024-05-20 18:44:25 +00:00
943e72ca56 [Build/CI] Enabling AMD Entrypoints Test (#4834)
Co-authored-by: Alexey Kondratiev <alexey.kondratiev@amd.com>
2024-05-20 11:29:28 -07:00
546a97ef69 [Misc]: allow user to specify port in distributed setting (#4914) 2024-05-20 17:45:06 +00:00
da5a0b539d Remove marlin warning (#4918) 2024-05-20 14:55:34 +00:00
6287537a0c [Model] LLaVA model refactor (#4910) 2024-05-20 08:11:25 +00:00
b57e6c5949 [Kernel] Add flash-attn back (#4907) 2024-05-19 18:11:30 -07:00
27ce85476e [Kernel] Add marlin_24 unit tests (#4901) 2024-05-19 11:37:34 -04:00
f68470e803 [Bugfix][Model] Add base class for vision-language models (#4809) 2024-05-19 00:13:33 -07:00
2e9a2227ec [Lora] Support long context lora (#4787)
Currently we need to call rotary embedding kernel for each LoRA, which makes it hard to serve multiple long context length LoRA. Add batched rotary embedding kernel and pipe it through.

It replaces the rotary embedding layer to the one that is aware of multiple cos-sin-cache per scaling factors.

Follow up of https://github.com/vllm-project/vllm/pull/3095/files
2024-05-18 16:05:23 +09:00
c0724fc915 [ROCm][Hardware][AMD] Adding Navi21 to fallback to naive attention if Triton is not used (#4658) 2024-05-18 05:09:11 +00:00
86b45ae065 [Bugfix] Relax tiktoken to >= 0.6.0 (#4890) 2024-05-17 12:58:52 -06:00
c5711ef985 [Doc] Update Ray Data distributed offline inference example (#4871) 2024-05-17 10:52:11 -07:00
48d5985a08 Sync huggingface modifications of qwen Moe model (#4774) 2024-05-17 09:43:19 -07:00
33e0823de5 [Bugfix] fix rope error when load models with different dtypes (#4835) 2024-05-17 18:43:34 +09:00
26148120b3 [Build/CI] Extending the set of AMD tests with Regression, Basic Correctness, Distributed, Engine, Llava Tests (#4797) 2024-05-16 20:58:25 -07:00
0150a10630 [Frontend] OpenAI API server: Do not add bos token by default when encoding (#4688) 2024-05-16 18:47:22 -07:00
8e7fb5d43a Support to serve vLLM on Kubernetes with LWS (#4829)
Signed-off-by: kerthcet <kerthcet@gmail.com>
2024-05-16 16:37:29 -07:00
9a31a817a8 [Bugfix] Fix FP8 KV cache support (#4869) 2024-05-16 22:42:29 +00:00
2060e93659 [Kernel] Add w8a8 CUTLASS kernels (#4749) 2024-05-16 18:32:50 -04:00
8435b207af [Kernel] Add punica dimension for Qwen1.5-32B LoRA (#4850)
Co-authored-by: Silencio <silencio@adsl-99-6-187-6.dsl.irvnca.sbcglobal.net>
2024-05-16 11:16:09 -07:00
10fa9eea21 [Misc] remove old comments (#4866) 2024-05-16 11:07:41 -07:00
e08188081b [Core][Distributed] remove graph mode function (#4818) 2024-05-16 10:59:52 -07:00
b5853f9963 [ROCm][AMD][Bugfix] adding a missing triton autotune config (#4845) 2024-05-16 10:46:52 -07:00
f09edd8a25 Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00
6979ade384 Add GPTQ Marlin 2:4 sparse structured support (#4790)
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-05-16 12:56:15 -04:00
9216b9cc38 [Bugfix] Bypass authorization API token for preflight requests (#4862) 2024-05-16 09:42:21 -07:00
5e0391c040 [Frontend] Separate OpenAI Batch Runner usage from API Server (#4851) 2024-05-17 00:42:41 +09:00
dbc0754ddf [docs] Fix typo in examples filename openi -> openai (#4864) 2024-05-17 00:42:17 +09:00
99caa49106 [Kernel] add bfloat16 support for gptq marlin kernel (#4788) 2024-05-16 09:55:29 -04:00
5c342570d7 Add marlin unit tests and marlin benchmark script (#4815) 2024-05-16 09:36:49 -04:00
973617ae02 [Speculative decoding][Re-take] Enable TP>1 speculative decoding (#4840)
Co-authored-by: Cade Daniel <edacih@gmail.com>
Co-authored-by: Cade Daniel <cade@anyscale.com>
2024-05-16 00:53:51 -07:00
30e754390c [Core] Implement sharded state loader (#4690)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-05-15 22:11:54 -07:00
52f8107cf2 [Frontend] Support OpenAI batch file format (#4794)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-15 19:13:36 -04:00
fc0d9dfc3a [Frontend] Re-enable custom roles in Chat Completions API (#4758) 2024-05-15 14:58:46 -07:00
361c461a12 [Doc] Highlight the fourth meetup in the README (#4842) 2024-05-15 11:38:49 -07:00
a5675d348b [Bugfix] Properly set distributed_executor_backend in ParallelConfig (#4816) 2024-05-15 07:22:09 -07:00
e9cdd2b1e2 [CI/Build] Further decouple HuggingFace implementation from ours during tests (#4166) 2024-05-14 23:38:40 -07:00
65bf2ac165 [Core][2/N] Model runner refactoring part 2. Combine prepare prefill / decode to a single API (#4681)
This PR combines prepare_prompt and prepare_decode into a single API. This PR also coelsce the attn metadata for prefill/decode to a single class and allow to slice them when running attn backend.

It also refactors subquery_start_loc which was not refactored in the previous PR
2024-05-15 14:00:10 +09:00
8a7cc254a0 Revert "[Kernel] Use flash-attn for decoding (#3648)" (#4820)
Lora 3 & 4 test seems to have illegal memory access failure after this commit;

[2024-05-14 23:51:18,182 E 22 22] logging.cc:101: Unhandled exception: N3c105ErrorE. what(): CUDA error: an illegal memory access was encountered
<br class="Apple-interchange-newline">
Exmaple: https://buildkite.com/vllm/ci/builds/7382#018f793d-1527-4e1c-ab59-c3a34ec55241

This reverts commit 1356df5.

FILL IN THE PR DESCRIPTION HERE

FIX #xxxx (link existing issues this PR will resolve)
2024-05-15 11:52:45 +09:00
29bc01bf3b Add 4th meetup announcement to readme (#4817) 2024-05-14 18:33:06 -04:00
676a99982f [Core] Add MultiprocessingGPUExecutor (#4539)
Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com>
2024-05-14 10:38:59 -07:00
dc72402b57 [Bugfix][Doc] Fix CI failure in docs (#4804)
This PR fixes the CI failure introduced by #4798.

The failure originates from having duplicate target names in reST, and is fixed by changing the ref targets to anonymous ones. For more information, see this discussion.

I have also changed the format of the links to be more distinct from each other.
2024-05-15 01:57:08 +09:00
ccb63a8245 [Core][Hash][Automatic Prefix caching] Accelerating the hashing function by avoiding deep copies (#4696) 2024-05-14 21:34:33 +09:00
c579b750a0 [Doc] Add meetups to the doc (#4798) 2024-05-13 18:48:00 -07:00
4bfa7e7f75 [Doc] Add API reference for offline inference (#4710) 2024-05-13 17:47:42 -07:00
ac1fbf7fd2 [Doc] Shorten README by removing supported model list (#4796) 2024-05-13 16:23:54 -07:00
33d3914b1e [Bugfix] Fix dynamic FP8 quantization for Mixtral (#4793) 2024-05-13 19:00:27 -04:00
1356df53bd [Kernel] Use flash-attn for decoding (#3648)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2024-05-13 15:50:33 -07:00
ce532ff45c [Speculative decoding] Improve n-gram efficiency (#4724) 2024-05-13 15:00:13 -07:00
8bc68e198c [Frontend] [Core] perf: Automatically detect vLLM-tensorized model, update tensorizer to version 2.9.0 (#4208) 2024-05-13 14:57:07 -07:00
0fca3cdcf2 [Misc] Enhance attention selector (#4751) 2024-05-13 10:47:25 -07:00
e7c46b9527 [Scheduler] Warning upon preemption and Swapping (#4647)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-13 23:50:44 +09:00
350f9e107f [CI/Build] Move test_utils.py to tests/utils.py (#4425)
Since #4335 was merged, I've noticed that the definition of ServerRunner in the tests is the same as in the test for OpenAI API. I have moved the class to the test utilities to avoid code duplication. (Although it only has been repeated twice so far, I will add another similar test suite in #4200 which would duplicate the code a third time)

Also, I have moved the test utilities file (test_utils.py) to under the test directory (tests/utils.py), since none of its code is actually used in the main package. Note that I have added __init__.py to each test subpackage and updated the ray.init() call in the test utilities file in order to relative import tests/utils.py.
2024-05-13 23:50:09 +09:00
702bee461f [Core][Distributed] refactor custom allreduce to support multiple tp groups (#4754) 2024-05-12 17:47:59 -07:00
a7be4d0072 [CORE] Improvement in ranks code (#4718) 2024-05-12 17:47:47 -07:00
a709e87a4f [CI/Build] Tweak Marlin Nondeterminism Issues (#4713) 2024-05-12 17:46:31 -07:00
6eaccb7353 [Model] Add support for IBM Granite Code models (#4636) 2024-05-11 21:27:24 -07:00
e254497b66 [Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734) 2024-05-11 11:30:37 -07:00
4e12131089 [Core][Test] fix function name typo in custom allreduce (#4750) 2024-05-10 15:14:40 -07:00
fcc2994be6 [CI] Nits for bad initialization of SeqGroup in testing (#4748) 2024-05-10 18:01:01 -04:00
2e7796f2cf [Speculative decoding] CUDA graph support (#4295)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-10 17:36:25 +00:00
706588a77d [Bugfix] Fix CLI arguments in OpenAI server docs (#4729) 2024-05-11 00:00:56 +09:00
6a0f617210 [Core] Fix circular reference which leaked llm instance in local dev env (#4737)
Storing exception frame is extremely prone to circular refernece because it contains the reference to objects.

When tensorizer is not installed, it leaks llm instance because error frame has references to various modules which cause circular reference problem.

I also found spec decoding has a circular reference issue, and I solved it using weakref.proxy.
2024-05-10 23:54:32 +09:00
dac6a3f6ed [Misc] Apply a couple g++ cleanups (#4719) 2024-05-10 13:37:05 +00:00
64b77dfd7e [Core]fix type annotation for swap_blocks (#4726) 2024-05-10 21:52:48 +09:00
51d4094fda chunked-prefill-doc-syntax (#4603)
Fix the docs: https://docs.vllm.ai/en/latest/models/performance.html

Co-authored-by: sang <rkooo567@gmail.com>
2024-05-10 14:13:23 +09:00
e965d46184 [Misc] Keep only one implementation of the create_dummy_prompt function. (#4716) 2024-05-09 21:42:38 -07:00
208b71bcc1 [Core][Distributed] refactor pynccl (#4591)
[Core][Distributed] refactor pynccl to hold multiple communicators (#4591)
2024-05-09 19:48:43 -07:00
c833101740 [Kernel] Refactor FP8 kv-cache with NVIDIA float8_e4m3 support (#4535) 2024-05-09 18:04:17 -06:00
379da6dcb5 [Kernel] [FP8] Improve FP8 linear layer performance (#4691)
This PR improves the FP8 performance of linear layers, which had been lacking before (#4118 (comment) and #4118 (comment)).

We noticed that CUBLASLt can find a better algorithm if the first dimension of the matrix is greater than 16. So this PR enlarges matrices appropriately during quantization. This improves FP8 performance and removes the performance regression vs. FP16, in many cases exceeding FP16 performance.

Here are benchmarks on llama3 70b (ITL numbers for 1000 input and 50 output tokens at fixed qps and at TP 4), all FP8 measurements are for dynamic quantization:

qps = 1: 24 ms (FP8, this PR), 32 ms (FP8, previous main), 26 ms (FP16)
qps = 2: 26 ms (FP8, this PR), 34ms (FP8, previous main), 28 ms (FP16) 
qps = 4: 33 ms (FP8, this PR), 44 ms (FP8, previous main), 36 ms (FP16)
qps = 6: 46 ms (FP8, this PR), 56 ms (FP8, previous main), 54 ms (FP16)
qps = 8: 85 ms (FP8, this PR), 85 ms (FP8, previous main), 138 ms (FP16)
2024-05-09 16:38:07 -07:00
ebce310b74 [Model] Snowflake arctic model implementation (#4652)
Co-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>
Co-authored-by: Aurick Qiao <qiao@aurick.net>
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
Co-authored-by: Aurick Qiao <aurickq@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-09 22:37:14 +00:00
be0c5180ac [Bugfix] Add logs for all model dtype casting (#4717) 2024-05-09 18:36:25 +00:00
cea64430f6 [Bugfix] Update grafana.json (#4711) 2024-05-09 10:10:13 -07:00
a3c124570a [Bugfix] Fix CLI arguments in OpenAI server docs (#4709) 2024-05-09 09:53:14 -07:00
ff5abcd746 [ROCm] Add support for Punica kernels on AMD GPUs (#3140)
Co-authored-by: miloice <jeffaw99@hotmail.com>
2024-05-09 09:19:50 -07:00
0ee535b294 [Misc] Set block size at initialization & Fix test_model_runner (#4705) 2024-05-09 09:04:59 -07:00
190bc838e1 [Misc] Remove unnecessary ModelRunner imports (#4703) 2024-05-09 00:17:17 -07:00
f12b20decc [Frontend] Move async logic outside of constructor (#4674) 2024-05-08 22:48:33 -07:00
16bc0a098f [Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-05-08 22:02:31 -07:00
e288df0632 [Bugfix] Fine-tune gptq_marlin configs to be more similar to marlin (#4626) 2024-05-08 17:14:31 -07:00
8b9241be3a [Speculative decoding] [Bugfix] Fix overallocation in ngram + spec logprobs (#4672) 2024-05-08 23:24:46 +00:00
f942efb5a3 [Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-08 21:44:00 +00:00
89579a201f [Misc] Use vllm-flash-attn instead of flash-attn (#4686) 2024-05-08 13:15:34 -07:00
230c4b38c1 [CI/Test] fix swap test for multi gpu (#4689) 2024-05-08 13:14:02 -07:00
20cfcdec99 [Core][Optimization] change python dict to pytorch tensor for blocks to swap (#4659) 2024-05-08 12:07:05 -07:00
ad932a221d [Core] Faster startup for LoRA enabled models (#4634) 2024-05-08 10:33:18 -07:00
5510cf0e8a [Misc] Add get_name method to attention backends (#4685) 2024-05-08 09:59:31 -07:00
0f9a6e3d22 [Bugfix][Kernel] allow non-power-of-2 for prefix prefill with alibi (#4573) 2024-05-08 09:19:58 -07:00
f6a593093a [CI] Make mistral tests pass (#4596) 2024-05-08 08:44:35 -07:00
d7740ea4dc [Core] Optimize sampler get_logprobs (#4594) 2024-05-08 08:42:28 -07:00
cc466a3290 [Core][Distributed] support cpu&device in broadcast tensor dict (#4660)
[Core][Distributed] support both cpu and device tensor in broadcast tensor dict (#4660)
2024-05-07 19:34:47 -07:00
8344f7742b [Bug fix][Core] fixup ngram not setup correctly (#4551)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Cade Daniel <edacih@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-05-07 11:40:18 -07:00
469f85c782 [Core][Optimization] change copy-on-write from dict[int, list] to list (#4648) 2024-05-07 11:06:32 -07:00
10760da800 [Bugfix] Fixed error in slice_lora_b for MergedQKVParallelLinearWithLora (#4609) 2024-05-07 10:59:07 -07:00
478aed5827 [Build/CI] Fixing 'docker run' to re-enable AMD CI tests. (#4642) 2024-05-07 09:23:17 -07:00
63575bc2e1 [Core][Optimization] change python dict to pytorch tensor (#4607) 2024-05-06 21:30:27 -07:00
a98187cf72 [Kernel] Make static FP8 scaling more robust (#4570)
Previously FP8 static scaling works if the scales are overestimating the maxima of all activation tensors during computation. However this will not always be the case even if the scales were calibrated very carefully. For example, with the activations in my checkpoint

https://huggingface.co/pcmoritz/Mixtral-8x7B-v0.1-fp8-act-scale

(which was calibrated on https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k), I'm getting the following mostly random performance on MMLU:

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.2295|±  |0.0035|
| - humanities     |N/A    |none  |     5|acc   |0.2421|±  |0.0062|
| - other          |N/A    |none  |     5|acc   |0.2398|±  |0.0076|
| - social_sciences|N/A    |none  |     5|acc   |0.2171|±  |0.0074|
| - stem           |N/A    |none  |     5|acc   |0.2125|±  |0.0073|
With the fix in this PR where the scaled activations are clamped between [-std::numeric_limits<c10::Float8_e4m3fn>::max(), std::numeric_limits<c10::Float8_e4m3fn>::max()] to make sure there are no NaNs, the performance is

|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7008|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6453|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7692|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8083|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6115|±  |0.0083|
This is not perfect yet but is getting very close to the FP16 / dynamic activation scale performance.
2024-05-06 17:39:28 -07:00
bd99d22629 Update lm-format-enforcer to 0.10.1 (#4631) 2024-05-06 23:51:59 +00:00
19cb4716ee [CI] Add retry for agent lost (#4633) 2024-05-06 23:18:57 +00:00
e186d37cb1 [CI] use ccache actions properly in release workflow (#4629) 2024-05-06 22:23:36 +00:00
323f27b904 [Bugfix] Fix asyncio.Task not being subscriptable (#4623) 2024-05-06 09:31:05 -07:00
0650e5935b Disable cuda version check in vllm-openai image (#4530) 2024-05-05 16:58:55 -07:00
c7f2cf2b7f [CI] Reduce wheel size by not shipping debug symbols (#4602) 2024-05-04 21:28:58 -07:00
8d8357c8ed bump version to v0.4.2 (#4600) 2024-05-04 17:09:49 -07:00
4302987069 [Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics (#3937) 2024-05-04 15:39:34 -07:00
021b1a2ab7 [CI] check size of the wheels (#4319) 2024-05-04 20:44:36 +00:00
2a052011ca [Kernel] Support MoE Fp8 Checkpoints for Mixtral (Static Weights with Dynamic/Static Activations) (#4527)
Follow on to #4332 to enable FP8 checkpoint loading for Mixtral and supersedes #4436.

This PR enables the following checkpoint loading features for Mixtral:

Supports loading fp8 checkpoints for Mixtral, such as this "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8" test model
Supports static or dynamic activation quantization with static weight quantization (all per tensor)
Supports different scales for each expert weight
Supports Fp8 in QKV layer
Notes:

The Expert Gate/Router always runs at half / full precision for now.
If there are different weight scales between QKV layer (for separate QKV weights), they are re-quantized using layer.weight_scale.max() so we can have a single gemm for performance.
2024-05-04 11:45:16 -07:00
36fb68f947 [Doc] Chunked Prefill Documentation (#4580) 2024-05-04 00:18:00 -07:00
bc8ad68455 [Misc][Refactor] Introduce ExecuteModelData (#4540) 2024-05-03 17:47:07 -07:00
344bf7cd2d [Misc] add installation time env vars (#4574) 2024-05-03 15:55:56 -07:00
ab50275111 [Speculative decoding] Support target-model logprobs (#4378) 2024-05-03 15:52:01 -07:00
43c413ec57 [Kernel] Use flashinfer for decoding (#4353)
Co-authored-by: LiuXiaoxuanPKU <llilyliupku@gmail.com>
2024-05-03 15:51:27 -07:00
f8e7adda21 Fix/async chat serving (#2727) 2024-05-03 11:04:14 -07:00
7e65477e5e [Bugfix] Allow "None" or "" to be passed to CLI for string args that default to None (#4586) 2024-05-03 10:32:21 -07:00
3521ba4f25 [Core][Model runner refactoring 1/N] Refactor attn metadata term (#4518) 2024-05-03 10:20:12 -07:00
2d7bce9cd5 [Doc] add env vars to the doc (#4572) 2024-05-03 05:13:49 +00:00
ce3f1eedf8 [Misc] remove chunk detected debug logs (#4571) 2024-05-03 04:48:08 +00:00
808632d3b4 [BugFix] Prevent the task of _force_log from being garbage collected (#4567) 2024-05-03 01:35:18 +00:00
344a5d0c33 [Core][Distributed] enable allreduce for multiple tp groups (#4566) 2024-05-02 17:32:33 -07:00
0f8a91401c [Core] Ignore infeasible swap requests. (#4557) 2024-05-02 14:31:20 -07:00
9b5c9f9484 [CI/Build] AMD CI pipeline with extended set of tests. (#4267)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-05-02 12:29:07 -07:00
32881f3f31 [kernel] fix sliding window in prefix prefill Triton kernel (#4405)
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
2024-05-02 11:23:37 -07:00
5b8a7c1cb0 [Misc] centralize all usage of environment variables (#4548) 2024-05-02 11:13:25 -07:00
1ff0c73a79 [BugFix] Include target-device specific requirements.txt in sdist (#4559) 2024-05-02 10:52:51 -07:00
5ad60b0cbd [Misc] Exclude the tests directory from being packaged (#4552) 2024-05-02 10:50:25 -07:00
fb087af52e [mypy][7/N] Cover all directories (#4555) 2024-05-02 10:47:41 -07:00
7038e8b803 [Kernel] Support running GPTQ 8-bit models in Marlin (#4533) 2024-05-02 12:56:22 -04:00
2a85f93007 [Core][Distributed] enable multiple tp group (#4512)
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2024-05-02 04:28:21 +00:00
cf8cac8c70 [mypy][6/N] Fix all the core subdirectory typing (#4450)
Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-02 03:01:00 +00:00
5e401bce17 [CI]Add regression tests to ensure the async engine generates metrics (#4524) 2024-05-01 19:57:12 -07:00
0d62fe58db [Bug fix][Core] assert num_new_tokens == 1 fails when SamplingParams.n is not 1 and max_tokens is large & Add tests for preemption (#4451) 2024-05-01 19:24:13 -07:00
b8afa8b95a [MISC] Rework logger to enable pythonic custom logging configuration to be provided (#4273) 2024-05-01 17:34:40 -07:00
826b82a260 [Misc] Fix expert_ids shape in MoE (#4517) 2024-05-01 23:47:59 +00:00
c9d852d601 [Misc] Remove Mixtral device="cuda" declarations (#4543)
Remove the device="cuda" declarations in mixtral as promised in #4343
2024-05-01 16:30:52 -07:00
6ef09b08f8 [Core][Distributed] fix pynccl del error (#4508) 2024-05-01 15:23:06 -07:00
Roy
3a922c1e7e [Bugfix][Core] Fix and refactor logging stats (#4336) 2024-05-01 20:08:14 +00:00
c47ba4aaa9 [Bugfix] Add validation for seed (#4529) 2024-05-01 19:31:22 +00:00
24bb4fe432 [Kernel] Update fused_moe tuning script for FP8 (#4457)
This PR updates the tuning script for the fused_moe kernel to support FP8 and also adds configurations for TP4. Note that for the configuration I removed num_warps and num_stages for small batch sizes since that improved performance and brought the benchmarks on par with the numbers before in that regime to make sure this is a strict improvement over the status quo.

All the numbers below are for mistralai/Mixtral-8x7B-Instruct-v0.1, 1000 input and 50 output tokens.

Before this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency 
qps = 4: 10.1 ms ITL, 0.52s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 14.0 ms ITL, 0.70s e2e latency
qps = 10: 15.7 ms ITL, 0.79s e2e latency

After this PR (with static activation scaling):

qps = 1: 9.8 ms ITL, 0.49s e2e latency
qps = 2: 9.7 ms ITL, 0.49s e2e latency
qps = 4: 10.2 ms ITL, 0.53s e2e latency
qps = 6: 11.9 ms ITL, 0.59s e2e latency
qps = 8: 11.9 ms ITL, 0.59s e2e latency
qps = 10: 12.1 ms ITL, 0.61s e2e latency
2024-05-01 11:47:38 -07:00
a657bfc48a [Core] Add multiproc_worker_utils for multiprocessing-based workers (#4357) 2024-05-01 18:41:59 +00:00
24750f4cad [Core] Enable prefix caching with block manager v2 enabled (#4142)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
Co-authored-by: Sage Moore <sagemoore@utexas.edu>
2024-05-01 11:20:32 -07:00
b38e42fbca [Speculative decoding] Add ngram prompt lookup decoding (#4237)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-01 11:13:03 -07:00
8b798eec75 [CI/Build][Bugfix] VLLM_USE_PRECOMPILED should skip compilation (#4534)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-05-01 18:01:50 +00:00
69909126a7 [Bugfix] Use random seed if seed is -1 (#4531) 2024-05-01 10:41:17 -07:00
e491c7e053 [Doc] update(example model): for OpenAI compatible serving (#4503) 2024-05-01 10:14:16 -07:00
4dc8026d86 [Bugfix] Fix 307 Redirect for /metrics (#4523) 2024-05-01 09:14:13 -07:00
a88bb9b032 [Bugfix] Fix the fp8 kv_cache check error that occurs when failing to obtain the CUDA version. (#4173)
Signed-off-by: AnyISalIn <anyisalin@gmail.com>
2024-05-01 09:11:03 -07:00
6f1df80436 [Test] Add ignore_eos test (#4519) 2024-05-01 08:45:42 -04:00
d6f4bd7cdd [Misc]Add customized information for models (#4132) 2024-04-30 21:18:14 -07:00
c3845d82dc Allow user to define whitespace pattern for outlines (#4305) 2024-04-30 20:48:39 -07:00
a822eb3413 [Misc] fix typo in block manager (#4453) 2024-04-30 20:41:32 -07:00
f458112e8a [Misc][Typo] type annotation fix (#4495) 2024-04-30 20:21:39 -07:00
2e240c69a9 [Core] Centralize GPU Worker construction (#4419) 2024-05-01 01:06:34 +00:00
ee37328da0 Unable to find Punica extension issue during source code installation (#4494)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-05-01 00:42:09 +00:00
6ad58f42c5 fix_tokenizer_snapshot_download_bug (#4493) 2024-04-30 16:38:50 -07:00
dd1a50a8bc [Bugfix][Minor] Make ignore_eos effective (#4468) 2024-04-30 16:33:33 -07:00
715c2d854d [Frontend] [Core] Tensorizer: support dynamic num_readers, update version (#4467) 2024-04-30 16:32:13 -07:00
a494140433 [Frontend] Support complex message content for chat completions endpoint (#3467)
Co-authored-by: Lily Liu <lilyliupku@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2024-04-30 16:28:46 -07:00
111815d482 [Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2024-04-30 21:46:12 +00:00
b31a1fb63c [Doc] add visualization for multi-stage dockerfile (#4456)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-30 17:41:59 +00:00
4bb53e2dde [BugFix] fix num_lookahead_slots missing in async executor (#4165)
Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-04-30 10:12:59 -07:00
26f2fb5113 [Core]Refactor gptq_marlin ops (#4466) 2024-04-30 08:14:47 -04:00
fa32207842 [Bugfix][Kernel] Fix compute_type for MoE kernel (#4463) 2024-04-29 22:05:40 -07:00
d627a3d837 [Misc] Upgrade to torch==2.3.0 (#4454) 2024-04-29 20:05:47 -04:00
f4f921b7f1 [Core][Distributed] use cpu group to broadcast metadata in cpu (#4444) 2024-04-29 13:52:22 -07:00
ac5ccf0156 [CI] hotfix: soft fail neuron test (#4458) 2024-04-29 19:50:01 +00:00
73c8d677e5 [Kernel] Marlin Expansion: Support AutoGPTQ Models with Marlin (#3922)
Co-authored-by: alexm <alexm@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-04-29 09:35:34 -07:00
df29793dc7 [mypy][5/N] Support all typing on model executor (#4427) 2024-04-28 19:01:26 -07:00
03dd7d52bf [CI] clean docker cache for neuron (#4441) 2024-04-28 23:32:07 +00:00
bf480c5302 Add more Prometheus metrics (#2764)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
2024-04-28 15:59:33 -07:00
9c7306ac11 [Misc] fix typo in llm_engine init logging (#4428) 2024-04-28 18:58:30 +08:00
4ea1f9678d [BugFix] Resolved Issues For LinearMethod --> QuantConfig (#4418) 2024-04-27 18:35:33 +00:00
ba4be44c32 [BugFix] Fix return type of executor execute_model methods (#4402) 2024-04-27 11:17:45 -07:00
d6e520e170 [Core] Support offline use of local cache for models (#4374)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Travis Johnson <tjohnson31415@gmail.com>
2024-04-27 09:59:55 -07:00
81661da7b2 [BugFix] Fix min_tokens when eos_token_id is None (#4389)
Co-authored-by: DefTruth <31974251+deftruth@users.noreply.github.com>
2024-04-27 09:52:46 -07:00
dfea173148 [Bugfix] Abort requests when the connection to /v1/completions is interrupted (#4363) 2024-04-27 09:48:37 -07:00
Roy
7134303cbb [Bugfix][Core] Fix get decoding config from ray (#4335) 2024-04-27 11:30:08 +00:00
3da24c2df7 [Model] Phi-3 4k sliding window temp. fix (#4380) 2024-04-27 18:08:15 +08:00
eefeb16464 [Kernel] Full Tensor Parallelism for LoRA Layers (#3524)
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
2024-04-27 00:03:48 -07:00
18d23f642a [ROCm][Hardware][AMD] Enable group query attention for triton FA (#4406) 2024-04-26 23:37:40 -07:00
Roy
87f545ba6f [Misc] Fix logger format typo (#4396) 2024-04-27 13:45:02 +08:00
8947bc3c15 [Frontend][Bugfix] Disallow extra fields in OpenAI API (#4355) 2024-04-27 05:08:24 +00:00
12628d3c78 [Kernel] Optimize FP8 support for MoE kernel / Mixtral via static scales (#4343)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-27 04:49:59 +00:00
258a2c58d0 [Core] Introduce DistributedGPUExecutor abstract class (#4348) 2024-04-27 04:14:26 +00:00
aba47be3fe [Misc] add RFC issue template (#4401)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-26 15:47:45 -07:00
a62aaf1df5 [Misc][Refactor] Generalize linear_method to be quant_method (#4373) 2024-04-26 16:41:14 -04:00
603ad84815 [Core] Refactoring sampler and support prompt logprob for chunked prefill (#4309) 2024-04-26 13:02:02 +00:00
a88081bf76 [CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>
2024-04-26 00:16:58 -07:00
2f30e7c72f [Frontend] Add --log-level option to api server (#4377) 2024-04-26 05:36:01 +00:00
a74dee9b62 [Bugfix] Fix parameter name in get_tokenizer (#4107) 2024-04-25 19:10:48 -07:00
cf29b7eda4 [ROCm][Hardware][AMD][Doc] Documentation update for ROCm (#4376)
Co-authored-by: WoosukKwon <woosuk.kwon@berkeley.edu>
2024-04-25 18:12:25 -07:00
efffb63f58 [Core] Move function tracing setup to util function (#4352) 2024-04-25 16:45:12 -07:00
15e7c675b0 [Core] Add shutdown() method to ExecutorBase (#4349) 2024-04-25 16:32:48 -07:00
Roy
b6dcb4d442 [Misc] Fix flash attention backend log (#4368) 2024-04-25 12:43:32 -07:00
b5b4a398a7 [Mypy] Typing lora folder (#4337) 2024-04-25 19:13:50 +00:00
f4bc4de1b1 [Core]refactor aqlm quant ops (#4351) 2024-04-25 15:03:56 -04:00
bd7a8eef25 [Doc] README Phi-3 name fix. (#4372)
Co-authored-by: Caio Mendes <caiocesart@microsoft.com>
2024-04-25 10:32:00 -07:00
7ee82bef1e [CI/Build] Adding functionality to reset the node's GPUs before processing. (#4213) 2024-04-25 09:37:20 -07:00
fbf152d976 [Bugfix][Model] Refactor OLMo model to support new HF format in transformers 4.40.0 (#4324)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-25 09:35:56 -07:00
479d69fad0 [Core] Move ray_utils.py from engine to executor package (#4347) 2024-04-25 06:52:22 +00:00
96e90fdeb3 [Model] Adds Phi-3 support (#4298) 2024-04-25 03:06:57 +00:00
a395a638c2 [Misc] Use public API in benchmark_throughput (#4300) 2024-04-24 21:10:24 +00:00
2768884ac4 [Doc] Add note for docker user (#4340)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-24 21:09:44 +00:00
aae08249ac [Bugfix] Fix marlin kernel crash on H100 (#4218)
This PR addresses the Marlin kernel H100 crash that was reported here: neuralmagic#187.
The reason for the crash was the inline PTX assembly that introduced the async_copy with streaming behavior. The solution is to use the more standard PTX for async_copy (without the fractional L2 policy for "evict_first"). There is no performance difference between standard async_copy PTX and the previous one.
2024-04-24 10:35:01 -07:00
7923dcad12 [Misc] Update ShareGPT Dataset Sampling in Serving Benchmark (#4279) 2024-04-24 09:49:13 -07:00
3cd9b5bb2d [Core][Distributed] use existing torch.cuda.device (#4318)
[Core][Distributed] use existing torch.cuda.device context manager (#4318)
2024-04-24 09:00:20 -07:00
468d761b32 [Misc] Reduce supported Punica dtypes (#4304) 2024-04-23 18:54:33 -07:00
e4bf860a54 [CI][Build] change pynvml to nvidia-ml-py (#4302) 2024-04-23 18:33:12 -07:00
91f50a6fe2 [Core][Distributed] use cpu/gloo to initialize pynccl (#4248) 2024-04-23 18:32:19 -07:00
79a268c4ab [BUG] fixed fp8 conflict with aqlm (#4307)
Fixes fp8 iterface which broke in AQLM merge.
2024-04-23 18:26:33 -07:00
eace8bf0b9 [Kernel] FP8 support for MoE kernel / Mixtral (#4244)
This PR is the first step towards fixing https://github.com/vllm-project/vllm/pull/3208

It implements dynamic per-tensor scaling (see https://github.com/vllm-project/vllm/pull/4118), so users do not need to compute activation scales on a calibration dataset and they also don't need to convert their model checkpoints. It is enough to specify the `quantization="fp8"` argument. You can try out the PR like this:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1", tensor_parallel_size=2, quantization="fp8")

outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

**Performance**: For this PR, the focus is on making the code clean (while still trying to get reasonable performance), there is a bunch of optimizations that we will submit as a follow up PR that significantly improve the performance (similar to the numbers in https://github.com/vllm-project/vllm/pull/3954). With this PR, the results are as follows:

<img width="725" alt="Screenshot 2024-04-21 at 1 31 50 PM" src="https://github.com/vllm-project/vllm/assets/113316/d8fe1118-07a0-4d4e-8530-37a77d465a03">


**Accuracy**: The accuracy with this PR on MMLU on `mistralai/Mixtral-8x7B-v0.1` is as follows:

```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7018|±  |0.0036|
| - humanities     |N/A    |none  |     5|acc   |0.6472|±  |0.0065|
| - other          |N/A    |none  |     5|acc   |0.7673|±  |0.0072|
| - social_sciences|N/A    |none  |     5|acc   |0.8099|±  |0.0070|
| - stem           |N/A    |none  |     5|acc   |0.6131|±  |0.0083|
```
this compares favorably with the fp16 results which are
```
|      Groups      |Version|Filter|n-shot|Metric|Value |   |Stderr|
|------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu              |N/A    |none  |     0|acc   |0.7020|±  |0.1313|
| - humanities     |N/A    |none  |     5|acc   |0.6425|±  |0.1349|
| - other          |N/A    |none  |     5|acc   |0.7744|±  |0.1038|
| - social_sciences|N/A    |none  |     5|acc   |0.8131|±  |0.0695|
| - stem           |N/A    |none  |     5|acc   |0.6108|±  |0.1383|
```

Happy hacking!
2024-04-24 01:18:23 +00:00
1e8f4252aa [Bugfix][Frontend] Raise exception when file-like chat template fails to be opened (#4292) 2024-04-23 18:19:03 +00:00
2b7949c1c2 AQLM CUDA support (#3287)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-04-23 13:59:33 -04:00
62b5166bd4 [CI] Add ccache for wheel builds job (#4281) 2024-04-23 09:51:41 -07:00
d86285a4a4 [Core][Logging] Add last frame information for better debugging (#4278) 2024-04-23 09:45:52 -07:00
d87f39e9a9 [Bugfix] Add init_cached_hf_modules to RayWorkerWrapper (#4286) 2024-04-23 09:28:35 -07:00
d3c8180ac4 [Bugfix] Fixing max token error message for openai compatible server (#4016) 2024-04-23 19:06:29 +08:00
62b8aebc6f [Speculative decoding 7/9] Speculative decoding end-to-end correctness tests. (#3951) 2024-04-23 08:02:36 +00:00
050f285ff6 [Core] Scheduling optimization 2 (#4280) 2024-04-23 08:02:11 +00:00
8f2ea22bde [Core] Some simplification of WorkerWrapper changes (#4183) 2024-04-23 07:49:08 +00:00
0ae11f78ab [Mypy] Part 3 fix typing for nested directories for most of directory (#4161) 2024-04-22 21:32:44 -07:00
34128a697e Fix autodoc directives (#4272)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-23 01:53:01 +00:00
c1b4e4157c [Core][Distributed] use absolute path for library file (#4271) 2024-04-22 17:21:48 -07:00
ceaf4ed003 [Doc] Update the SkyPilot doc with serving and Llama-3 (#4276) 2024-04-22 15:34:31 -07:00
ad8d696a99 [Core] Scheduler perf fix (#4270) 2024-04-22 21:11:06 +00:00
3d925165f2 Add example scripts to documentation (#4225)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-22 16:36:54 +00:00
1543680691 [Bugfix] Ensure download_weights_from_hf(..) inside loader is using the revision parameter (#4217) 2024-04-22 09:10:48 -07:00
077f0a2e8a [Frontend] Enable support for CPU backend in AsyncLLMEngine. (#3993)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-04-22 09:19:51 +00:00
e73ed0f1c6 [Bugfix] Fix type annotations in CPU model runner (#4256) 2024-04-22 00:54:16 -07:00
296cdf8ac7 [Misc] Add vision language model support to CPU backend (#3968) 2024-04-22 00:44:16 -07:00
747b1a7147 [Core][Distributed] fix _is_full_nvlink detection (#4233) 2024-04-21 23:04:16 -07:00
95e5b087cf [AMD][Hardware][Misc][Bugfix] xformer cleanup and light navi logic and CI fixes and refactoring (#4129) 2024-04-21 21:57:24 -07:00
a37d815b83 Make initialization of tokenizer and detokenizer optional (#3748)
Co-authored-by: Yun Ding <yunding@nvidia.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-21 22:06:46 +00:00
7f2593b164 [Doc]: Update the doc of adding new models (#4236) 2024-04-21 09:57:08 -07:00
fe7d648fe5 Don't show default value for flags in EngineArgs (#4223)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-21 09:15:28 -07:00
cc74b2b232 Updating lm-format-enforcer version and adding links to decoding libraries in docs (#4222) 2024-04-20 08:33:16 +00:00
91528575ec [Frontend] multiple sampling params support (#3570) 2024-04-20 00:11:57 -07:00
a22cdea371 [Kernel][FP8] Initial support with dynamic per-tensor scaling (#4118)
Provide an initial support to FP8 computation. This PR is inspired by HuggingFace TGI: huggingface/text-generation-inference#1726

This feature can be enabled with --quantization fp8 or -q fp8 when launching an engine.

Algorithm:
We still load a model checkpoint in FP16/BF16. After the weights are loaded, Fp8LinearMethod calculates the per-tensor scaling factor of weights and quantizes the weights accordingly. The scaling factor will then be stored for future use. Meanwhile, the per-tensor scaling factor for activations is calculated in every forward pass.

Initial Results:
Currently tested Mistral-7B on 1xH100. With prompt length ~5 and decoding length 128:

BF16: 1.47s
FP8: 1.66s
I'll try to use larger models and try to find more performance bottleneck. Meanwhile, you're welcome to try this code.
2024-04-20 04:28:57 +00:00
682789d402 Fix missing docs and out of sync EngineArgs (#4219)
Co-authored-by: Harry Mellor <hmellor@oxts.com>
2024-04-19 20:51:33 -07:00
138485a82d [Bugfix] Add fix for JSON whitespace (#4189)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-13-147.ec2.internal>
2024-04-19 20:49:22 -07:00
bc9df1571b Pass tokenizer_revision when getting tokenizer in openai serving (#4214) 2024-04-19 17:13:56 -07:00
15b86408a8 [Misc] add nccl in collect env (#4211) 2024-04-19 19:44:51 +00:00
7be4f5628f [Bugfix][Core] Restore logging of stats in the async engine (#4150) 2024-04-19 08:08:26 -07:00
8f20fc04bf [Misc] fix docstrings (#4191)
Co-authored-by: Zhong Wang <wangzhong@infini-ai.com>
2024-04-19 08:18:33 +00:00
221d93ecbf Bump version of 0.4.1 (#4177) 2024-04-19 01:00:22 -07:00
d17c8477f1 [Bugfix] Fix LoRA loading check (#4138)
Co-authored-by: simon-mo <simon.mo@hey.com>
2024-04-19 00:59:54 -07:00
a134ef6f5e Support eos_token_id from generation_config.json (#4182) 2024-04-19 04:13:36 +00:00
8a7a3e4436 [Core] add an option to log every function call to for debugging hang/crash in distributed inference (#4079)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-18 16:15:12 -07:00
8f9c28fd40 [Bugfix] Fix CustomAllreduce nvlink topology detection (#3974)
[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
2024-04-18 15:32:47 -07:00
cd2f63fb36 [CI/CD] add neuron docker and ci test scripts (#3571) 2024-04-18 15:26:01 -07:00
87fa80c91f [Misc] Bump transformers to latest version (#4176) 2024-04-18 14:36:39 -07:00
e1bb2fd52d [Bugfix] Support logprobs when using guided_json and other constrained decoding fields (#4149) 2024-04-18 21:12:55 +00:00
705578ae14 [Docs] document that Meta Llama 3 is supported (#4175) 2024-04-18 10:55:48 -07:00
e8cc7967ff [Bugfix][Kernel] allow non-power-of-two head sizes in prefix prefill (#4128) 2024-04-18 00:51:28 -07:00
53b018edcb [Bugfix] Get available quantization methods from quantization registry (#4098) 2024-04-18 00:21:55 -07:00
66ded03067 Allow model to be served under multiple names (#2894)
Co-authored-by: Alexandre Payot <alexandrep@graphcore.ai>
2024-04-18 00:16:26 -07:00
6dc1fc9cfe [Core] nccl integrity check and test (#4155)
[Core] Add integrity check during initialization; add test for it (#4155)
2024-04-17 22:28:52 -07:00
533d2a1f39 [Typing] Mypy typing part 2 (#4043)
Co-authored-by: SangBin Cho <sangcho@sangcho-LT93GQWG9C.local>
2024-04-17 17:28:43 -07:00
a53222544c [Kernel] Add punica dimension for Swallow-MS-7B LoRA (#4134) 2024-04-17 10:02:45 -07:00
fe3b5bbc23 [Bugfix] fix output parsing error for trtllm backend (#4137)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-17 11:07:23 +00:00
8438e0569e [Core] RayWorkerVllm --> WorkerWrapper to reduce duplication (#4024)
[Core] replace narrow-usage RayWorkerVllm to general WorkerWrapper to reduce code duplication (#4024)
2024-04-17 08:34:33 +00:00
11d652bd4f [CI] Move CPU/AMD tests to after wait (#4123) 2024-04-16 22:53:26 -07:00
d150e4f89f [Misc] [CI] Fix CI failure caught after merge (#4126) 2024-04-16 17:56:01 -07:00
e95cd87959 [Speculative decoding 6/9] Integrate speculative decoding with LLMEngine (#3894) 2024-04-16 13:09:21 -07:00
69e1d2fb69 [Core] Refactor model loading code (#4097) 2024-04-16 11:34:39 -07:00
05434764cd LM Format Enforcer Guided Decoding Support (#3868)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-16 05:54:57 +00:00
4e7ee664e2 [Core] Fix engine-use-ray broken (#4105) 2024-04-16 05:24:53 +00:00
37e84a403d [Typing] Fix Sequence type GenericAlias only available after Python 3.9. (#4092) 2024-04-15 14:47:31 -07:00
4695397dcf [Bugfix] Fix ray workers profiling with nsight (#4095) 2024-04-15 14:24:45 -07:00
d619ae2d19 [Doc] Add better clarity for tensorizer usage (#4090)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-15 13:28:25 -07:00
eb46fbfda2 [Core] Simplifications to executor classes (#4071) 2024-04-15 13:05:09 -07:00
0003e9154b [Misc][Minor] Fix CPU block num log in CPUExecutor. (#4088) 2024-04-15 08:35:55 -07:00
e11e200736 [Bugfix] Fix filelock version requirement (#4075) 2024-04-14 21:50:08 -07:00
Roy
8db1bf32f8 [Misc] Upgrade triton to 2.2.0 (#4061) 2024-04-14 17:43:54 -07:00
aceb17cf2d [Docs] document that mixtral 8x22b is supported (#4073) 2024-04-14 14:35:55 -07:00
563c54f760 [BugFix] Fix tensorizer extra in setup.py (#4072) 2024-04-14 14:12:42 -07:00
2cd6b4f362 [Core] avoid too many cuda context by caching p2p test (#4021) 2024-04-13 23:40:21 -07:00
711a000255 [Frontend] [Core] feat: Add model loading using tensorizer (#3476) 2024-04-13 17:13:01 -07:00
989ae2538d [Kernel] Add punica dimension for Baichuan-13B (#4053) 2024-04-13 07:55:05 -07:00
0a430b4ae2 [Bugfix] fix_small_bug_in_neuron_executor (#4051) 2024-04-13 07:54:03 -07:00
ec8e3c695f [Bugfix] fix_log_time_in_metrics (#4050) 2024-04-13 07:52:36 -07:00
98afde19fc [Core][Distributed] improve logging for init dist (#4042) 2024-04-13 07:12:53 -07:00
5c2e66e487 [Bugfix] More type hint fixes for py 3.8 (#4039) 2024-04-12 21:07:04 -07:00
546e721168 [CI/Test] expand ruff and yapf for all supported python version (#4037) 2024-04-13 01:43:37 +00:00
b8aacac31a [Bugfix] Fix LoRA bug (#4032) 2024-04-12 16:56:37 -07:00
d04973ad54 Fix triton compilation issue (#3984)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-12 16:41:26 -07:00
fbb9d9eef4 [Core] fix custom allreduce default value (#4040) 2024-04-12 16:40:39 -07:00
09473ee41c [mypy] Add mypy type annotation part 1 (#4006) 2024-04-12 14:35:50 -07:00
d4ec9ffb95 [Misc] Fix typo in scheduler.py (#4022) 2024-04-12 13:56:04 -07:00
96b6a6d790 [Bugfix] fix type hint for py 3.8 (#4036) 2024-04-12 19:35:44 +00:00
36729bac13 [Test] Test multiple attn backend for chunked prefill. (#4023) 2024-04-12 09:56:57 -07:00
7fd3949a0b [Frontend][Core] Move merge_async_iterators to utils (#4026) 2024-04-12 05:30:54 +00:00
1096717ae9 [Core] Support LoRA on quantized models (#4012) 2024-04-11 21:02:44 -07:00
c2b4a1bce9 [Doc] Add typing hints / mypy types cleanup (#3816)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-11 17:17:21 -07:00
e46a60aa4c [BugFix] Fix handling of stop strings and stop token ids (#3672) 2024-04-11 15:34:12 -07:00
1e96c3341a Add extra punica sizes to support bigger vocabs (#4015) 2024-04-11 22:18:57 +00:00
95e7d4a97c Fix echo/logprob OpenAI completion bug (#3441)
Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
2024-04-11 22:15:50 +00:00
559eb852f8 [Core] init_distributed_environment align with init_process_group(#4014)
[Core][Distributed] make init_distributed_environment compatible with init_process_group (#4014)
2024-04-11 14:00:48 -07:00
a10d3056da [Core] Set linear_weights directly on the layer (#3977) 2024-04-11 16:35:51 -04:00
8afca50889 [Hardware][Intel] Isolate CPUModelRunner and ModelRunner for better maintenance (#3824) 2024-04-11 11:56:49 -07:00
08ccee1e83 punica fix-bgmv-kernel-640 (#4007) 2024-04-11 08:59:26 -07:00
c1dc547129 [Kernel] Fused MoE Config for Mixtral 8x22 (#4002) 2024-04-11 07:50:00 -07:00
f3d0bf7589 [Doc][Installation] delete python setup.py develop (#3989) 2024-04-11 03:33:02 +00:00
e9da5a40c6 [Misc] Add indirection layer for custom ops (#3913) 2024-04-10 20:26:07 -07:00
e42df7227d [Test] Add xformer and flash attn tests (#3961)
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-11 03:09:50 +00:00
caada5e50a [Core][Model] torch.compile for layernorm in commandr (#3985)
[Core][Model] Use torch.compile to accelerate layernorm in commandr (#3985)
2024-04-11 01:48:26 +00:00
67b4221a61 [Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-10 17:56:48 -07:00
63e7176f26 [Core][Refactor] move parallel_utils into vllm/distributed (#3950)
[WIP][Core][Refactor] move vllm/model_executor/parallel_utils into vllm/distributed and vllm/device_communicators (#3950)
2024-04-10 15:33:30 -07:00
934d3662f7 [Bugfix] handle hf_config with architectures == None (#3982)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2024-04-10 22:28:25 +00:00
92cd2e2f21 [Doc] Fix getting stared to use publicly available model (#3963) 2024-04-10 18:05:52 +00:00
e4c4072c94 [Bugfix] Remove key sorting for guided_json parameter in OpenAi compatible Server (#3945) 2024-04-10 10:15:51 -07:00
e35397468f [Doc] Add doc to state our model support policy (#3948)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-10 17:03:02 +00:00
8b317c6dd0 [Model][AMD] ROCm support for 256 head dims for Gemma (#3972) 2024-04-10 08:12:00 -07:00
bd3c144e0b [Bugfix][ROCm] Add numba to Dockerfile.rocm (#3962) 2024-04-10 07:37:17 -07:00
0258b7a94b [Bugfix] handle prompt_logprobs in _apply_min_tokens_penalty (#3876)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-04-10 01:39:56 -07:00
b3104b2a10 [Bugfix] Fix logits processor when prompt_logprobs is not None (#3899) 2024-04-10 00:09:36 -07:00
c2e00af523 [Bugfix] fix utils.py/merge_dict func TypeError: 'type' object is not subscriptable (#3955)
Co-authored-by: tianyi_zhao <tianyi.zhao@transwarp.io>
2024-04-10 04:49:11 +00:00
c013d32c75 [Benchmark] Add cpu options to bench scripts (#3915) 2024-04-09 21:30:03 -07:00
11dd6ebb89 [Misc] Avoid loading incorrect LoRA config (#3777) 2024-04-09 19:47:15 -07:00
6c0b04515f [ROCm][Hardware][AMD] Use Triton Kernel for default FA on ROCm (#3643)
Co-authored-by: jpvillam <jpvillam@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-09 15:10:47 -07:00
e23a43aef8 [Bugfix] Fix KeyError on loading GPT-NeoX (#3925) 2024-04-09 12:11:31 -07:00
e7c7067b45 [Misc] [Core] Implement RFC "Augment BaseExecutor interfaces to enable hardware-agnostic speculative decoding" (#3837) 2024-04-09 11:44:15 -07:00
6d592eb430 [Core] separate distributed_init from worker (#3904) 2024-04-09 08:49:02 +00:00
Roy
d036198e23 [BugFix][Model] Fix commandr RoPE max_position_embeddings (#3919) 2024-04-09 06:17:21 +08:00
59a6abf3c9 [Hotfix][CI/Build][Kernel] CUDA 11.8 does not support layernorm optimizations (#3782) 2024-04-08 14:31:02 -07:00
bc0c0192d1 [Bugfix] Enable Proper attention_bias Usage in Llama Model Configuration (#3767)
Co-authored-by: roy <jasonailu87@gmail.com>
2024-04-08 19:42:35 +00:00
f46864d68d [Bugfix] Added Command-R GPTQ support (#3849)
Co-authored-by: Egor Tolmachev <t333ga@gmail.com>
2024-04-08 14:59:38 +00:00
b4543c8f6b [Model] add minicpm (#3893) 2024-04-08 18:28:36 +08:00
0ce0539d47 [Bugfix] Fix Llava inference with Tensor Parallelism. (#3883) 2024-04-07 22:54:13 +08:00
2f19283549 [Core] latency optimization (#3890) 2024-04-06 19:14:06 -07:00
95baec828f [Core] enable out-of-tree model register (#3871) 2024-04-06 17:11:41 -07:00
e4be7d70bb [CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 21:32:30 +00:00
54951ac4bf [Bugfix] Fix incorrect output on OLMo models in Tensor Parallelism (#3869) 2024-04-05 12:02:09 -07:00
18de883489 [Chunked Prefill][4/n] Chunked prefill scheduler. (#3853) 2024-04-05 10:17:58 -07:00
1d7c940d74 Add option to completion API to truncate prompt tokens (#3144) 2024-04-05 10:15:42 -07:00
cfaf49a167 [Misc] Define common requirements (#3841) 2024-04-05 00:39:17 -07:00
9edec652e2 [Bugfix] Fixing requirements.txt (#3865) 2024-04-04 23:46:01 -07:00
e0dd4d3589 [Misc] Fix linter issues in examples/fp8/quantizer/quantize.py (#3864) 2024-04-04 21:57:33 -07:00
e5043a3e75 [Misc] Add pytest marker to opt-out of global test cleanup (#3863) 2024-04-04 21:54:16 -07:00
d03d64fd2e [CI/Build] refactor dockerfile & fix pip cache
[CI/Build] fix pip cache with vllm_nccl & refactor dockerfile to build wheels (#3859)
2024-04-04 21:53:16 -07:00
78107fa091 [Doc]Add asynchronous engine arguments to documentation. (#3810)
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-04 21:52:01 -07:00
c391e4b68e [Core] improve robustness of pynccl (#3860) 2024-04-04 16:52:12 -07:00
9117f892f0 [Model] Cohere CommandR+ (#3829) 2024-04-04 13:31:49 -07:00
db2a6a41e2 [Hardware][CPU] Update cpu torch to match default of 2.2.1 (#3854) 2024-04-04 19:49:49 +00:00
ca81ff5196 [Core] manage nccl via a pypi package & upgrade to pt 2.2.1 (#3805) 2024-04-04 10:26:19 -07:00
b7782002e1 [Benchmark] Refactor sample_requests in benchmark_throughput (#3613)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-04 09:56:22 +00:00
819a309c0f [Bugfix] Fix args in benchmark_serving (#3836)
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-04-04 07:41:05 +00:00
aabe8f40f2 [Core] [Frontend] Make detokenization optional (#3749)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-04-03 21:52:18 -07:00
498eb5cfa3 [Bugfix] Add kv_scale input parameter to CPU backend (#3840) 2024-04-04 04:33:08 +00:00
537ee25f43 [Core] Enable hf_transfer by default if available (#3817) 2024-04-04 04:02:43 +00:00
294f8f6665 [BugFix] Pass tokenizer_config to local_tokenizer_group (#3754)
Signed-off-by: Tao He <sighingnow@gmail.com>
2024-04-03 20:31:46 -07:00
b95047f2da [Misc] Publish 3rd meetup slides (#3835) 2024-04-03 15:46:10 -07:00
2ff767b513 Enable scaled FP8 (e4m3fn) KV cache on ROCm (AMD GPU) (#3290)
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: HaiShaw <hixiao@gmail.com>
Co-authored-by: AdrianAbeyta <Adrian.Abeyta@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: root <root@gt-pla-u18-08.pla.dcgpu>
Co-authored-by: mawong-amd <156021403+mawong-amd@users.noreply.github.com>
Co-authored-by: ttbachyinsda <ttbachyinsda@outlook.com>
Co-authored-by: guofangze <guofangze@kuaishou.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-03 14:15:55 -07:00
3dcb3e8b98 [3/N] Refactor scheduler for chunked prefill scheduling (#3550) 2024-04-03 14:13:49 -07:00
c64cf38673 [Doc] Update contribution guidelines for better onboarding (#3819) 2024-04-03 07:31:43 +00:00
76b889bf1d [Doc] Update README.md (#3806) 2024-04-02 23:11:10 -07:00
c9b506dad4 [BugFix] Use different mechanism to get vllm version in is_cpu() (#3804) 2024-04-02 23:06:25 -07:00
5757d90e26 [Speculative decoding] Adding configuration object for speculative decoding (#3706)
Co-authored-by: Lily Liu <lilyliupku@gmail.com>
2024-04-03 00:40:57 +00:00
868 changed files with 118190 additions and 20762 deletions

View File

@ -0,0 +1,36 @@
import os
import zipfile
MAX_SIZE_MB = 250
def print_top_10_largest_files(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
def check_wheel_size(directory):
for root, _, files in os.walk(directory):
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb} MB).")
return 0
if __name__ == "__main__":
import sys
sys.exit(check_wheel_size(sys.argv[1]))

View File

@ -1,18 +0,0 @@
#!/bin/bash
set -ex
set -o pipefail
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
cd -

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.671
- name: "exact_match,flexible-extract"
value: 0.664
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.892
- name: "exact_match,flexible-extract"
value: 0.892
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.755
- name: "exact_match,flexible-extract"
value: 0.755
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.728
- name: "exact_match,flexible-extract"
value: 0.728
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.756
- name: "exact_match,flexible-extract"
value: 0.752
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.252
- name: "exact_match,flexible-extract"
value: 0.252
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.86
- name: "exact_match,flexible-extract"
value: 0.86
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.624
- name: "exact_match,flexible-extract"
value: 0.624
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.616
- name: "exact_match,flexible-extract"
value: 0.632
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.593
- name: "exact_match,flexible-extract"
value: 0.588
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.595
- name: "exact_match,flexible-extract"
value: 0.582
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.792
- name: "exact_match,flexible-extract"
value: 0.824
limit: 250
num_fewshot: 5

View File

@ -0,0 +1,5 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

View File

@ -0,0 +1,9 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml

View File

@ -0,0 +1,46 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo
}
while getopts "m:b:l:f:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -0,0 +1,51 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.3
usage() {
echo``
echo "Runs lm eval harness on GSM8k using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -b - batch size to run the evaluation at"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE

View File

@ -0,0 +1,59 @@
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@ -0,0 +1,55 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""
import os
from pathlib import Path
import lm_eval
import numpy
import yaml
RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={TP_SIZE}," \
f"add_bos_token=true"
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto")
return results
def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)

View File

@ -0,0 +1,152 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `nightly-benchmarks` label.
## Performance benchmark details
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
Here is an example of one test inside `latency-tests.json`:
```json
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
]
```
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-benchmarks-suite.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
]
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -0,0 +1,61 @@
steps:
- label: "Wait for container to be ready"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
- image: badouralix/curl-jq
command:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- label: "H100"
agents:
queue: H100
plugins:
- docker#v5.11.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@ -0,0 +1,45 @@
# Nightly benchmark
The main goal of this benchmarking is two-fold:
- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
## Docker images
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
## Hardware
One AWS node with 8x NVIDIA A100 GPUs.
## Workload description
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
## Plots
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
## Results
{nightly_results_benchmarking_table}

View File

@ -0,0 +1,120 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 trt benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.5.0
<<: *common_container_settings
- label: "A100 vllm benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:latest
<<: *common_container_settings
- label: "A100 tgi benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: ghcr.io/huggingface/text-generation-inference:2.1
<<: *common_container_settings
- wait
- label: "Plot"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- wait

View File

@ -0,0 +1,376 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
kill_gpu_processes() {
# kill all processes on GPU.
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
if [ -z "$pids" ]; then
echo "No GPU processes found."
else
for pid in $pids; do
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
echo "All GPU processes have been killed."
fi
# Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3
# since we are in container anyway
pkill -9 -f python
pkill -9 -f python3
# waiting for GPU processes to be fully killed
# loop while nvidia-smi returns any processes
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
sleep 1
echo "Waiting for GPU processes to be killed"
done
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
# Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
if command -v buildkite-agent >/dev/null 2>&1; then
BUILDKITE_AGENT_COMMAND="buildkite-agent"
elif [ -f /workspace/buildkite-agent ]; then
BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
latency_test_file=$1
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
latency_command="python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg latency "$latency_command" \
--arg gpu "$gpu_type" \
'{
latency_command: $latency,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$latency_command"
kill_gpu_processes
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
throughput_command="python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# recoding benchmarking command ang GPU command
jq_output=$(jq -n \
--arg command "$throughput_command" \
--arg gpu "$gpu_type" \
'{
throughput_command: $command,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
# run the benchmark
eval "$throughput_command"
kill_gpu_processes
done
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
echo "In serving-test.json, test_name must start with \"serving_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
continue
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname."
continue
fi
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
server_pid=$!
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
check_gpus
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
# get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn of the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
# prepare for benchmarking
cd benchmarks || exit 1
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,76 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
main() {
check_gpus
check_hf_token
df -h
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
cd $VLLM_SOURCE_CODE_LOC/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# run lmdeploy
if which lmdeploy >/dev/null; then
echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
exit 0
fi
# run tgi
if [ -e /tgi-entrypoint.sh ]; then
echo "tgi is available, redirect to run-tgi-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
exit 0
fi
# run trt
if which trtllm-build >/dev/null; then
echo "trtllm is available, redirect to run-trt-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
exit 0
fi
# run vllm
if [ -e /vllm-workspace ]; then
echo "vllm is available, redirect to run-vllm-nightly.sh"
bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
exit 0
fi
}
main "$@"

View File

@ -0,0 +1,192 @@
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"avg_latency": "Mean latency (ms)",
# "P10": "P10 (s)",
# "P25": "P25 (s)",
"P50": "Median latency (ms)",
# "P75": "P75 (s)",
# "P90": "P90 (s)",
"P99": "P99 latency (ms)",
}
# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "num_requests": "# of req.",
# "total_num_tokens": "Total # of tokens",
# "elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
# "tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
# "completed": "# of req.",
"request_throughput": "Tput (req/s)",
# "input_throughput": "Input Tput (tok/s)",
# "output_throughput": "Output Tput (tok/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
# "mean_tpot_ms": "Mean TPOT (ms)",
# "median_tpot_ms": "Median",
# "p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
}
def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
def results_to_json(latency, throughput, serving):
return json.dumps({
'latency': latency.to_dict(),
'throughput': throughput.to_dict(),
'serving': serving.to_dict()
})
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# get different percentiles
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
latency_results.append(raw_result)
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
throughput_results.append(raw_result)
continue
print(f"Skipping {test_file}")
latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(latency_results, throughput_results,
serving_results)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(
latency_column_mapping.keys())].rename(
columns=latency_column_mapping)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
if not throughput_results.empty:
throughput_results = throughput_results[list(
throughput_results_column_mapping.keys())].rename(
columns=throughput_results_column_mapping)
processed_results_json = results_to_json(latency_results,
throughput_results,
serving_results)
# get markdown tables
latency_md_table = tabulate(latency_results,
headers='keys',
tablefmt='pipe',
showindex=False)
serving_md_table = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/tests/descriptions.md")
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json)
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
results = latency_results.to_dict(
orient='records') + throughput_results.to_dict(
orient='records') + serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -0,0 +1,26 @@
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer")
parser.add_argument("--model",
type=str,
required=True,
help="Name of the model")
parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@ -0,0 +1,6 @@
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@ -0,0 +1,102 @@
#!/bin/bash
server_params=$1
common_params=$2
model_path=$(echo "$common_params" | jq -r '.model')
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
cd ~
rm -rf models
mkdir -p models
cd models
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine
cd ~
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo
tensorrt_demo_dir=$(pwd)
# make sure the parameter inside tensorrt_demo is consistent to envvar
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
cd /
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
python ../quantization/quantize.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path} \
--qformat fp8 \
--kv_cache_dtype fp8 \
--calib_size 2
else
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
fi
trtllm-build \
--checkpoint_dir=${trt_model_path} \
--gpt_attention_plugin=${model_dtype} \
--gemm_plugin=${model_dtype} \
--remove_input_padding=enable \
--paged_kv_cache=enable \
--tp_size=${model_tp_size} \
--max_batch_size=${max_batch_size} \
--max_input_len=${max_input_len} \
--max_output_len=${max_output_len} \
--max_num_tokens=${max_output_len} \
--opt_num_tokens=${max_output_len} \
--output_dir=${trt_engine_path}
cd /tensorrtllm_backend/triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--model_repo=/tensorrtllm_backend/triton_model_repo &

View File

@ -0,0 +1,40 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# generate figures
python3 -m pip install tabulate pandas matplotlib
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
--description $description \
--results-folder results/
# upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@ -0,0 +1,135 @@
import argparse
import json
import math
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description=
'Parse command line arguments for summary-nightly-results script.')
parser.add_argument('--results-folder',
type=str,
required=True,
help='The folder where the results are stored.')
parser.add_argument('--description',
type=str,
required=True,
help='Description of the results.')
args = parser.parse_args()
return args
def main(args):
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
description = f.read()
description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
plt.rcParams.update({'font.size': 20})
# plot results
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
fig.subplots_adjust(hspace=1)
methods = ["vllm", "trt", "lmdeploy", "tgi"]
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
for j, metric in enumerate(["TTFT", "ITL"]):
means, stds = [], []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
means.append(0.)
stds.append(0.)
else:
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
std = filtered_df[f"Std {metric} (ms)"].values[0]
success = filtered_df["Successful req."].values[0]
stds.append(std / math.sqrt(success))
print(model, metric)
print(means, stds)
ax = axes[i, j + 1]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
means,
yerr=stds,
capsize=10,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel(f"{metric} (ms)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
metric = "Tput"
j = 0
if True:
tputs = []
for method in methods:
target = df['Test name'].str.contains(model)
target = target & df['Engine'].str.contains(method)
filtered_df = df[target]
if filtered_df.empty:
tputs.append(0.)
else:
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
tputs.append(input_tput + output_tput)
print(model, metric)
print(tputs)
ax = axes[i, j]
bars = ax.bar(
["vllm", "trt", "lmdeploy", "tgi"],
tputs,
)
for idx, bar in enumerate(bars):
bar.set_color(bar_colors[idx])
ax.set_ylim(bottom=0)
ax.set_ylabel("Tput (token/s)")
ax.set_title(f"{model} {metric}")
ax.grid(axis='y')
fig.tight_layout()
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
if __name__ == '__main__':
args = parse_arguments()
main(args)

View File

@ -0,0 +1,218 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill lmdeploy || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append lmdeploy to the test name
test_name=lmdeploy_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
# prepare tokenizer
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "lmdeploy server is up and running."
else
echo ""
echo "lmdeploy failed to start within the timeout period."
break
fi
# get model name
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend lmdeploy \
--tokenizer /tokenizer_cache \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--model \"$model_name\" \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "lmdeploy" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,216 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,214 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,221 @@
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"

View File

@ -0,0 +1,76 @@
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "Successful req.",
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"engine": "Engine",
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(
serving_column_mapping.keys())].rename(
columns=serving_column_mapping)
serving_md_table_with_headers = tabulate(serving_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
# document benchmarking results in markdown
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write('\n')
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results))

View File

@ -0,0 +1,17 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0
fi
echo "Waiting for image to be available..."
retries=$((retries + 1))
sleep 5
done
exit 1

View File

@ -0,0 +1,67 @@
## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
### Latency benchmarking results
{latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
### Serving benchmarking results
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
import json
import pandas as pd
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@ -0,0 +1,32 @@
[
{
"test_name": "latency_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
},
{
"test_name": "latency_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15
}
}
]

View File

@ -0,0 +1,116 @@
[
{
"test_name": "llama8B_tp1",
"qps_list": [4],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tp": 1,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "llama70B_tp4",
"qps_list": [2],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
},
{
"test_name": "mixtral8x7B_tp2",
"qps_list": [2],
"common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tp": 2,
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500,
"port": 8000
},
"lmdeploy_server_parameters": {
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "float16",
"max_batch_size": 256,
"max_input_len": 4096,
"max_output_len": 4096,
"trt_llm_version": "r24.04"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": ""
},
"vllm_client_parameters": {
}
}
]

View File

@ -0,0 +1,59 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -0,0 +1,35 @@
[
{
"test_name": "throughput_llama8B_tp1",
"parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama70B_tp4",
"parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -0,0 +1,19 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"

View File

@ -1,38 +1,85 @@
# This script build the ROCm docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
# This script runs test inside the corresponding ROCm docker container.
set -ex
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .
# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${@}"

View File

@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@ -50,11 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
@ -69,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
/workspace/buildkite-agent artifact upload openai-*.json
rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"

View File

@ -3,12 +3,38 @@
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest Pillow protobuf
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"

105
.buildkite/run-multi-node-test.sh Executable file
View File

@ -0,0 +1,105 @@
#!/bin/bash
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
WORKING_DIR=$1
NUM_NODES=$2
NUM_GPUS=$3
DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done
start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}
start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# start the container in detached mode
# things to note:
# 1. --shm-size=10.24gb is required. don't use --ipc=host
# 2. pass HF_TOKEN to the container
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
if [ $node -eq 0 ]; then
# start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
# wait for the cluster to be ready
sleep 10
# print the cluster status
docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
# important: iterate in reverse order to start the head node last
# we start the worker nodes first, in detached mode, and then start the head node
# in the foreground, so that the output of the head node is visible in the buildkite logs
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes

View File

@ -0,0 +1,51 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

14
.buildkite/run-openvino-test.sh Executable file
View File

@ -0,0 +1,14 @@
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .
# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py

View File

@ -0,0 +1,16 @@
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py

View File

@ -0,0 +1,14 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

View File

@ -1,97 +1,290 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be feed into Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.
# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
steps:
- label: Async Engine, Inputs, Utils, Worker Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Metrics, Tracing Test
fast_check: true
fast_check_only: true
commands:
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai" # Tracing
- pytest -v -s tracing
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
#mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
command: pytest -v -s basic_correctness
mirror_hardwares: [amd]
fast_check: true
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
command: pytest -v -s core
mirror_hardwares: [amd]
fast_check: true
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py
- label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pipeline_parallel.py
- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py test_config.py
mirror_hardwares: [amd]
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Entrypoints Test
command: pytest -v -s entrypoints
fast_check: true
mirror_hardwares: [amd]
commands:
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- pip install awscli
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- label: Kernels Test %N
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
#mirror_hardwares: [amd]
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- label: Llava Test
- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- pytest -v -s models -m vlm
- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
- label: Samplers Test
#mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
- label: Utils Test
commands:
- pytest -v -s test_utils.py
- pytest -v -s test_embedded_commit.py
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
command: pytest -v -s spec_decode
#mirror_hardwares: [amd]
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode
- label: LoRA Test %N
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
#mirror_hardwares: [amd]
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py
- label: Tensorizer Test
#mirror_hardwares: [amd]
soft_fail: true
fast_check: true
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- label: Metrics Test
mirror_hardwares: [amd]
command: pytest -v -s metrics
- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: LM Eval Large Models
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- label: Documentation Build
working_dir: "/vllm-workspace/docs"
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py

View File

@ -1,69 +0,0 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh
- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh
- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
{% if not step.no_gpu %}
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}

26
.clang-format Normal file
View File

@ -0,0 +1,26 @@
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80
# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left
# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false
# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash
IncludeCategories:
- Regex: '^<'
Priority: 4
- Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
Priority: 3
- Regex: '^"(qoda|\.\.)/'
Priority: 2
- Regex: '.*'
Priority: 1

2
.github/FUNDING.yml vendored Normal file
View File

@ -0,0 +1,2 @@
github: [vllm-project]
open_collective: [vllm]

View File

@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`

View File

@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`

View File

@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
@ -57,6 +58,10 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.

View File

@ -39,6 +39,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`

49
.github/ISSUE_TEMPLATE/750-RFC.yml vendored Normal file
View File

@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]
body:
- type: markdown
attributes:
value: >
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
attributes:
label: Motivation.
description: >
The motivation of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Proposed Change.
description: >
The proposed change of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Feedback Period.
description: >
The feedback period of the RFC. Usually at least one week.
validations:
required: false
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC.
validations:
required: false
- type: textarea
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -0,0 +1,21 @@
name: Add label on auto-merge enabled
on:
pull_request_target:
types:
- auto_merge_enabled
jobs:
add-label-on-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
steps:
- name: Add label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
labels: ['ready']
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

42
.github/workflows/clang-format.yml vendored Normal file
View File

@ -0,0 +1,42 @@
name: clang-format
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
clang-format:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install clang-format==18.1.5
- name: Running clang-format
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
'csrc/punica/bgmv/bgmv_config.h'
'csrc/punica/bgmv/bgmv_impl.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh'
'csrc/punica/punica_ops.cu'
'csrc/punica/type_convert.h'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror

53
.github/workflows/mypy.yaml vendored Normal file
View File

@ -0,0 +1,53 @@
name: mypy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.9.0
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
mypy tests --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/inputs --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/platforms --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml

View File

@ -49,13 +49,19 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
@ -76,6 +82,8 @@ jobs:
- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)

21
.github/workflows/reminder_comment.yml vendored Normal file
View File

@ -0,0 +1,21 @@
name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .

View File

@ -9,7 +9,7 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
$python_executable -m pip install wheel packaging
$python_executable -m pip install -r requirements.txt
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1

View File

@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
prerelease: false,
prerelease: true,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});

View File

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}

6
.gitignore vendored
View File

@ -1,3 +1,6 @@
# vllm commit id, generated by setup.py
vllm/commit_id.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -70,6 +73,8 @@ instance/
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
# PyBuilder
.pybuilder/
@ -181,6 +186,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h
# Benchmark dataset
*.json

View File

@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -19,7 +20,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -31,9 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
#
# Try to find python package with an executable that exactly matches
@ -66,19 +66,6 @@ endif()
#
find_package(Torch REQUIRED)
#
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually with `find_library` using torch's
# installed library path.
#
find_library(torch_python_LIBRARY torch_python PATHS
"${TORCH_INSTALL_PREFIX}/lib")
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@ -111,18 +98,11 @@ elseif(HIP_FOUND)
# .hip extension automatically, HIP must be enabled explicitly.
enable_language(HIP)
# ROCm 5.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
endif()
# ROCm 6.x
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@ -167,15 +147,50 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
)
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/custom_all_reduce.cu")
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
endif()
define_gpu_extension_target(
@ -185,6 +200,8 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
#
@ -192,7 +209,7 @@ define_gpu_extension_target(
#
set(VLLM_MOE_EXT_SRC
"csrc/moe/moe_ops.cpp"
"csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu")
define_gpu_extension_target(
@ -202,6 +219,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
#
@ -210,24 +228,13 @@ define_gpu_extension_target(
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
"csrc/punica/punica_ops.cc")
"csrc/punica/punica_ops.cu"
"csrc/punica/torch_bindings.cpp")
#
# Copy GPU compilation flags+update for punica
@ -251,6 +258,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
@ -261,6 +271,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
SOURCES ${VLLM_PUNICA_EXT_SRC}
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
else()
message(WARNING "Unable to create _punica_C target because none of the "
@ -285,9 +296,7 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)

View File

@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciat
### Build from source
```bash
pip install -r requirements.txt
pip install -e . # This may take several minutes.
```
@ -30,6 +29,8 @@ pip install -e . # This may take several minutes.
```bash
pip install -r requirements-dev.txt
# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests

View File

@ -1,55 +1,87 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
ENV DEBIAN_FRONTEND=noninteractive
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git
&& apt-get install -y git curl sudo
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
python3 -m pip install -r requirements-cuda.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
COPY requirements-mamba.txt requirements-mamba.txt
RUN python3 -m pip install packaging
RUN python3 -m pip install -r requirements-mamba.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build
ARG PYTHON_VERSION=3.10
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
python3 -m pip install -r requirements-build.txt
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# copy input files
# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements.txt requirements.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
COPY vllm vllm
# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@ -59,78 +91,140 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ "$CUDA_VERSION" = "11.8.0" ]; then \
export SCCACHE_BUCKET=vllm-build-sccache-2; \
else \
export SCCACHE_BUCKET=vllm-build-sccache; \
fi \
&& export SCCACHE_REGION=us-west-2 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py build_ext --inplace
--mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
#################### EXTENSION Build IMAGE ####################
#################### FLASH_ATTENTION Build IMAGE ####################
FROM dev as flash-attn-builder
#################### DEV IMAGE ####################
FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### MAMBA Build IMAGE ####################
FROM dev as mamba-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2
WORKDIR /usr/src/mamba
COPY requirements-mamba.txt requirements-mamba.txt
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
RUN pip --verbose wheel -r requirements-mamba.txt \
--no-build-isolation --no-deps --no-cache-dir
#################### FLASH_ATTENTION Build IMAGE ####################
#################### MAMBA Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10
WORKDIR /vllm-workspace
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
&& python3 --version
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim curl libibverbs-dev
# Install pip s.t. it will be compatible with our PYTHON_VERSION
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
RUN python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# note that this uses vllm installed by `pip`
FROM vllm-base AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# ignore build dependencies installation because we are using pre-complied extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
# libnccl required for ray
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /workspace
COPY requirements.txt requirements.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
python3 -m pip install -r requirements-dev.txt
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### RUNTIME BASE IMAGE ####################
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
pip install accelerate hf_transfer 'modelscope!=1.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@ -1,20 +1,41 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04
FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
RUN apt-get update -y \
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
CMD ["/bin/bash"]
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

36
Dockerfile.neuron Normal file
View File

@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..
CMD ["/bin/bash"]

26
Dockerfile.openvino Normal file
View File

@ -0,0 +1,26 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
FROM ubuntu:20.04 AS dev
RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace
# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/
COPY vllm/ /workspace/vllm/vllm
COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
CMD ["/bin/bash"]

22
Dockerfile.ppc64le Normal file
View File

@ -0,0 +1,22 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
# These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /vllm-workspace
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,32 +1,33 @@
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
# Default ROCm 6.1 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
FROM $BASE_IMAGE
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
RUN echo "Base image is $BASE_IMAGE"
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
ARG FA_BRANCH="3d2b6f5"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
# Whether to install CK-based flash-attention
# If 0, will not install flash-attention
ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2"
# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
@ -37,59 +38,145 @@ RUN apt-get update && apt-get install -y \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240710 \
torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
mkdir libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
&& python3 setup.py install \
&& cd ..; \
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
&& python3 -m pip wheel . --wheel-dir=/install
### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
# If a suitable wheel exists, we download it instead of building FA
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
else \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
COPY ./ /app/vllm
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cd ..
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \
&& python3 setup.py develop
# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
mkdir -p libs \
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
if ls libs/*.whl; then \
python3 -m pip install libs/*.whl; fi
CMD ["/bin/bash"]

20
Dockerfile.tpu Normal file
View File

@ -0,0 +1,20 @@
ARG NIGHTLY_DATE="20240713"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
WORKDIR /workspace
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install NumPy 1 instead of NumPy 2.
RUN pip install "numpy<2"
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"]

22
Dockerfile.xpu Normal file
View File

@ -0,0 +1,22 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-xpu.txt
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"]

View File

@ -1,5 +1,9 @@
include LICENSE
include requirements.txt
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt
recursive-include cmake *

101
README.md
View File

@ -16,25 +16,14 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**
We are thrilled to announce our third vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!
---
*Latest News* 🔥
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
@ -50,50 +39,27 @@ vLLM is fast with:
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs and AMD GPUs
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
vLLM seamlessly supports many Hugging Face models, including the following architectures:
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Multi-modal LLMs (e.g., LLaVA)
- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Getting Started
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@ -101,9 +67,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
## Getting Started
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@ -113,6 +77,35 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
- a16z
- AMD
- Anyscale
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Trainy
- UC Berkeley
- UC San Diego
- ZhenFund
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):

View File

@ -4,10 +4,13 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional
from typing import List, Optional, Union
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@ -27,8 +30,8 @@ class RequestFuncInput:
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0
ttft: float = 0 # Time to first token
latency: float = 0.0
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
prompt_len: int = 0
@ -58,23 +61,28 @@ async def async_request_tgi(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
chunk = remove_prefix(chunk.decode("utf-8"), "data:")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@ -88,6 +96,9 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@ -119,23 +130,25 @@ async def async_request_trt_llm(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = remove_prefix(chunk.decode("utf-8"), "data:")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@ -147,11 +160,10 @@ async def async_request_trt_llm(
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.generated_text = json.loads(data)["text_output"]
output.success = True
else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@ -195,7 +207,7 @@ async def async_request_deepspeed_mii(
output.generated_text = parsed_resp["text"][0]
output.success = True
else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@ -213,8 +225,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"v1/completions"
), "OpenAI Completions API URL must end with 'v1/completions'."
"completions"
), "OpenAI Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -234,38 +246,38 @@ async def async_request_openai_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# do not want to include as inter-token-latency
elif data.get("usage", None) is None:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
@ -273,6 +285,9 @@ async def async_request_openai_completions(
output.generated_text = generated_text
output.success = True
output.latency = latency
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@ -289,8 +304,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"v1/chat/completions"
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
"chat/completions"
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@ -315,19 +330,20 @@ async def async_request_openai_chat_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@ -337,7 +353,7 @@ async def async_request_openai_chat_completions(
delta = data["choices"][0]["delta"]
if delta.get("content", None):
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@ -354,7 +370,7 @@ async def async_request_openai_chat_completions(
output.success = True
output.latency = latency
else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@ -374,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path)
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
trust_remote_code=trust_remote_code)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
@ -382,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
}

View File

@ -1,14 +1,19 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import json
import time
from pathlib import Path
from typing import Optional
from typing import List, Optional
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace):
@ -16,19 +21,33 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size)
llm = LLM(
model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
speculative_draft_tensor_parallel_size=\
args.speculative_draft_tensor_parallel_size,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
)
sampling_params = SamplingParams(
n=args.n,
@ -42,7 +61,9 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@ -53,13 +74,13 @@ def main(args: argparse.Namespace):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@ -67,7 +88,8 @@ def main(args: argparse.Namespace):
return latency
print("Warming up...")
run_to_completion(profile_dir=None)
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None)
if args.profile:
profile_dir = args.profile_result_dir
@ -83,18 +105,39 @@ def main(args: argparse.Namespace):
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')
# Output JSON results if specified
if args.output_json:
results = {
"avg_latency": np.mean(latencies),
"latencies": latencies.tolist(),
"percentiles": dict(zip(percentages, percentiles.tolist())),
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', 'gptq', 'squeezellm', None],
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
@ -105,13 +148,23 @@ if __name__ == '__main__':
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
parser.add_argument('--num-iters-warmup',
type=int,
default=10,
help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
type=int,
default=3,
default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
@ -125,12 +178,23 @@ if __name__ == '__main__':
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
"--kv-cache-dtype",
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8_e5m2'],
default='auto',
help=
'Data type for kv cache storage. If "auto", will use model data type.')
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
@ -144,19 +208,23 @@ if __name__ == '__main__':
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
type=bool,
default=False,
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
@ -167,5 +235,51 @@ if __name__ == '__main__':
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args()
main(args)

View File

@ -1,7 +1,7 @@
import argparse
import time
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def main(args):
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=100)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
test_prefix(
llm=llm,
prompts=prompts[:1],
prompts=prompts,
sampling_params=sampling_params,
)
@ -42,11 +44,19 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
parser = FlexibleArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)

View File

@ -2,8 +2,8 @@
On the server side, run one of the following commands:
vLLM OpenAI API server
python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
(TGI backend)
@ -17,6 +17,10 @@ On the client side, run:
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
"""
import argparse
import asyncio
@ -27,7 +31,7 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Tuple
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@ -35,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
@dataclass
@ -48,17 +60,26 @@ class BenchmarkMetrics:
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float
def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@ -68,38 +89,32 @@ def sample_sharegpt_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# some of these will be filtered out, so sample more than we need
sampled_indices = random.sample(range(len(dataset)),
int(num_requests * 1.2))
dataset = [dataset[i] for i in sampled_indices]
# Shuffle the dataset.
random.shuffle(dataset)
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Filter out too long sequences.
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
# This is because TGI causes errors when the input or output length
# is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
return filtered_dataset
def sample_sonnet_requests(
@ -110,7 +125,9 @@ def sample_sonnet_requests(
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int]]:
assert input_len > prefix_len, "input_len must be greater than prefix_len."
assert (
input_len > prefix_len
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
# Load the dataset.
with open(dataset_path) as f:
@ -131,8 +148,9 @@ def sample_sonnet_requests(
base_message, add_generation_prompt=True, tokenize=False)
base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
assert (input_len > base_prompt_offset
), f"Please set 'args.input-len' higher than {base_prompt_offset}."
assert (
input_len > base_prompt_offset
), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
num_input_lines = round(
(input_len - base_prompt_offset) / average_poem_len)
@ -140,7 +158,7 @@ def sample_sonnet_requests(
# prompt are fixed poem lines.
assert (
prefix_len > base_prompt_offset
), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
num_prefix_lines = round(
(prefix_len - base_prompt_offset) / average_poem_len)
@ -169,6 +187,31 @@ def sample_sonnet_requests(
return sampled_requests
def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))
return input_requests
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
@ -180,6 +223,7 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
@ -192,24 +236,37 @@ def calculate_metrics(
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens = []
actual_output_lens: List[int] = []
total_input = 0
completed = 0
tpots = []
ttfts = []
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
itls += outputs[i].itl
ttfts.append(outputs[i].ttft)
completed += 1
else:
actual_output_lens.append(0)
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@ -220,10 +277,16 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots) * 1000,
median_tpot_ms=np.median(tpots) * 1000,
p99_tpot_ms=np.percentile(tpots, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)
return metrics, actual_output_lens
@ -241,16 +304,34 @@ async def benchmark(
disable_tqdm: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS.get(backend)
request_func = ASYNC_REQUEST_FUNCS[backend]
else:
raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter()
tasks = []
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@ -268,7 +349,7 @@ async def benchmark(
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if not disable_tqdm:
if pbar is not None:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
@ -305,6 +386,10 @@ async def benchmark(
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
print("=" * 50)
result = {
@ -317,10 +402,16 @@ async def benchmark(
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
@ -358,6 +449,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sharegpt":
@ -365,6 +457,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sonnet":
@ -373,9 +466,9 @@ def main(args: argparse.Namespace):
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt, prompt_len, output_len)
@ -388,15 +481,24 @@ def main(args: argparse.Namespace):
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt_formatted, prompt_len, output_len)
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
@ -415,7 +517,7 @@ def main(args: argparse.Namespace):
# Save config and results to json
if args.save_result:
result_json = {}
result_json: Dict[str, Any] = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -448,6 +550,8 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile:
@ -455,7 +559,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
@ -488,7 +592,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet"],
choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@ -505,7 +609,7 @@ if __name__ == "__main__":
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.",
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--best-of",
@ -521,6 +625,12 @@ if __name__ == "__main__":
default=1000,
help="Number of prompts to process.",
)
parser.add_argument(
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,
@ -542,6 +652,27 @@ if __name__ == "__main__":
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
type=float,
@ -582,6 +713,15 @@ if __name__ == "__main__":
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
parser.add_argument(
"--result-filename",
type=str,
default=None,
help="Specify the filename to save benchmark json results."
"If not specified, results will be saved in "
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
" format.",
)
args = parser.parse_args()
main(args)

View File

@ -10,6 +10,10 @@ from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def sample_requests(
dataset_path: str,
@ -29,22 +33,23 @@ def sample_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
if fixed_output_len is not None:
output_len = fixed_output_len
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out too long sequences.
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
@ -53,9 +58,7 @@ def sample_requests(
continue
filtered_dataset.append((prompt, prompt_len, output_len))
# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
return filtered_dataset
def run_vllm(
@ -72,47 +75,56 @@ def run_vllm(
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir)
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
)
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
# FIXME(woosuk): Do not use internal method.
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
start = time.perf_counter()
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
@ -212,14 +224,15 @@ def main(args: argparse.Namespace):
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
args.quantization, args.tensor_parallel_size,
args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager,
args.kv_cache_dtype, args.device,
args.enable_prefix_caching,
args.gpu_memory_utilization, args.download_dir)
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.download_dir, args.load_format)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -235,9 +248,21 @@ def main(args: argparse.Namespace):
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
@ -259,7 +284,7 @@ if __name__ == "__main__":
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=['awq', 'gptq', 'squeezellm', None],
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
@ -304,27 +329,82 @@ if __name__ == "__main__":
action="store_true",
help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
'--kv-cache-dtype',
type=str,
choices=["auto", "fp8_e5m2"],
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--distributed-executor-backend',
choices=['ray', 'mp'],
default=None,
help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -0,0 +1,353 @@
import argparse
import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
# helpers
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
if dtype == torch.int8:
return to_int8(a), to_int8(b)
if dtype == torch.float8_e4m3fn:
return to_fp8(a), to_fp8(b)
raise ValueError("unsupported dtype")
# impl
def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch.mm(a, b)
def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype)
def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=out_dtype,
use_fast_accum=True)
def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
# bench
def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
sub_label: str, fn: Callable, description: str) -> TMeasurement:
min_run_time = 1
globals = {
"a": a,
"b": b,
"scale_a": scale_a,
"scale_b": scale_b,
"out_dtype": out_dtype,
"fn": fn,
}
return TBenchmark.Timer(
stmt="fn(a, b, scale_a, scale_b, out_dtype)",
globals=globals,
label=label,
sub_label=sub_label,
description=description,
).blocked_autorange(min_run_time=min_run_time)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
timers = []
# pytorch impl
timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# cutlass impl
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
return timers
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
pytorch_fp8_impl_fast_accum,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
# cutlass impl: bf16 output
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
# cutlass impl: fp16 output
timers.append(
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
return timers
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
# runner
def print_timers(timers: Iterable[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
print_timers(data)
# pickle all the results
timestamp = int(time.time()) if timestamp is None else timestamp
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
n = len(dim_sizes)
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
KNs.append(KN)
return KNs
model_bench_data = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
Ms = args.batch_sizes
KNs = model_shapes(model, tp_size)
MKNs = []
for m in Ms:
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, MKNs)
model_bench_data.append(data)
# Print all results
for data, model_tp in zip(model_bench_data, models_tps):
model, tp_size = model_tp
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
print_timers(data)
timestamp = int(time.time())
all_data = []
for d in model_bench_data:
all_data.extend(d)
# pickle all data
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
pkl.dump(all_data, f)
if __name__ == '__main__':
def to_torch_dtype(dt):
if dt == "int8":
return torch.int8
if dt == "fp8":
return torch.float8_e4m3fn
raise ValueError("unsupported dtype")
parser = FlexibleArgumentParser(
description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""", # noqa: E501
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--dtype",
type=to_torch_dtype,
required=True,
help="Available options are ['int8', 'fp8']")
subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
square_parser.add_argument("--dim-increment", type=int, required=True)
square_parser.set_defaults(func=run_square_bench)
range_parser = subparsers.add_parser("range_bench")
range_parser.add_argument("--dim-start", type=int, required=True)
range_parser.add_argument("--dim-end", type=int, required=True)
range_parser.add_argument("--dim-increment", type=int, required=True)
range_parser.add_argument("--m-constant", type=int, default=None)
range_parser.add_argument("--n-constant", type=int, default=None)
range_parser.add_argument("--k-constant", type=int, default=None)
range_parser.set_defaults(func=run_range_bench)
model_parser = subparsers.add_parser("model_bench")
model_parser.add_argument("--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys())
model_parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
model_parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)

View File

@ -0,0 +1,43 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
}

View File

@ -0,0 +1,302 @@
import os
import sys
from typing import Optional
import torch
import torch.nn.functional as F
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
from vllm.utils import FlexibleArgumentParser
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def torch_mult(
input: torch.Tensor, # [..., in_features]
weights: torch.Tensor,
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
) -> torch.Tensor:
output = F.linear(input, weights)
return output
def dequant_out_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
if bias is None:
output = F.linear(input, weights, bias)
orig_shape = output.shape
flattened_output = output.view(-1, output.size(-1))
f_scales = scales.view(-1, scales.shape[0])
b_scales = f_scales.expand(flattened_output.shape[0], -1)
flattened_output *= b_scales
return flattened_output.view(orig_shape)
else:
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_weight_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_no_scale(
input: torch.Tensor, # [..., in_features]
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
codebooks: torch.
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
return F.linear(input, weights, bias)
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
n = int(parts.sum().item())
device = torch.device('cuda:0')
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device)
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device)
count = 0
for index in range(16):
for i in range(8):
for book in range(nbooks):
codebooks[book, index, 0, i] = count * (10**book)
count += 1
print("codes shape", codes.shape)
for i in range(16):
for book in range(nbooks):
codes[0, i, book] = i
codes[0, -i, book] = i
weights = dequantize_weight(codes, codebooks, None)
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
print("weights shape:", weights.shape)
print("weights2 shape:", weights2.shape)
print("weights are:", weights)
print("weights2 are:", weights2)
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
print("last 128 weights are", weights[0, -128:])
print("last 128 weights2 are:", weights2[0, -128:])
def main():
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument("--nbooks",
type=int,
default=1,
help="Number of codebooks (default: 1)")
parser.add_argument("--bits",
type=int,
default=16,
help="Number of bits per code element (default: 16)")
parser.add_argument(
"--test",
type=bool,
default=False,
help="Run the decompression/dequant tester rather than benchmarking "
"(default: False)")
# Parse the arguments
args = parser.parse_args()
# Extract values
nbooks = args.nbooks
bits = args.bits
if args.test:
dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
return
# Otherwise, benchmark.
methods = [
ops.aqlm_gemm,
dequant_out_scale,
generic_dequantize_gemm,
optimized_dequantize_gemm,
dequant_weight_scale,
torch_mult,
dequant_no_scale,
]
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
print(f"writing benchmarks to file {filename}")
with open(filename, "w") as f:
sys.stdout = f
print('m | k | n | n parts', end='')
for method in methods:
print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
print('')
# These are reasonable prefill sizes.
ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
(4096, (11008, 11008)), (11008, (4096, )))
# reasonable ranges for m.
for m in [
1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
128, 256, 512, 1024, 1536, 2048, 3072, 4096
]:
print(f'{m}', file=sys.__stdout__)
for ksp in ksandpartions:
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
methods)
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
methods):
# I didn't see visible improvements from increasing these, but feel free :)
num_warmup_trials = 1
num_trials = 1
num_calls = 100
# warmup.
for method in methods:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
n = parts.sum().item()
print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
for method in methods:
best_time_us = 1e20
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
kernel_dur_us = 1000 * kernel_dur_ms
if kernel_dur_us < best_time_us:
best_time_us = kernel_dur_us
print(f' | {kernel_dur_us:.0f}', end='')
print('')
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
nbooks: int, bits: int, method) -> float:
n = int(parts.sum().item())
device = torch.device('cuda:0')
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device)
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device)
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
# for comparison to just a pytorch mult.
weights = torch.randn((n, k), dtype=torch.float16, device=device)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
if method is torch_mult:
for i in range(num_calls):
torch_mult(input, weights, scales)
else:
for i in range(num_calls):
method(input, codes, codebooks, scales, parts, None)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())

View File

@ -0,0 +1,237 @@
from typing import List
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True]
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
group_size, size_m, size_k, size_n))
print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda()
a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
# Marlin quant
(
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_g_idx,
marlin_sort_indices,
marlin_rand_perm,
) = marlin_quantize(b, num_bits, group_size, act_order)
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
# GPTQ quant
(w_ref, q_w, s, g_idx,
rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare
marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_MAX_PARALLEL)
globals = {
# Gen params
"num_bits": num_bits,
"group_size": group_size,
"size_m": size_m,
"size_n": size_n,
"size_k": size_k,
"a": a,
"a_tmp": a_tmp,
# Marlin params
"marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w,
"marlin_s": marlin_s,
"marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
"marlin_workspace": marlin_workspace,
"is_k_full": is_k_full,
# Marlin_24 params
"marlin_24_w_ref": marlin_24_w_ref,
"marlin_24_q_w_comp": marlin_24_q_w_comp,
"marlin_24_meta": marlin_24_meta,
"marlin_24_s": marlin_24_s,
"marlin_24_workspace": marlin_24_workspace,
# GPTQ params
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
# Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack,
}
min_run_time = 1
# Warmup pytorch
for i in range(5):
torch.matmul(a, marlin_w_ref)
results.append(
benchmark.Timer(
stmt="torch.matmul(a, marlin_w_ref)",
globals=globals,
label=label,
sub_label=sub_label,
description="pytorch_gemm",
).blocked_autorange(min_run_time=min_run_time))
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time))
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_24_gemm",
).blocked_autorange(min_run_time=min_run_time))
results.append(
benchmark.Timer(
stmt=
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time))
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: List[benchmark.Measurement] = []
for model in args.models:
for layer in WEIGHT_SHAPES[model]:
size_k = layer[0]
size_n = layer[1]
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for act_order in ACT_ORDER_OPTS:
if len(args.limit_act_order
) > 0 and act_order not in args.limit_act_order:
continue
for is_k_full in K_FULL_OPTS:
if len(args.limit_k_full
) > 0 and is_k_full not in args.limit_k_full:
continue
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
if len(args.limit_num_bits
) > 0 and num_bits not in args.limit_num_bits:
continue
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
if len(
args.limit_group_size
) > 0 and group_size not in args.limit_group_size:
continue
# For act_order, the group_size must be less than
# size_k
if act_order and (group_size == size_k
or group_size == -1):
continue
for size_m in args.batch_sizes:
bench_run(results, model, act_order, is_k_full,
num_bits, group_size, size_m, size_k,
size_n)
compare = benchmark.Compare(results)
compare.print()
# For quick benchmarking use:
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
#
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES.keys(),
)
parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
args = parser.parse_args()
main(args)

View File

@ -1,182 +0,0 @@
import json
import os
import sys
import torch
import torch.nn.functional as F
import triton
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def main():
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs, method=method)
def run_grid(bs, method):
d_model = 4096
num_total_experts = 8
top_k = 2
tp_size = 2
model_intermediate_size = 14336
num_layers = 32
num_calls = 100
num_warmup_trials = 1
num_trials = 1
configs = []
if bs <= 16:
BLOCK_SIZES_M = [16]
elif bs <= 32:
BLOCK_SIZES_M = [16, 32]
elif bs <= 64:
BLOCK_SIZES_M = [16, 32, 64]
elif bs <= 128:
BLOCK_SIZES_M = [16, 32, 64, 128]
else:
BLOCK_SIZES_M = [16, 32, 64, 128, 256]
for block_size_n in [32, 64, 128, 256]:
for block_size_m in BLOCK_SIZES_M:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": 4,
})
best_config = None
best_time_us = 1e20
for config in configs:
print(f'{tp_size=} {bs=}')
print(f'{config}')
# warmup
print('warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
print('benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
)
kernel_dur_us = 1000 * kernel_dur_ms
model_dur_ms = kernel_dur_ms * num_layers
if kernel_dur_us < best_time_us:
best_config = config
best_time_us = kernel_dur_us
print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
f'{d_model=} {model_intermediate_size=} {num_layers=}')
print("best_time_us", best_time_us)
print("best_config", best_config)
# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
model_intermediate_size // tp_size)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
with open(filename, "r") as f:
existing_content = json.load(f)
existing_content[str(bs)] = best_config
with open(filename, "w") as f:
json.dump(existing_content, f, indent=4)
f.write("\n")
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
config) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=torch.bfloat16,
)
ws = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w2s = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
dtype=torch.float32,
),
dim=-1)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=ws,
w2=w2s,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())

Some files were not shown because too many files have changed in this diff Show More