Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-05 01:44:35 +08:00)
Compare commits
432 Commits
| SHA1 | Author | Date | Message |
|---|---|---|---|
| fd47e57f4b | |||
| 203ab8f80f | |||
| 4141608c6a | |||
| dfe43a2071 | |||
| 16b24e7dcd | |||
| f519902c52 | |||
| 250e26a63e | |||
| 2b184ddd4f | |||
| 00298e092c | |||
| 89feb4c84d | |||
| ec10cb8511 | |||
| d11b46f3a5 | |||
| c6cf9295e1 | |||
| de9fb4bef8 | |||
| 8baf85e4e9 | |||
| 1a1823871d | |||
| 6cf1167c1a | |||
| f710090d8e | |||
| 7342a7d7f8 | |||
| df3dcdf49d | |||
| 36ea79079b | |||
| e808156f30 | |||
| cbc2ef5529 | |||
| 94bf9ae4e9 | |||
| f990bab2a4 | |||
| e00c094f15 | |||
| a78c6ba7c8 | |||
| fb870fd491 | |||
| 270953bafb | |||
| 9cc811c4ff | |||
| e4d652ea3e | |||
| 78c0b4166c | |||
| 21efb603f5 | |||
| 055f3270d4 | |||
| 18511aeda6 | |||
| 83ea5c72b9 | |||
| 04de9057ab | |||
| 07c11cf4d4 | |||
| f3a507f1d3 | |||
| a64e7b9407 | |||
| ce00231a8b | |||
| de895f1697 | |||
| cf25b93bdd | |||
| d5fbb8706d | |||
| cdca8994bd | |||
| ca77dd7a44 | |||
| 7dea289066 | |||
| cfaa6008e6 | |||
| 21906a6f50 | |||
| dc4aea677a | |||
| c8627cd41b | |||
| 8bfaa4e31e | |||
| 0b5b5d767e | |||
| cdc72e3c80 | |||
| 7627172bf4 | |||
| 480b7f40cf | |||
| acce7630c1 | |||
| ffc4b27ea8 | |||
| 2f4117c38e | |||
| 9ba0bd6aa6 | |||
| 2a131965a8 | |||
| bd37b9fbe2 | |||
| de24046fcd | |||
| 1874c6a1b0 | |||
| 9a94ca4a5d | |||
| cfba685bd4 | |||
| 069d3bd8d0 | |||
| a3691b6b5e | |||
| 8c746226c9 | |||
| e1faa2a598 | |||
| 80b57f00d5 | |||
| 04c12f8157 | |||
| 8eeb857084 | |||
| fa45513a51 | |||
| c0d9a98d0c | |||
| e0dbdb013d | |||
| 93cf74a8a7 | |||
| 151ef4efd2 | |||
| f19da64871 | |||
| 4f95ffee6f | |||
| 8c6de96ea1 | |||
| 18b296fdb2 | |||
| c8f26bb636 | |||
| 487678d046 | |||
| cb3b2b9ba4 | |||
| fdf59d30ea | |||
| b22b798471 | |||
| f22619fe96 | |||
| 168cab6bbf | |||
| 23fea8714a | |||
| f4dd830e09 | |||
| 5df1834895 | |||
| cfadb9c687 | |||
| 15986f598c | |||
| 53b3a33027 | |||
| dac914b0d6 | |||
| a95354a36e | |||
| 663874e048 | |||
| cc90419e89 | |||
| 27302dd584 | |||
| 0cc566ca8f | |||
| 05c531be47 | |||
| fbb74420e7 | |||
| 05d686432f | |||
| 0dcc8cbe5a | |||
| 26aa325f4f | |||
| e5dc713c23 | |||
| 36eecfbddb | |||
| 9ade8bbc8d | |||
| 22482e495e | |||
| 3d826d2c52 | |||
| 0e36fd4909 | |||
| 0f6d7a9a34 | |||
| 303d44790a | |||
| aeb37c2a72 | |||
| 3dbb215b38 | |||
| 2838d6b38e | |||
| 91add85ec4 | |||
| 9aaf14c62e | |||
| 63e39937f9 | |||
| f5d72b2fc6 | |||
| 83caf35e08 | |||
| 01843c89b8 | |||
| 19a4dd0990 | |||
| 18c2e30c57 | |||
| 19f0d25796 | |||
| f58d4fccc9 | |||
| afb050b29d | |||
| 7f60520deb | |||
| 563649aafe | |||
| 1570203864 | |||
| 22f5851b80 | |||
| 4f341bd4bf | |||
| 35bd215168 | |||
| 1fe0a4264a | |||
| bc4eb65b54 | |||
| 82f3937e59 | |||
| 7da2487591 | |||
| aaccca2b4d | |||
| 062c89e7c9 | |||
| bce324487a | |||
| 1425a1bcf9 | |||
| 1cabfcefb6 | |||
| be76e5aabf | |||
| 2ae25f79cf | |||
| 8e60afa15e | |||
| b6d7392579 | |||
| e01ab595d8 | |||
| f13a07b1f8 | |||
| 6c9ba48fde | |||
| 1fb9c1b0bf | |||
| 31f46a0d35 | |||
| 3d49776bbb | |||
| bc2ef1f77c | |||
| 2e7fe7e79f | |||
| 26a68d5d7e | |||
| d081da0064 | |||
| 5bf8789b2a | |||
| d1537039ce | |||
| cc276443b5 | |||
| e585b583a9 | |||
| 090e945e36 | |||
| e1a3f5e831 | |||
| 19d02ff938 | |||
| 39d3f8d94f | |||
| b0298aa8cc | |||
| 260024a374 | |||
| d86f6b2afb | |||
| bd429f2b75 | |||
| 18e60d7d13 | |||
| c2ec430ab5 | |||
| c5d55356f9 | |||
| 172d1cd276 | |||
| a9b15c606f | |||
| 8df2dc3c88 | |||
| 6d792d2f31 | |||
| 0e088750af | |||
| dc4e3df5c2 | |||
| 3b00b9c26c | |||
| 344cd2b6f4 | |||
| 1b49148e47 | |||
| 4b377d6feb | |||
| 71d21c73ab | |||
| ee2da3e9ef | |||
| e2f6f26e86 | |||
| b28d2104de | |||
| 93d364da34 | |||
| d9cfbc891e | |||
| 70de39f6b4 | |||
| 68988d4e0d | |||
| 520db4dbc1 | |||
| f70bccac75 | |||
| 4bb98f2190 | |||
| 7193774b1f | |||
| e2c6e0a829 | |||
| 770ec6024f | |||
| 4f1ba0844b | |||
| 873edda6cf | |||
| 64840dfae4 | |||
| 28e1299e60 | |||
| 0c4d2ad5e6 | |||
| c6f2485c82 | |||
| 300da09177 | |||
| 1c046447a6 | |||
| 8fae5ed7f6 | |||
| 3368c3ab36 | |||
| 1ac3de09cd | |||
| 3e073e66f1 | |||
| c23953675f | |||
| e3dd0692fa | |||
| fc3afc20df | |||
| b4522474a3 | |||
| ee777d9c30 | |||
| 6e0c9d6bd0 | |||
| 6da1ab6b41 | |||
| 01b6f9e1f0 | |||
| 13f9f7a3d0 | |||
| 1e7d5c01f5 | |||
| 2467b642dd | |||
| 72fc97a0f1 | |||
| 2529d09b5a | |||
| a928ded995 | |||
| cc4325b66a | |||
| 8ff7ced996 | |||
| 3f06bae907 | |||
| b8747e8a7c | |||
| 3185fb0cca | |||
| 0250dd68c5 | |||
| 88577ac928 | |||
| 530821d00c | |||
| 1a2aef3e59 | |||
| 5f7bb58427 | |||
| b05f5c9238 | |||
| 9b0e3ec970 | |||
| 86e9c8df29 | |||
| ee5f34b1c2 | |||
| f2bd246c17 | |||
| a79e522984 | |||
| 3e83c12b5c | |||
| e551ca1555 | |||
| 9b8c8ba119 | |||
| d23679eb99 | |||
| 57a0702e63 | |||
| 3dda7c2250 | |||
| 92ba7e7477 | |||
| d4a2ac8302 | |||
| c6bd70d772 | |||
| 5b59532760 | |||
| ca2b628b3c | |||
| 8ca5051b9a | |||
| 06ed2815e2 | |||
| 0e40ac9b7b | |||
| 13d88d4137 | |||
| d66ac62854 | |||
| 9dc7c6c7f3 | |||
| ec4aaad812 | |||
| 4dfdf43196 | |||
| 5e85f4f82a | |||
| 71c60491f2 | |||
| 0faab90eb0 | |||
| 0455c46ed4 | |||
| d4bf085ad0 | |||
| 0057894ef7 | |||
| 0f961b3ce9 | |||
| 7f9c8902e3 | |||
| 7c8566aa4f | |||
| b4e4eda92e | |||
| 2874bac618 | |||
| 035fa895ec | |||
| b28298f2f4 | |||
| 2940afa04e | |||
| 3b63de9353 | |||
| 260d40b5ea | |||
| 9e5ec35b1f | |||
| 18ae428a0d | |||
| de6f90a13d | |||
| 6cb748e190 | |||
| 9e99407e3c | |||
| ea4647b7d7 | |||
| e42c634acb | |||
| 9cc373f390 | |||
| 76515f303b | |||
| 855c8ae2c9 | |||
| c52ec5f034 | |||
| 02c9afa2d0 | |||
| 3118f63385 | |||
| 4c34ce8916 | |||
| 0d47bf3bf4 | |||
| d9cd78eb71 | |||
| db9120cded | |||
| b3195bc9e4 | |||
| e18749ff09 | |||
| d65798f78c | |||
| a8c1d161a7 | |||
| 7c7714d856 | |||
| 9d104b5beb | |||
| 6ffa3f314c | |||
| e351572900 | |||
| 95965d31b6 | |||
| 8110e44529 | |||
| 09deb4721f | |||
| fa0c114fad | |||
| 98f9713399 | |||
| 56c3de018c | |||
| a54ed80249 | |||
| 9855b99502 | |||
| 1009e93c5d | |||
| 1b6de8352b | |||
| cbdb252259 | |||
| 99aa4eddaf | |||
| ee2bceaaa6 | |||
| 1c1bb388e0 | |||
| 546034b466 | |||
| cca61642e0 | |||
| 5ce45eb54d | |||
| 5478c4b41f | |||
| 47f5e03b5b | |||
| 2759a43a26 | |||
| 5d73ae49d6 | |||
| 781e3b9a42 | |||
| acd5511b6d | |||
| 837c1968f9 | |||
| a091e2da3e | |||
| fc990f9795 | |||
| 3724d5f6b5 | |||
| 50e9ec41fc | |||
| 47790f3e32 | |||
| a36e070dad | |||
| 8a0cf1ddc3 | |||
| 1ef0d2efd0 | |||
| 851725202a | |||
| 9ba0817ff1 | |||
| 18e9e1f7b3 | |||
| f57092c00b | |||
| a84e598e21 | |||
| 0a4806f0a9 | |||
| ecd7a1d5b6 | |||
| a2469127db | |||
| 06311e2956 | |||
| cab69a15e4 | |||
| 9b4a3b235e | |||
| acda0b35d0 | |||
| ba77527955 | |||
| 6821020109 | |||
| 8427550488 | |||
| 3f79bc3d1a | |||
| 40c396533d | |||
| 5ec9c0fb3c | |||
| 8f44a92d85 | |||
| 360ddbd37e | |||
| a480939e8e | |||
| d31174a4e1 | |||
| b61bd98f90 | |||
| c16369455f | |||
| 019877253b | |||
| 551ce01078 | |||
| a6c0f3658d | |||
| f2e263b801 | |||
| 1f0c75afa9 | |||
| 8a23e93302 | |||
| c6202daeed | |||
| e56bf27741 | |||
| 520ca380ae | |||
| 7de49aa86c | |||
| 42ffba11ad | |||
| 295c4730a8 | |||
| 1bf2dd9df0 | |||
| 5a60699c45 | |||
| b6c75e1cf2 | |||
| b71c956deb | |||
| f842a7aff1 | |||
| a65cb16067 | |||
| 3fd2b0d21c | |||
| d394787e52 | |||
| 775f00f81e | |||
| 8baa454937 | |||
| 73202dbe77 | |||
| 7015417fd4 | |||
| aea02f30de | |||
| 0b952af458 | |||
| 3b7fea770f | |||
| cea95dfb94 | |||
| 6a512a00df | |||
| efcf946a15 | |||
| 1230263e16 | |||
| e497b8aeff | |||
| 94144e726c | |||
| 1d5e397aa4 | |||
| 22f3a4bc6c | |||
| b1f3e18958 | |||
| 04e7c4e771 | |||
| 5faedf1b62 | |||
| 02751a7a42 | |||
| f421f3cefb | |||
| 8c054b7a62 | |||
| 6234385f4a | |||
| da1a844e61 | |||
| a1d874224d | |||
| 6cd5e5b07e | |||
| c7cb5c3335 | |||
| f9b4a2d415 | |||
| 58fcc8545a | |||
| 08287ef675 | |||
| 4ef41b8476 | |||
| cfe712bf1a | |||
| b962ee1470 | |||
| 36bf8150cc | |||
| e807125936 | |||
| 9f68e00d27 | |||
| ce2702a923 | |||
| 795b662cff | |||
| 2f707fcb35 | |||
| 41e95c5247 | |||
| 12dd715807 | |||
| 29f49cd6e3 | |||
| 23f322297f | |||
| 9db52eab3d | |||
| 1447c97e75 | |||
| de80783b69 | |||
| e5cab71531 | |||
| baa5467547 | |||
| db3bf7c991 | |||
| 2febcf2777 | |||
| 2ee45281a5 | |||
| 9da25a88aa | |||
| 8685ba1a1e | |||
| 288a938872 | |||
| e39ebf5cf5 | |||
| ba262c4e5a | |||
| 4624d98dbd | |||
| 1afc931987 | |||
| e01c2beb7d |
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5

@@ -1,6 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+# pip install lm-eval==0.4.4

 usage() {
     echo``

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.3
+# pip install lm-eval==0.4.4

 usage() {
     echo``

@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)

     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
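The change above switches the lm-eval check from a per-metric `assert` to an accumulated `success` flag, so every score is printed before the test fails. A minimal, self-contained sketch of that pattern and of how `numpy.isclose` applies the relative tolerance; the metric names, scores, and RTOL value below are made up for illustration, not taken from the CI config:

```python
import numpy

RTOL = 0.05  # hypothetical relative tolerance, for illustration only

# Stand-ins for the ground-truth config values and the lm-eval measurements.
expected = {"exact_match,strict-match": 0.764, "exact_match,flexible-extract": 0.764}
measured = {"exact_match,strict-match": 0.772, "exact_match,flexible-extract": 0.758}

success = True
for name, ground_truth in expected.items():
    value = measured[name]
    print(f"{name}: ground_truth={ground_truth} | measured={value}")
    # numpy.isclose(a, b, rtol=r) passes when |a - b| <= atol + r * |b|.
    success = success and numpy.isclose(ground_truth, value, rtol=RTOL)

# Asserting once at the end means every score is printed even if one metric drifts.
assert success
```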
@@ -8,8 +8,7 @@ steps:
       containers:
       - image: badouralix/curl-jq
         command:
-        - sh
-        - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+        - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:
.buildkite/nightly-benchmarks/nightly-annotation.md (new file, 28 lines)

@@ -0,0 +1,28 @@
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following code
+```
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+And the results will be inside `./benchmarks/results`.
+
@@ -1,45 +1,39 @@
 
 # Nightly benchmark
 
-The main goal of this benchmarking is two-fold:
-- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
-- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
+This benchmark aims to:
+- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
 
-## Docker images
+## Setup
 
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
-- vllm/vllm-openai:v0.5.0.post1
-- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-- openmmlab/lmdeploy:v0.5.0
-- ghcr.io/huggingface/text-generation-inference:2.1
-
-<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
-
-## Hardware
-
-One AWS node with 8x NVIDIA A100 GPUs.
-
-
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
-- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 500 prompts.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
-
-## Plots
-
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
-
-## Results
-
-{nightly_results_benchmarking_table}
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105))
+  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+# Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
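The workload description above says that query arrival times follow a Poisson process at a fixed average QPS with a fixed random seed. As a minimal illustration of how such a schedule can be drawn (this is a sketch, not the benchmark's actual implementation; the QPS value and seed are arbitrary), inter-arrival gaps of a Poisson process are exponential with mean 1/QPS:

```python
import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    """Cumulative arrival times (seconds) for a Poisson process at `qps` requests/s.

    Inter-arrival gaps are exponentially distributed with mean 1/qps;
    fixing the seed makes the schedule reproducible across runs.
    """
    rng = np.random.default_rng(seed)
    if qps == float("inf"):
        return np.zeros(num_requests)  # "inf" QPS: send every request immediately
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

print(poisson_arrival_times(5, qps=4))
```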
@@ -13,7 +13,7 @@ common_pod_spec: &common_pod_spec
 
 common_container_settings: &common_container_settings
   command:
-  - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
   resources:
     limits:
       nvidia.com/gpu: 8

@@ -37,7 +37,10 @@ common_container_settings: &common_container_settings
 
 steps:
 - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-- label: "A100 trt benchmark"
+
+
+
+- label: "A100 vllm step 10"
   priority: 100
   agents:
     queue: A100

@@ -46,7 +49,21 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+        - image: vllm/vllm-openai:v0.6.2
+          <<: *common_container_settings
+
+
+- label: "A100 sglang benchmark"
+  priority: 100
+  agents:
+    queue: A100
+  plugins:
+  - kubernetes:
+      podSpec:
+        <<: *common_pod_spec
+        containers:
+        - image: lmsysorg/sglang:v0.3.2-cu121
           <<: *common_container_settings
 
 - label: "A100 lmdeploy benchmark"

@@ -58,11 +75,13 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: openmmlab/lmdeploy:v0.5.0
+        - image: openmmlab/lmdeploy:v0.6.1-cu12
           <<: *common_container_settings
 
-- label: "A100 vllm benchmark"
+
+
+- label: "A100 trt llama-8B"
   priority: 100
   agents:
     queue: A100

@@ -71,10 +90,25 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: vllm/vllm-openai:latest
+        - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
           <<: *common_container_settings
+          env:
+          - name: VLLM_USAGE_SOURCE
+            value: ci-test
+          - name: HF_HOME
+            value: /root/.cache/huggingface
+          - name: VLLM_SOURCE_CODE_LOC
+            value: /workspace/build/buildkite/vllm/performance-benchmark
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: token
+          - name: TEST_SELECTOR
+            value: "llama8B"
 
-- label: "A100 tgi benchmark"
+- label: "A100 trt llama-70B"
   priority: 100
   agents:
     queue: A100

@@ -83,12 +117,54 @@ steps:
       podSpec:
         <<: *common_pod_spec
         containers:
-        - image: ghcr.io/huggingface/text-generation-inference:2.1
+        - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
           <<: *common_container_settings
+          env:
+          - name: VLLM_USAGE_SOURCE
+            value: ci-test
+          - name: HF_HOME
+            value: /root/.cache/huggingface
+          - name: VLLM_SOURCE_CODE_LOC
+            value: /workspace/build/buildkite/vllm/performance-benchmark
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: token
+          - name: TEST_SELECTOR
+            value: "llama70B"
+
+
+# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
+# - label: "A100 trt benchmark"
+#   priority: 100
+#   agents:
+#     queue: A100
+#   plugins:
+#   - kubernetes:
+#       podSpec:
+#         <<: *common_pod_spec
+#       containers:
+#       - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+#         <<: *common_container_settings
+
+
+# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+# - label: "A100 tgi benchmark"
+#   priority: 100
+#   agents:
+#     queue: A100
+#   plugins:
+#   - kubernetes:
+#       podSpec:
+#         <<: *common_pod_spec
+#       containers:
+#       - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+#         <<: *common_container_settings
+
 - wait
 
-- label: "Plot"
+- label: "Collect the results"
   priority: 100
   agents:
     queue: A100

@@ -117,4 +193,4 @@ steps:
                 name: hf-token-secret
                 key: token
 
-- wait
+- block: ":rocket: check the results!"
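The pipeline above leans on YAML anchors and merge keys: `&common_pod_spec` and `&common_container_settings` define shared blocks once, and `<<: *common_container_settings` merges them into each engine's container entry. A stripped-down sketch of just that mechanism; the script name and image names below are placeholders, not the real pipeline entries:

```yaml
# Shared block, defined once and referenced by name.
common_container_settings: &common_container_settings
  command:
  - bash run-benchmarks.sh        # placeholder command
  resources:
    limits:
      nvidia.com/gpu: 8

containers:
- image: engine-a:latest          # placeholder image
  <<: *common_container_settings  # pulls in command + resources
- image: engine-b:latest
  <<: *common_container_settings
  env:                            # keys next to the merge are simply added on top
  - name: TEST_SELECTOR
    value: "llama8B"
```

Keys declared alongside the merge (such as the `env` list above) layer on top of the shared block, which is how the TRT steps add their per-model environment variables while still reusing the common command and resource limits.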
@@ -1,76 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-set -x
-
-check_gpus() {
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-    if [[ $gpu_count -gt 0 ]]; then
-        echo "GPU found."
-    else
-        echo "Need at least 1 GPU to run benchmarking."
-        exit 1
-    fi
-    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-    echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-    # check if HF_TOKEN is available and valid
-    if [[ -z "$HF_TOKEN" ]]; then
-        echo "Error: HF_TOKEN is not set."
-        exit 1
-    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-        echo "Error: HF_TOKEN does not start with 'hf_'."
-        exit 1
-    else
-        echo "HF_TOKEN is set and valid."
-    fi
-}
-
-main() {
-
-    check_gpus
-    check_hf_token
-
-    df -h
-
-    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-    (which jq) || (apt-get update && apt-get -y install jq)
-
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-
-    # run lmdeploy
-    if which lmdeploy >/dev/null; then
-        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
-        exit 0
-    fi
-
-    # run tgi
-    if [ -e /tgi-entrypoint.sh ]; then
-        echo "tgi is available, redirect to run-tgi-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
-        exit 0
-    fi
-
-    # run trt
-    if which trtllm-build >/dev/null; then
-        echo "trtllm is available, redirect to run-trt-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
-        exit 0
-    fi
-
-    # run vllm
-    if [ -e /vllm-workspace ]; then
-        echo "vllm is available, redirect to run-vllm-nightly.sh"
-        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
-        exit 0
-    fi
-
-}
-
-main "$@"
@@ -0,0 +1,95 @@
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
+    args = parser.parse_args()
+    return args
+
+
+def get_perf(df, method, model, metric):
+
+    means = []
+
+    for qps in [2, 4, 8, 16, "inf"]:
+        target = df['Test name'].str.contains(model)
+        target = target & df['Engine'].str.contains(method)
+        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        filtered_df = df[target]
+
+        if filtered_df.empty:
+            means.append(0.)
+        else:
+            means.append(filtered_df[metric].values[0])
+
+    return np.array(means)
+
+
+def get_perf_w_std(df, method, model, metric):
+
+    if metric in ["TTFT", "ITL"]:
+        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
+        mean = mean.tolist()
+        std = get_perf(df, method, model, "Std " + metric + " (ms)")
+        if std.mean() == 0:
+            std = None
+        success = get_perf(df, method, model, "Successful req.")
+        if std is not None:
+            std = std / np.sqrt(success)
+            std = std.tolist()
+
+    else:
+        assert metric == "Tput"
+        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
+            df, method, model, "Output Tput (tok/s)")
+        mean = mean.tolist()
+        std = None
+
+    return mean, std
+
+
+def main(args):
+    results_folder = Path(args.results_folder)
+
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*_nightly_results.json"):
+        with open(test_file, "r") as f:
+            results = results + json.loads(f.read())
+
+    # generate markdown table
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+    with open(args.description, "r") as f:
+        description = f.read()
+
+    description = description.format(
+        nightly_results_benchmarking_table=md_table)
+
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
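In `get_perf_w_std` above, the error reported for TTFT and ITL is the per-request standard deviation divided by the square root of the number of successful requests, that is, the standard error of the mean. A small self-contained check of that formula; the column values here are made up, not real benchmark output:

```python
import numpy as np

# Hypothetical per-QPS statistics, standing in for the results table columns.
std_ttft_ms = np.array([12.0, 15.0, 21.0])       # "Std TTFT (ms)"
successful_requests = np.array([500, 500, 480])  # "Successful req."

# Standard error of the mean: std / sqrt(n).
sem = std_ttft_ms / np.sqrt(successful_requests)
print(sem)  # approximately [0.537, 0.671, 0.959]
```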
.buildkite/nightly-benchmarks/scripts/launch-server.sh (new file, 241 lines)
@@ -0,0 +1,241 @@
+#!/bin/bash
+
+# Currently FP8 benchmark is NOT enabled.
+
+set -x
+server_params=$1
+common_params=$2
+
+json2args() {
+    # transforms the JSON string to command line args, and '_' is replaced to '-'
+    # example:
+    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+    local json_string=$1
+    local args=$(
+        echo "$json_string" | jq -r '
+            to_entries |
+            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+            join(" ")
+        '
+    )
+    echo "$args"
+}
+
+launch_trt_server() {
+
+    model_path=$(echo "$common_params" | jq -r '.model')
+    model_name="${model_path#*/}"
+    model_type=$(echo "$server_params" | jq -r '.model_type')
+    model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+    model_tp_size=$(echo "$common_params" | jq -r '.tp')
+    max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+    max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+    max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
+    max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
+    trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+    # create model caching directory
+    cd ~
+    rm -rf models
+    mkdir -p models
+    cd models
+    models_dir=$(pwd)
+    trt_model_path=${models_dir}/${model_name}-trt-ckpt
+    trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+    # clone tensorrt backend
+    cd /
+    rm -rf tensorrtllm_backend
+    git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+    git lfs install
+    cd tensorrtllm_backend
+    git checkout $trt_llm_version
+    tensorrtllm_backend_dir=$(pwd)
+    git submodule update --init --recursive
+
+    # build trtllm engine
+    cd /tensorrtllm_backend
+    cd ./tensorrt_llm/examples/${model_type}
+    python3 convert_checkpoint.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path}
+    trtllm-build \
+        --checkpoint_dir ${trt_model_path} \
+        --use_fused_mlp \
+        --reduce_fusion disable \
+        --workers 8 \
+        --gpt_attention_plugin ${model_dtype} \
+        --gemm_plugin ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --max_batch_size ${max_batch_size} \
+        --max_input_len ${max_input_len} \
+        --max_seq_len ${max_seq_len} \
+        --max_num_tokens ${max_num_tokens} \
+        --output_dir ${trt_engine_path}
+
+    # handle triton protobuf files and launch triton server
+    cd /tensorrtllm_backend
+    mkdir triton_model_repo
+    cp -r all_models/inflight_batcher_llm/* triton_model_repo/
+    cd triton_model_repo
+    rm -rf ./tensorrt_llm/1/*
+    cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+    python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
+    python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
+    python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
+    python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
+    python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
+    cd /tensorrtllm_backend
+    python3 scripts/launch_triton_server.py \
+        --world_size=${model_tp_size} \
+        --model_repo=/tensorrtllm_backend/triton_model_repo &
+
+}
+
+launch_tgi_server() {
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    server_args=$(json2args "$server_params")
+
+    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+        echo "Key 'fp8' exists in common params."
+        server_command="/tgi-entrypoint.sh \
+            --model-id $model \
+            --num-shard $tp \
+            --port $port \
+            --quantize fp8 \
+            $server_args"
+    else
+        echo "Key 'fp8' does not exist in common params."
+        server_command="/tgi-entrypoint.sh \
+            --model-id $model \
+            --num-shard $tp \
+            --port $port \
+            $server_args"
+    fi
+
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+}
+
+launch_lmdeploy_server() {
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    server_args=$(json2args "$server_params")
+
+    server_command="lmdeploy serve api_server $model \
+        --tp $tp \
+        --server-port $port \
+        $server_args"
+
+    # run the server
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+}
+
+launch_sglang_server() {
+
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    server_args=$(json2args "$server_params")
+
+    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+        echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+        model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+        server_command="python3 \
+            -m sglang.launch_server \
+            --tp $tp \
+            --model-path $model \
+            --port $port \
+            $server_args"
+    else
+        echo "Key 'fp8' does not exist in common params."
+        server_command="python3 \
+            -m sglang.launch_server \
+            --tp $tp \
+            --model-path $model \
+            --port $port \
+            $server_args"
+    fi
+
+    # run the server
+    echo "Server command: $server_command"
+    eval "$server_command" &
+}
+
+launch_vllm_server() {
+
+    export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    server_args=$(json2args "$server_params")
+
+    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
+        echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+        model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+        server_command="python3 \
+            -m vllm.entrypoints.openai.api_server \
+            -tp $tp \
+            --model $model \
+            --port $port \
+            $server_args"
+    else
+        echo "Key 'fp8' does not exist in common params."
+        server_command="python3 \
+            -m vllm.entrypoints.openai.api_server \
+            -tp $tp \
+            --model $model \
+            --port $port \
+            $server_args"
+    fi
+
+    # run the server
+    echo "Server command: $server_command"
+    eval "$server_command" &
+}
+
+main() {
+
+    if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
+        launch_trt_server
+    fi
+
+    if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
+        launch_tgi_server
+    fi
+
+    if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
+        launch_lmdeploy_server
+    fi
+
+    if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
+        launch_sglang_server
+    fi
+
+    if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
+        launch_vllm_server
+    fi
+}
+
+main
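The `json2args` helper above converts a JSON object of server parameters into CLI flags, replacing underscores with dashes. A standalone way to try the same jq expression, assuming jq is installed; the parameter object is the example from the script's own comment:

```bash
#!/bin/bash
# Same jq transformation as json2args, applied to the sample parameter object.
json_string='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }'
args=$(
    echo "$json_string" | jq -r '
        to_entries |
        map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
        join(" ")
    '
)
echo "$args"   # --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
```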
@@ -1,102 +0,0 @@
-#!/bin/bash
-
-
-server_params=$1
-common_params=$2
-
-
-
-model_path=$(echo "$common_params" | jq -r '.model')
-model_name="${model_path#*/}"
-model_type=$(echo "$server_params" | jq -r '.model_type')
-model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
-model_tp_size=$(echo "$common_params" | jq -r '.tp')
-max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
-max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
-max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
-trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
-
-cd ~
-rm -rf models
-mkdir -p models
-cd models
-models_dir=$(pwd)
-trt_model_path=${models_dir}/${model_name}-trt-ckpt
-trt_engine_path=${models_dir}/${model_name}-trt-engine
-
-cd ~
-rm -rf tensorrt-demo
-git clone https://github.com/neuralmagic/tensorrt-demo.git
-cd tensorrt-demo
-tensorrt_demo_dir=$(pwd)
-
-# make sure the parameter inside tensorrt_demo is consistent to envvar
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
-
-
-cd /
-rm -rf tensorrtllm_backend
-git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
-git lfs install
-cd tensorrtllm_backend
-git checkout $trt_llm_version
-tensorrtllm_backend_dir=$(pwd)
-git submodule update --init --recursive
-cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
-
-cd /tensorrtllm_backend
-cd ./tensorrt_llm/examples/${model_type}
-
-
-if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
-
-    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
-    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
-    python ../quantization/quantize.py \
-        --model_dir ${model_path} \
-        --dtype ${model_dtype} \
-        --tp_size ${model_tp_size} \
-        --output_dir ${trt_model_path} \
-        --qformat fp8 \
-        --kv_cache_dtype fp8 \
-        --calib_size 2
-
-else
-
-    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
-    python3 convert_checkpoint.py \
-        --model_dir ${model_path} \
-        --dtype ${model_dtype} \
-        --tp_size ${model_tp_size} \
-        --output_dir ${trt_model_path}
-
-fi
-
-
-
-trtllm-build \
-    --checkpoint_dir=${trt_model_path} \
-    --gpt_attention_plugin=${model_dtype} \
-    --gemm_plugin=${model_dtype} \
-    --remove_input_padding=enable \
-    --paged_kv_cache=enable \
-    --tp_size=${model_tp_size} \
-    --max_batch_size=${max_batch_size} \
-    --max_input_len=${max_input_len} \
-    --max_output_len=${max_output_len} \
-    --max_num_tokens=${max_output_len} \
-    --opt_num_tokens=${max_output_len} \
-    --output_dir=${trt_engine_path}
-
-cd /tensorrtllm_backend/triton_model_repo
-rm -rf ./tensorrt_llm/1/*
-cp -r ${trt_engine_path}/* ./tensorrt_llm/1
-cd /tensorrtllm_backend
-python3 scripts/launch_triton_server.py \
-    --world_size=${model_tp_size} \
-    --model_repo=/tensorrtllm_backend/triton_model_repo &
@@ -8,6 +8,7 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
+    (which zip) || (apt-get install -y zip)
 
     if [ ! -f /workspace/buildkite-agent ]; then
         echo "buildkite-agent binary not found. Skip plotting the results."

@@ -24,17 +25,54 @@ main() {
     ls
     ls results/
 
-    # generate figures
-    python3 -m pip install tabulate pandas matplotlib
-    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-        --description $description \
-        --results-folder results/
-
-    # upload results and figures
-    /workspace/buildkite-agent artifact upload "nightly_results.png"
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+    # upload benchmark results
+    zip -r results.zip results/
+    /workspace/buildkite-agent artifact upload "results.zip"
+
+    # upload benchmarking scripts
+    cd $VLLM_SOURCE_CODE_LOC/
+    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
+    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    # upload benchmarking pipeline
+    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
+
+    cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
+
+    # The figures should be genereated by a separate process outside the CI/CD pipeline
+
+    # # generate figures
+    # python3 -m pip install tabulate pandas matplotlib
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
+    # --description $description \
+    # --results-folder results/
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    # --description $description \
+    # --results-folder results/ \
+    # --dataset sharegpt
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    # --description $description \
+    # --results-folder results/ \
+    # --dataset sonnet_2048_128
+
+    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+    # --description $description \
+    # --results-folder results/ \
+    # --dataset sonnet_128_2048
+
+    # # upload results and figures
+    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }
 
 main "$@"
@@ -1,135 +0,0 @@
-import argparse
-import json
-import math
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
-
-    args = parser.parse_args()
-    return args
-
-
-def main(args):
-    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-
-    with open(args.description, "r") as f:
-        description = f.read()
-
-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-    plt.rcParams.update({'font.size': 20})
-
-    # plot results
-    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
-    fig.subplots_adjust(hspace=1)
-    methods = ["vllm", "trt", "lmdeploy", "tgi"]
-    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
-        for j, metric in enumerate(["TTFT", "ITL"]):
-            means, stds = [], []
-            for method in methods:
-                target = df['Test name'].str.contains(model)
-                target = target & df['Engine'].str.contains(method)
-                filtered_df = df[target]
-
-                if filtered_df.empty:
-                    means.append(0.)
-                    stds.append(0.)
-                else:
-                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
-                    std = filtered_df[f"Std {metric} (ms)"].values[0]
-                    success = filtered_df["Successful req."].values[0]
-                    stds.append(std / math.sqrt(success))
-
-            print(model, metric)
-            print(means, stds)
-
-            ax = axes[i, j + 1]
-
-            bars = ax.bar(
-                ["vllm", "trt", "lmdeploy", "tgi"],
-                means,
-                yerr=stds,
-                capsize=10,
-            )
-            for idx, bar in enumerate(bars):
-                bar.set_color(bar_colors[idx])
-            ax.set_ylim(bottom=0)
-
-            ax.set_ylabel(f"{metric} (ms)")
-            ax.set_title(f"{model} {metric}")
-            ax.grid(axis='y')
-
-        metric = "Tput"
-        j = 0
-        if True:
-            tputs = []
-            for method in methods:
-                target = df['Test name'].str.contains(model)
-                target = target & df['Engine'].str.contains(method)
-                filtered_df = df[target]
-
-                if filtered_df.empty:
-                    tputs.append(0.)
-                else:
-                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
-                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
-                    tputs.append(input_tput + output_tput)
-
-            print(model, metric)
-            print(tputs)
-
-            ax = axes[i, j]
-
-            bars = ax.bar(
-                ["vllm", "trt", "lmdeploy", "tgi"],
-                tputs,
-            )
-            for idx, bar in enumerate(bars):
-                bar.set_color(bar_colors[idx])
-
-            ax.set_ylim(bottom=0)
-
-            ax.set_ylabel("Tput (token/s)")
-            ax.set_title(f"{model} {metric}")
-            ax.grid(axis='y')
-
-    fig.tight_layout()
-    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
-
-
-if __name__ == '__main__':
-    args = parse_arguments()
-    main(args)
@@ -1,218 +0,0 @@
-#!/bin/bash
-
-set -o pipefail
-
-check_gpus() {
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-    if [[ $gpu_count -gt 0 ]]; then
-        echo "GPU found."
-    else
-        echo "Need at least 1 GPU to run benchmarking."
-        exit 1
-    fi
-    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-    echo "GPU type is $gpu_type"
-}
-
-kill_gpu_processes() {
-    pkill lmdeploy || true
-    # waiting for GPU processes to be fully killed
-    sleep 10
-    # Print the GPU memory usage
-    # so that we know if all GPU processes are killed.
-    gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-    # The memory usage should be 0 MB.
-    echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-json2args() {
-    # transforms the JSON string to command line args, and '_' is replaced to '-'
-    # example:
-    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-    local json_string=$1
-    local args=$(
-        echo "$json_string" | jq -r '
-            to_entries |
-            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-            join(" ")
-        '
-    )
-    echo "$args"
-}
-
-wait_for_server() {
-    # wait for vllm server to start
-    # return 1 if vllm server crashes
-    timeout 1200 bash -c '
-        until curl -s localhost:8000/v1/completions > /dev/null; do
-            sleep 1
-        done' && return 0 || return 1
-}
-
-run_serving_tests() {
-    # run serving tests using `benchmark_serving.py`
-    # $1: a json file specifying serving test cases
-
-    local serving_test_file
-    serving_test_file=$1
-
-    # Iterate over serving tests
-    jq -c '.[]' "$serving_test_file" | while read -r params; do
-        # get the test name, and append the GPU type back to it.
-        test_name=$(echo "$params" | jq -r '.test_name')
-
-        # if TEST_SELECTOR is set, only run the test cases that match the selector
-        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-            echo "Skip test case $test_name."
-            continue
-        fi
-
-        # append lmdeploy to the test name
-        test_name=lmdeploy_$test_name
-
-        # get common parameters
-        common_params=$(echo "$params" | jq -r '.common_parameters')
-        model=$(echo "$common_params" | jq -r '.model')
-        tp=$(echo "$common_params" | jq -r '.tp')
-        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
-        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
-        port=$(echo "$common_params" | jq -r '.port')
-        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
-
-
-
-        # get client and server arguments
-        server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
-        client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
-        server_args=$(json2args "$server_params")
-        client_args=$(json2args "$client_params")
-        qps_list=$(echo "$params" | jq -r '.qps_list')
-        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-        echo "Running over qps list $qps_list"
-
-        # check if there is enough GPU to run the test
-        if [[ $gpu_count -lt $tp ]]; then
-            echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-            continue
-        fi
-
-        # prepare tokenizer
-        rm -rf /tokenizer_cache
-        mkdir /tokenizer_cache
-        python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-            --model "$model" \
-            --cachedir /tokenizer_cache
-
-        server_command="lmdeploy serve api_server $model \
-            --tp $tp \
-            --server-port $port \
-            $server_args"
-
-        # run the server
-        echo "Running test case $test_name"
-        echo "Server command: $server_command"
-        bash -c "$server_command" &
-
-        # wait until the server is alive
-        wait_for_server
-        if [ $? -eq 0 ]; then
-            echo ""
-            echo "lmdeploy server is up and running."
-        else
-            echo ""
-            echo "lmdeploy failed to start within the timeout period."
-            break
-        fi
-
-        # get model name
-        model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
-
-        # iterate over different QPS
-        for qps in $qps_list; do
-            # remove the surrounding single quote from qps
-            if [[ "$qps" == *"inf"* ]]; then
-                echo "qps was $qps"
-                qps="inf"
-                echo "now qps is $qps"
-            fi
-
-            new_test_name=$test_name"_qps_"$qps
-
-            client_command="python3 benchmark_serving.py \
-                --backend lmdeploy \
-                --tokenizer /tokenizer_cache \
-                --dataset-name $dataset_name \
-                --dataset-path $dataset_path \
-                --num-prompts $num_prompts \
-                --port $port \
-                --save-result \
-                --result-dir $RESULTS_FOLDER \
-                --result-filename ${new_test_name}.json \
-                --request-rate $qps \
-                --model \"$model_name\" \
-                $client_args"
-
-            echo "Running test case $test_name with qps $qps"
-            echo "Client command: $client_command"
-
-            eval "$client_command"
-
-            # record the benchmarking commands
-            jq_output=$(jq -n \
-                --arg server "$server_command" \
-                --arg client "$client_command" \
-                --arg gpu "$gpu_type" \
-                --arg engine "lmdeploy" \
-                '{
-                    server_command: $server,
-                    client_command: $client,
-                    gpu_type: $gpu,
-                    engine: $engine
-                }')
-            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
-        done
-
-        # clean up
-        kill_gpu_processes
-        rm -rf /root/.cache/huggingface/*
-    done
-}
-
-
-upload_to_buildkite() {
-    # upload the benchmarking results to buildkite
-
-    # if the agent binary is not found, skip uploading the results, exit 0
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip uploading the results."
-        return 0
-    fi
-    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
-    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-
-main() {
-
-    check_gpus
-    # enter vllm directory
-    cd $VLLM_SOURCE_CODE_LOC/benchmarks
-
-    declare -g RESULTS_FOLDER=results/
-    mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
python -m pip install transformers==4.41.2
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
python -m pip install tabulate pandas
|
|
||||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh (new file, 357 lines)
@@ -0,0 +1,357 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

get_current_llm_serving_engine() {
  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi

  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi

  if which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi

  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi

  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
    return
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}
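As a quick illustration of the helper above (the JSON literal below is only an example mirroring the comment in the function, not one of the configured test cases), json2args flattens a parameter object into CLI flags:

# Illustrative only; reproduces the example given in the comments of json2args.
server_params='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }'
json2args "$server_params"
# prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1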
kill_gpu_processes() {
  pkill -f python
  pkill -f python3
  pkill -f tritonserver
  pkill -f pt_main_thread
  pkill -f text-generation
  pkill -f lmdeploy

  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
    sleep 1
  done
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}
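wait_for_server blocks for up to 1200 seconds while polling the completions route on port 8000; a minimal sketch of a call site guarding on its exit status (run_serving_tests below does the equivalent):

# Illustrative call site only.
if wait_for_server; then
  echo "server came up"
else
  echo "server did not come up within 1200 seconds"
fi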
ensure_installed() {
  # Ensure that the given command is installed by apt-get
  local cmd=$1
  if ! which $cmd >/dev/null; then
    apt-get update && apt-get install -y $cmd
  fi
}
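ensure_installed assumes an apt-get based (Debian/Ubuntu style) base image; main below uses it to pull in the benchmarking prerequisites only when they are missing:

# Install the prerequisites only if they are missing (no-op otherwise).
ensure_installed wget
ensure_installed curl
ensure_installed jq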
run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
        "$server_params" "$common_params"
    fi

    wait_for_server

    if [ $? -eq 0 ]; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # prepare tokenizer
    # this is required for lmdeploy.
    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd $VLLM_SOURCE_CODE_LOC/benchmarks

    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      elif [[ "$dataset_name" = "sonnet" ]]; then

        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      else

        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1

      fi

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

  done

  kill_gpu_processes
}

prepare_dataset() {

  # download sharegpt dataset
  cd $VLLM_SOURCE_CODE_LOC/benchmarks
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  cd $VLLM_SOURCE_CODE_LOC/benchmarks
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done

}
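A quick, optional sanity check of the file produced by prepare_dataset (not part of the script itself): sonnet_4x.txt should hold roughly four times the lines of sonnet.txt, plus the leading blank line written by echo.

# Optional check, illustrative only.
wc -l sonnet.txt sonnet_4x.txt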
main() {

  # check if the environment variable is successfully injected from yaml

  check_gpus
  check_hf_token
  get_current_llm_serving_engine

  pip install -U transformers

  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq

  prepare_dataset

  cd $VLLM_SOURCE_CODE_LOC/benchmarks
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/

  # run the test
  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json

  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
  upload_to_buildkite
}

main "$@"
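For reference, a hypothetical invocation of this script inside one of the serving-engine containers; TEST_SELECTOR and VLLM_SOURCE_CODE_LOC are the environment variables the script actually reads, but the concrete values here are placeholders and not taken from the pipeline definition.

# Hypothetical example only: run just the ShareGPT test cases.
export VLLM_SOURCE_CODE_LOC=/workspace/vllm
TEST_SELECTOR="sharegpt" bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh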
@@ -1,216 +0,0 @@ (file deleted)
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
pkill text-generation || true
|
|
||||||
# waiting for GPU processes to be fully killed
|
|
||||||
sleep 10
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append tgi to the test name
|
|
||||||
test_name=tgi_$test_name
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
--quantize fp8 \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="/tgi-entrypoint.sh \
|
|
||||||
--model-id $model \
|
|
||||||
--num-shard $tp \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "tgi server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "tgi failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend tgi \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "tgi" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
python -m pip install tabulate pandas
|
|
||||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -1,214 +0,0 @@ (file deleted)
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
pkill tritonserver || true
|
|
||||||
# waiting for GPU processes to be fully killed
|
|
||||||
sleep 20
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append trt to the test name
|
|
||||||
test_name=trt_$test_name
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "trt server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "trt failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# prepare tokenizer
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
rm -rf /tokenizer_cache
|
|
||||||
mkdir /tokenizer_cache
|
|
||||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
|
||||||
--model "$model" \
|
|
||||||
--cachedir /tokenizer_cache
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend tensorrt-llm \
|
|
||||||
--tokenizer /tokenizer_cache \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
server_command=""
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "trt" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
|
|
||||||
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
# update transformers package, to make sure mixtral tokenizer is available
|
|
||||||
python -m pip install transformers -U
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
python -m pip install tabulate pandas
|
|
||||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -1,221 +0,0 @@ (file deleted)
#!/bin/bash
|
|
||||||
|
|
||||||
set -o pipefail
|
|
||||||
|
|
||||||
check_gpus() {
|
|
||||||
# check the number of GPUs and GPU type.
|
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
|
||||||
echo "GPU found."
|
|
||||||
else
|
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
kill_gpu_processes() {
|
|
||||||
# kill all processes on GPU.
|
|
||||||
pkill pt_main_thread
|
|
||||||
sleep 10
|
|
||||||
|
|
||||||
# remove vllm config file
|
|
||||||
rm -rf ~/.config/vllm
|
|
||||||
|
|
||||||
# Print the GPU memory usage
|
|
||||||
# so that we know if all GPU processes are killed.
|
|
||||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
|
||||||
# The memory usage should be 0 MB.
|
|
||||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
|
||||||
}
|
|
||||||
|
|
||||||
json2args() {
|
|
||||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
|
||||||
# example:
|
|
||||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
|
||||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
# wait for vllm server to start
|
|
||||||
# return 1 if vllm server crashes
|
|
||||||
timeout 1200 bash -c '
|
|
||||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done' && return 0 || return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
run_serving_tests() {
|
|
||||||
# run serving tests using `benchmark_serving.py`
|
|
||||||
# $1: a json file specifying serving test cases
|
|
||||||
|
|
||||||
local serving_test_file
|
|
||||||
serving_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over serving tests
|
|
||||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# append vllm to the test name
|
|
||||||
test_name=vllm_$test_name
|
|
||||||
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
|
|
||||||
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
|
|
||||||
server_args=$(json2args "$server_params")
|
|
||||||
client_args=$(json2args "$client_params")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
|
||||||
server_command="python3 \
|
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
-tp $tp \
|
|
||||||
--model $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
else
|
|
||||||
echo "Key 'fp8' does not exist in common params."
|
|
||||||
server_command="python3 \
|
|
||||||
-m vllm.entrypoints.openai.api_server \
|
|
||||||
-tp $tp \
|
|
||||||
--model $model \
|
|
||||||
--port $port \
|
|
||||||
$server_args"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# run the server
|
|
||||||
echo "Running test case $test_name"
|
|
||||||
echo "Server command: $server_command"
|
|
||||||
eval "$server_command" &
|
|
||||||
|
|
||||||
# wait until the server is alive
|
|
||||||
wait_for_server
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "vllm server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "vllm failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps="inf"
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
|
|
||||||
client_command="python3 benchmark_serving.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model $model \
|
|
||||||
--dataset-name $dataset_name \
|
|
||||||
--dataset-path $dataset_path \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
--port $port \
|
|
||||||
--save-result \
|
|
||||||
--result-dir $RESULTS_FOLDER \
|
|
||||||
--result-filename ${new_test_name}.json \
|
|
||||||
--request-rate $qps \
|
|
||||||
$client_args"
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
# record the benchmarking commands
|
|
||||||
jq_output=$(jq -n \
|
|
||||||
--arg server "$server_command" \
|
|
||||||
--arg client "$client_command" \
|
|
||||||
--arg gpu "$gpu_type" \
|
|
||||||
--arg engine "vllm" \
|
|
||||||
'{
|
|
||||||
server_command: $server,
|
|
||||||
client_command: $client,
|
|
||||||
gpu_type: $gpu,
|
|
||||||
engine: $engine
|
|
||||||
}')
|
|
||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
||||||
# clean up
|
|
||||||
kill_gpu_processes
|
|
||||||
rm -rf /root/.cache/huggingface/*
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
upload_to_buildkite() {
|
|
||||||
# upload the benchmarking results to buildkite
|
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
|
||||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
|
||||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
|
||||||
}
|
|
||||||
|
|
||||||
main() {
|
|
||||||
|
|
||||||
check_gpus
|
|
||||||
# enter vllm directory
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
|
||||||
declare -g RESULTS_FOLDER=results/
|
|
||||||
mkdir -p $RESULTS_FOLDER
|
|
||||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
|
||||||
|
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
|
||||||
|
|
||||||
python3 -m pip install tabulate pandas
|
|
||||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
|
||||||
upload_to_buildkite
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
@@ -17,10 +17,17 @@ serving_column_mapping = {
     "request_throughput": "Tput (req/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
     "std_ttft_ms": "Std TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "std_itl_ms": "Std ITL (ms)",
-    "input_throughput": "Input Tput (tok/s)",
+    "median_itl_ms": "Median ITL (ms)",
+    "mean_tpot_ms": "Mean TPOT (ms)",
+    "std_tpot_ms": "Std TPOT (ms)",
+    "median_tpot_ms": "Median TPOT (ms)",
+    "total_token_throughput": "Total Token Tput (tok/s)",
     "output_throughput": "Output Tput (tok/s)",
+    "total_input_tokens": "Total input tokens",
+    "total_output_tokens": "Total output tokens",
     "engine": "Engine",
 }
@@ -2,9 +2,11 @@
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

+TIMEOUT_SECONDS=10
+
 retries=0
 while [ $retries -lt 1000 ]; do
-  if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+  if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
     exit 0
   fi
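The --max-time flag added above bounds each registry probe so a hung connection cannot stall the retry loop. Standalone, the bounded probe behaves like this sketch, where the manifest tag is a placeholder rather than a real commit:

# Illustrative only; "some-commit" is a placeholder tag.
code=$(curl -s --max-time 10 -L -o /dev/null -w "%{http_code}" "https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/some-commit")
echo "HTTP status: $code (000 usually means a timeout or connection failure)"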
@@ -1,16 +1,18 @@
 [
     {
-        "test_name": "llama8B_tp1",
+        "test_name": "llama8B_tp1_sharegpt",
-        "qps_list": [4],
+        "qps_list": [4,8,16,32,"inf"],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,
-            "port": 8000
+            "port": 8000,
+            "reuse_server": false
         },
         "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
         },
         "lmdeploy_client_parameters": {
         },
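For context, the qps_list values above are consumed by run-nightly-benchmarks.sh through jq's @sh filter, which leaves numbers bare and single-quotes the string entry; a small illustration:

# Illustrative only: how the benchmark script expands a qps_list entry.
echo '[4,8,16,32,"inf"]' | jq -r '.[] | @sh'
# prints 4, 8, 16, 32 and 'inf' on separate lines;
# the quotes around 'inf' are normalized away by the script before use.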
@ -21,34 +23,158 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "llama70B_tp4",
|
"test_name": "llama8B_tp1_sonnet_512_16",
|
||||||
"qps_list": [2],
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
|
"tp": 1,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 16,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama8B_tp1_sonnet_512_256",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
||||||
|
"tp": 1,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 256,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"enable_torch_compile": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama70B_tp4_sharegpt",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
"common_parameters": {
|
"common_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
"tp": 4,
|
"tp": 4,
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 500,
|
"num_prompts": 500,
|
||||||
"port": 8000
|
"port": 8000,
|
||||||
|
"reuse_server": false
|
||||||
},
|
},
|
||||||
"lmdeploy_server_parameters": {
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"lmdeploy_client_parameters": {
|
"lmdeploy_client_parameters": {
|
||||||
},
|
},
|
||||||
@ -59,34 +185,50 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "mixtral8x7B_tp2",
|
"test_name": "llama70B_tp4_sonnet_512_16",
|
||||||
"qps_list": [2],
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
"common_parameters": {
|
"common_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
"tp": 2,
|
"tp": 4,
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sonnet",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
"num_prompts": 500,
|
"num_prompts": 500,
|
||||||
"port": 8000
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 16,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
},
|
},
|
||||||
"lmdeploy_server_parameters": {
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"lmdeploy_client_parameters": {
|
"lmdeploy_client_parameters": {
|
||||||
},
|
},
|
||||||
@ -97,20 +239,85 @@
|
|||||||
},
|
},
|
||||||
"trt_server_parameters": {
|
"trt_server_parameters": {
|
||||||
"model_type": "llama",
|
"model_type": "llama",
|
||||||
"model_dtype": "float16",
|
"model_dtype": "bfloat16",
|
||||||
"max_batch_size": 256,
|
"max_batch_size": 2048,
|
||||||
"max_input_len": 4096,
|
"max_input_len": 4096,
|
||||||
"max_output_len": 4096,
|
"max_seq_len": 6144,
|
||||||
"trt_llm_version": "r24.04"
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
},
|
},
|
||||||
"trt_client_parameters": {
|
"trt_client_parameters": {
|
||||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
},
|
},
|
||||||
"vllm_server_parameters": {
|
"vllm_server_parameters": {
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": ""
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
},
|
},
|
||||||
"vllm_client_parameters": {
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama70B_tp4_sonnet_512_256",
|
||||||
|
"qps_list": [4,8,16,32,"inf"],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tp": 4,
|
||||||
|
"dataset_name": "sonnet",
|
||||||
|
"dataset_path": "./sonnet_4x.txt",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000,
|
||||||
|
"sonnet_input_len": 512,
|
||||||
|
"sonnet_output_len": 256,
|
||||||
|
"sonnet_prefix_len": 50,
|
||||||
|
"reuse_server": true
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "bfloat16",
|
||||||
|
"max_batch_size": 2048,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_seq_len": 6144,
|
||||||
|
"max_num_tokens": 16384,
|
||||||
|
"trt_llm_version": "v0.11.0"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"gpu_memory_utilization": 0.9,
|
||||||
|
"num_scheduler_steps": 10,
|
||||||
|
"max_num_seqs": 512,
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
},
|
||||||
|
"sglang_server_parameters": {
|
||||||
|
"disable_radix_cache": "",
|
||||||
|
"dtype": "bfloat16"
|
||||||
|
},
|
||||||
|
"sglang_client_parameters": {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -3,13 +3,14 @@ steps:
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+      - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
     env:
       DOCKER_BUILDKIT: "1"
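The wheel-renaming step in the hunk above is plain bash parameter expansion; the doubled `$$` only keeps Buildkite from interpolating the variable at pipeline-upload time, so the job shell sees a normal `$`. A minimal standalone sketch of the same rename, using a made-up wheel name purely for illustration:

    # Sketch only: the wheel file below is a placeholder, not a real artifact.
    mkdir -p artifacts/dist
    touch artifacts/dist/vllm-0.0.0.dev-cp38-abi3-linux_x86_64.whl

    # Same mechanism as the pipeline step: ${name/pattern/replacement}
    # rewrites "linux" to "manylinux1" inside each wheel's file name.
    for f in artifacts/dist/*.whl; do
        mv -- "$f" "${f/linux/manylinux1}"
    done

    ls artifacts/dist   # vllm-0.0.0.dev-cp38-abi3-manylinux1_x86_64.whl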
@@ -21,7 +22,7 @@ steps:
     agents:
       queue: cpu_queue
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
@@ -71,13 +71,47 @@ mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"
 
 commands=$@
+echo "Commands:$commands"
+#ignore certain kernels tests
+if [[ $commands == *" kernels "* ]]; then
+  commands="${commands} \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_gguf.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py"
+fi
+
+#ignore certain Entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
     #replace shard arguments
-    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    echo "Shard ${GPU} commands:$commands"
     docker run \
         --device /dev/kfd --device /dev/dri \
         --network host \
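The shard fan-out above works by string substitution on the captured test command; this hunk's fix swaps `${@//...}` (which expands the positional parameters again) for `${commands//...}` so the per-GPU shard id is spliced into the already-assembled command. A reduced sketch of that mechanism, with `echo` standing in for the real pytest invocation:

    # Sketch only: "echo" is a stand-in for the actual test command line.
    commands='echo pytest -v -s lora --shard-id= --num-shards= '
    PARALLEL_JOB_COUNT=8

    if [[ $commands == *"--shard-id="* ]]; then
        for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
            # ${var//pattern/replacement} replaces every occurrence, filling in
            # this shard's id and the total shard count for the current GPU.
            sharded=${commands//"--shard-id= "/"--shard-id=${GPU} "}
            sharded=${sharded//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
            echo "Shard ${GPU} commands:$sharded"
        done
    fi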
.buildkite/run-cpu-test-ppc64le.sh (new executable file, 39 lines)
@@ -0,0 +1,39 @@
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+source /etc/environment
+#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
+
+# Run basic model test
+docker exec cpu-test bash -c "
+  pip install pytest matplotlib einops transformers_stream_generator
+  pytest -v -s tests/models -m \"not vlm\" \
+  --ignore=tests/models/test_embedding.py \
+  --ignore=tests/models/test_oot_registration.py \
+  --ignore=tests/models/test_registry.py \
+  --ignore=tests/models/test_jamba.py \
+  --ignore=tests/models/test_mamba.py \
+  --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"
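The new ppc64le script follows the same container-lifecycle pattern as the other CPU test scripts: register a cleanup handler before the container starts, then poll the OpenAI-compatible endpoint until the server answers. A trimmed sketch of just that pattern; the image and container names here are placeholders, not ones used by the CI:

    #!/bin/bash
    # Sketch only: "my-test-image" and "cpu-test-sketch" are assumed names.
    set -ex

    remove_docker_container() { docker rm -f cpu-test-sketch || true; }
    trap remove_docker_container EXIT   # always clean up, even if a step fails
    remove_docker_container             # drop any leftover container first

    docker run -itd --network host --name cpu-test-sketch my-test-image

    # Block for up to 10 minutes until the server inside the container responds.
    timeout 600 bash -c 'until curl -s localhost:8000/v1/models; do sleep 1; done' || exit 1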
@@ -22,13 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-  --ignore=tests/models/test_oot_registration.py \
-  --ignore=tests/models/test_registry.py \
-  --ignore=tests/models/test_fp8.py \
-  --ignore=tests/models/test_jamba.py \
-  --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pytest -v -s tests/models/encoder_decoder/language
+  pytest -v -s tests/models/decoder_only/language \
+  --ignore=tests/models/test_fp8.py \
+  --ignore=tests/models/decoder_only/language/test_jamba.py \
+  --ignore=tests/models/decoder_only/language/test_mamba.py \
+  --ignore=tests/models/decoder_only/language/test_granitemoe.py \
+  --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# Run compressed-tensor test
+# docker exec cpu-test bash -c "
+#   pytest -s -v \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+
+# Run AWQ test
+docker exec cpu-test bash -c "
+  pytest -s -v \
+  tests/quantization/test_ipex_quant.py"
+
 # online inference
 docker exec cpu-test bash -c "
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,17 +40,20 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html
 
-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
+  - tests/mq_llm_engine
   - tests/async_engine
   - tests/test_inputs
   - tests/multimodal
   - tests/test_utils
   - tests/worker
   commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -60,14 +64,22 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/
-  - tests/basic_correctness
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
   commands:
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+
 - label: Core Test # 10min
   mirror_hardwares: [amd]
   fast_check: true
@@ -76,22 +88,29 @@ steps:
   - vllm/distributed
   - tests/core
   commands:
-  - pytest -v -s core
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
+  - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
 
-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
-  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
@@ -102,7 +121,9 @@ steps:
   - vllm/core/
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile
   commands:
+  - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -130,7 +151,9 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/test_regression
-  command: pytest -v -s test_regression.py
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: Engine Test # 10min
@@ -144,7 +167,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -158,42 +181,20 @@ steps:
   - python3 offline_inference_with_prefix.py
   - python3 llm_engine_example.py
   - python3 offline_inference_vision_language.py
+  - python3 offline_inference_vision_language_multi_image.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
-- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s ./compile/test_full_graph.py
-  - pytest -v -s ./compile/test_wrapper.py
-
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s models -m vlm
-
-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
-  - pytest -v -s prefix_caching
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
+  - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
 
-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -209,16 +210,16 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py
 
-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
   commands:
-  # See https://github.com/vllm-project/vllm/issues/5152
-  - export VLLM_ATTENTION_BACKEND=XFORMERS
-  - pytest -v -s spec_decode
+  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
 
-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
@@ -226,7 +227,25 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
 
-- label: Kernels Test %N # 30min each
+- label: "PyTorch Fullgraph Smoke Test" # 9min
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_basic_correctness.py
+
+# TODO: re-write in comparison tests, and fix symbolic shape
+# for quantization ops.
+# - label: "PyTorch Fullgraph Test" # 18min
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/compile
+#   commands:
+#   - pytest -v -s compile/test_full_graph.py
+
+- label: Kernels Test %N # 1h each
+  mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
   - vllm/attention
@@ -255,12 +274,12 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh
 
-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  command: pytest -v -s quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pip install lm-eval
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
|
- label: Encoder Decoder tests # 5min
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/encoder_decoder
|
||||||
|
commands:
|
||||||
|
- pytest -v -s encoder_decoder
|
||||||
|
|
||||||
- label: OpenAI-Compatible Tool Use # 20 min
|
- label: OpenAI-Compatible Tool Use # 20 min
|
||||||
fast_check: false
|
fast_check: false
|
||||||
mirror_hardwares: [ amd ]
|
mirror_hardwares: [ amd ]
|
||||||
@ -281,6 +306,56 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
|
||||||
|
##### models test #####
|
||||||
|
|
||||||
|
- label: Basic Models Test # 3min
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models
|
||||||
|
commands:
|
||||||
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
|
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
||||||
|
|
||||||
|
- label: Decoder-only Language Models Test # 1h36min
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/decoder_only/language
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/decoder_only/language
|
||||||
|
|
||||||
|
- label: Decoder-only Multi-Modal Models Test # 1h31min
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/decoder_only/audio_language
|
||||||
|
- tests/models/decoder_only/vision_language
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/decoder_only/audio_language
|
||||||
|
- pytest -v -s models/decoder_only/vision_language
|
||||||
|
|
||||||
|
- label: Other Models Test # 6min
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/embedding/language
|
||||||
|
- tests/models/encoder_decoder/language
|
||||||
|
- tests/models/encoder_decoder/vision_language
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/embedding/language
|
||||||
|
- pytest -v -s models/encoder_decoder/language
|
||||||
|
- pytest -v -s models/encoder_decoder/vision_language
|
||||||
|
|
||||||
|
# This test is used only in PR development phase to test individual models and should never run on main
|
||||||
|
- label: Custom Models Test
|
||||||
|
optional: true
|
||||||
|
commands:
|
||||||
|
- echo 'Testing custom models...'
|
||||||
|
# PR authors can temporarily add commands below to test individual models
|
||||||
|
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||||
|
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
@@ -306,13 +381,13 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
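Piping the torchrun output into `grep -q 'Same node test passed'` turns a log marker into the step's exit status: `grep -q` exits 0 only when the marker is present, and the pipeline's overall status is that of its last command, so a run that never prints the marker now fails the CI step. A small sketch of the mechanism with a stand-in command:

    # Sketch only: "run_test" stands in for the real torchrun invocation.
    run_test() { echo "Same node test passed"; }

    if run_test | grep -q 'Same node test passed'; then
        echo "step succeeds: marker found in the output"
    else
        echo "step fails: marker missing, grep -q returned non-zero" >&2
        exit 1
    fi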
@@ -322,19 +397,23 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - vllm/compilation
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -352,7 +431,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py
 
-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -368,6 +447,7 @@ steps:
 - label: LoRA Long Context (Distributed) # 11min
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   num_gpus: 4
+  soft_fail: true
   source_file_dependencies:
   - vllm/lora
   - tests/lora/test_long_context
@@ -377,14 +457,25 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
 
 ##### multi gpus test #####
@@ -399,7 +490,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: LM Eval Large Models # optional
@@ -410,6 +501,5 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pip install lm-eval
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-large.txt -t 4
@@ -1,4 +1,34 @@
-vllm/*.so
+/.github/
 /.venv
 /build
 dist
+vllm/*.so
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.mypy_cache
+
+# Distribution / packaging
+.Python
+/build/
+cmake-build-*/
+CMakeUserPresets.json
+develop-eggs/
+/dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
.github/CODEOWNERS (new file, 30 lines)
@@ -0,0 +1,30 @@
+# See https://help.github.com/articles/about-codeowners/
+# for more info about CODEOWNERS file
+
+# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+CMakeLists.txt @tlrmchlsmth @WoosukKwon
+
+# Test ownership
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
+/tests/models @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96
+/tests/prefix_caching @comaniac @KuntaiDu
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
+/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
+/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/distributed/test_pipeline_parallel.py @youkaichao
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-neuralmagic @comaniac
+/tests/weight_loading @mgoin @youkaichao
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
.github/ISSUE_TEMPLATE/400-bug report.yml (9 changed lines)
@@ -30,6 +30,15 @@ body:
     </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug
.github/PULL_REQUEST_TEMPLATE.md (10 changed lines)
@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
 <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
 </ul>
 
+<h3>Adding or changing kernels</h3>
+<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
+<ul>
+<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
+<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
+<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
+<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
+<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
+</ul>
+
 <h3>Notes for Large Changes</h3>
 <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
.github/dependabot.yml (new file, 7 lines)
@@ -0,0 +1,7 @@
+version: 2
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
.github/workflows/actionlint.yml (new file, 37 lines)
@@ -0,0 +1,37 @@
+name: Lint GitHub Actions workflows
+on:
+  push:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - '.github/workflows/*.ya?ml'
+      - '.github/workflows/actionlint.*'
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          fetch-depth: 0
+
+      - name: "Run actionlint"
+        run: |
+          tools/actionlint.sh -color
.github/workflows/add_label_automerge.yml (2 changed lines)
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Add label
-        uses: actions/github-script@v5
+        uses: actions/github-script@v7
         with:
          script: |
            github.rest.issues.addLabels({
.github/workflows/clang-format.yml (4 changed lines)
@@ -17,9 +17,9 @@ jobs:
       matrix:
         python-version: ["3.11"]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
.github/workflows/matchers/actionlint.json (new file, 17 lines)
@@ -0,0 +1,17 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "actionlint",
+      "pattern": [
+        {
+          "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$",
+          "file": 1,
+          "line": 2,
+          "column": 3,
+          "message": 4,
+          "code": 5
+        }
+      ]
+    }
+  ]
+}
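For reference, the `regexp` above is shaped to capture `file:line:column: message [rule]` diagnostics while tolerating optional ANSI color escapes around each field. The line below is a made-up example of that shape, shown only to decode the capture groups (file=1, line=2, column=3, message=4, code=5); it is not real actionlint output:

    # Hypothetical diagnostic in the file:line:col: message [code] shape the matcher expects.
    echo '.github/workflows/example.yml:12:9: property "foo" is not defined [syntax-check]'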
.github/workflows/mypy.yaml (19 changed lines)
@@ -11,15 +11,15 @@ on:
     - main
 
 jobs:
-  ruff:
+  mypy:
     runs-on: ubuntu-latest
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
@@ -32,15 +32,4 @@ jobs:
        pip install types-setuptools
    - name: Mypy
      run: |
-        mypy
-        mypy tests --follow-imports skip
-        mypy vllm/attention --follow-imports skip
-        mypy vllm/distributed --follow-imports skip
-        mypy vllm/engine --follow-imports skip
-        mypy vllm/executor --follow-imports skip
-        mypy vllm/lora --follow-imports skip
-        mypy vllm/model_executor --follow-imports skip
-        mypy vllm/prompt_adapter --follow-imports skip
-        mypy vllm/spec_decode --follow-imports skip
-        mypy vllm/worker --follow-imports skip
+        tools/mypy.sh
.github/workflows/publish.yml (16 changed lines)
@@ -21,16 +21,16 @@ jobs:
       upload_url: ${{ steps.create_release.outputs.upload_url }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Extract branch info
         shell: bash
         run: |
-          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
 
       - name: Create Release
         id: create_release
-        uses: "actions/github-script@v6"
+        uses: "actions/github-script@v7"
         env:
           RELEASE_TAG: ${{ env.release_tag }}
         with:
@@ -54,7 +54,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2
@@ -68,7 +68,7 @@ jobs:
           bash -x .github/workflows/scripts/env.sh
 
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
          python-version: ${{ matrix.python-version }}
 
@@ -86,10 +86,10 @@ jobs:
          CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
-          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
          asset_name=${wheel_name//"linux"/"manylinux1"}
-          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
-          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+          echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+          echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
 
      - name: Upload Release Asset
        uses: actions/upload-release-asset@v1
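The switch to `find ... -print0 | xargs -0` makes the wheel lookup whitespace-safe: unlike `ls dist/*whl`, the NUL-delimited pipeline never word-splits a file name. A standalone sketch of the changed lines, using a throwaway directory and a placeholder wheel name:

    # Sketch only: creates a scratch dist/ directory to demonstrate the pattern.
    mkdir -p dist
    touch "dist/example-1.0.0-cp38-abi3-linux_x86_64.whl"

    # NUL-delimited so odd characters in the file name cannot break the pipeline.
    wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
    asset_name=${wheel_name//"linux"/"manylinux1"}
    echo "wheel_name=${wheel_name}"
    echo "asset_name=${asset_name}"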
.github/workflows/reminder_comment.yml (2 changed lines)
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Remind to run full CI on PR
-        uses: actions/github-script@v6
+        uses: actions/github-script@v7
         with:
          script: |
            github.rest.issues.createComment({
.github/workflows/ruff.yml (8 changed lines)
@@ -17,18 +17,18 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install -r requirements-lint.txt
    - name: Analysing the code with ruff
      run: |
-        ruff .
+        ruff check .
    - name: Spelling check with codespell
      run: |
        codespell --toml pyproject.toml
.github/workflows/scripts/build.sh (4 changed lines)
@@ -8,12 +8,12 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 
 # Install requirements
-$python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements-cuda.txt
+$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
.github/workflows/yapf.yml (4 changed lines)
@@ -16,9 +16,9 @@ jobs:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
.gitignore (13 changed lines)
@@ -1,5 +1,8 @@
-# vllm commit id, generated by setup.py
-vllm/commit_id.py
+# version file generated by setuptools-scm
+/vllm/_version.py
+
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -12,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/
@@ -28,6 +33,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+/.deps/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
@@ -193,3 +199,6 @@ hip_compat.h
 
 # Benchmark dataset
 benchmarks/*.json
+
+# Linting
+actionlint
@@ -13,10 +13,10 @@ sphinx:
   fail_on_warning: true
 
 # If using Sphinx, optionally build your docs in additional formats such as PDF
-formats:
-  - pdf
+formats: []
 
 # Optionally declare the Python requirements required to build your docs
 python:
   install:
   - requirements: docs/requirements-docs.txt
CMakeLists.txt (403 changed lines)
@@ -1,5 +1,16 @@
 cmake_minimum_required(VERSION 3.26)
+
+# When building directly using CMake, make sure you run the install step
+# (it places the .so files in the correct location).
+#
+# Example:
+#   mkdir build && cd build
+#   cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+#   cmake --build . --target install
+#
+# If you want to only build one target, make sure to install it manually:
+#   cmake --build . --target _C
+#   cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
 
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")
 
+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
 find_package(Torch REQUIRED)
 
-#
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture. This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
 message(STATUS "Enabling core extension.")
 
 # Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-add_dependencies(default _core_C)
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -144,14 +143,32 @@ else()
   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
 endif()
 
-#
-# Override the GPU architectures detected by cmake/torch and filter them by
-# the supported versions for the current language.
-# The final set of arches is stored in `VLLM_GPU_ARCHES`.
-#
-override_gpu_arches(VLLM_GPU_ARCHES
-  ${VLLM_GPU_LANG}
-  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  #
+  # For cuda we want to be able to control which architectures we compile for on
+  # a per-file basis in order to cut down on compile time. So here we extract
+  # the set of architectures we want to compile for and remove the from the
+  # CMAKE_CUDA_FLAGS so that they are not applied globally.
+  #
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS
+    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+else()
+  #
+  # For other GPU targets override the GPU architectures detected by cmake/torch
+  # and filter them by the supported versions for the current language.
+  # The final set of arches is stored in `VLLM_GPU_ARCHES`.
+  #
+  override_gpu_arches(VLLM_GPU_ARCHES
+    ${VLLM_GPU_LANG}
+    "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+endif()
 
 #
 # Query torch for additional GPU compilation flags for the given
@@ -167,6 +184,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+
+#
+# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
+# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
|
||||||
|
#
|
||||||
|
include(FetchContent)
|
||||||
|
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
|
||||||
|
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
|
||||||
|
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
|
||||||
|
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Define other extension targets
|
# Define other extension targets
|
||||||
#
|
#
|
||||||
@ -181,7 +209,6 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
@ -191,14 +218,21 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/torch_bindings.cpp")
|
"csrc/torch_bindings.cpp")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(FetchContent)
|
|
||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
|
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||||
|
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# CUTLASS 3.5.1
|
GIT_TAG v3.5.1
|
||||||
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
|
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
|
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
|
||||||
|
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
|
||||||
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(cutlass)
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
@ -207,78 +241,165 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||||
"csrc/quantization/awq/gemm_kernels.cu"
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
|
||||||
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
|
||||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
|
|
||||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
|
||||||
"csrc/custom_all_reduce.cu"
|
"csrc/custom_all_reduce.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
"csrc/permute_cols.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
|
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
|
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||||
|
|
||||||
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
|
# are not supported by Machete yet.
|
||||||
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
|
||||||
|
if (MARLIN_ARCHS)
|
||||||
|
set(MARLIN_SRCS
|
||||||
|
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||||
|
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||||
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
|
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
||||||
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
|
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${MARLIN_SRCS}"
|
||||||
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
|
||||||
|
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# The CUTLASS kernels for Hopper require sm90a to be enabled.
|
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
||||||
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
|
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
|
||||||
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
|
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
set_source_files_properties(
|
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
set_gencode_flags_for_srcs(
|
||||||
PROPERTIES
|
SRCS "${SRCS}"
|
||||||
COMPILE_FLAGS
|
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||||
"-gencode arch=compute_90a,code=sm_90a")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
|
||||||
|
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||||
|
else()
|
||||||
|
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
|
||||||
|
# build any 3x kernels
|
||||||
|
set(SCALED_MM_3X_ARCHS)
|
||||||
|
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
|
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
|
||||||
|
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||||
|
"later if you intend on running FP8 quantized models on "
|
||||||
|
"Hopper.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||||
|
# kernels for the remaining archs that are not already built for 3x.
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||||
|
"7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}")
|
||||||
|
# subtract out the archs that are already built for 3x
|
||||||
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
|
if (SCALED_MM_2X_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
|
||||||
|
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (SCALED_MM_3X_ARCHS)
|
||||||
|
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
|
||||||
|
" for and covered by scaled_mm_c3x")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
|
|
||||||
# The machete kernels only work on hopper and require CUDA 12.0 or later.
|
# The machete kernels only work on hopper and require CUDA 12.0 or later.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
# Only build Machete kernels if we are building for something compatible with sm90a
|
||||||
|
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
|
||||||
#
|
#
|
||||||
# For the Machete kernels we automatically generate sources for various
|
# For the Machete kernels we automatically generate sources for various
|
||||||
# preselected input type pairs and schedules.
|
# preselected input type pairs and schedules.
|
||||||
# Generate sources:
|
# Generate sources:
|
||||||
execute_process(
|
set(MACHETE_GEN_SCRIPT
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
|
||||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
|
||||||
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
|
|
||||||
RESULT_VARIABLE machete_generation_result
|
|
||||||
OUTPUT_VARIABLE machete_generation_output
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT machete_generation_result EQUAL 0)
|
message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
|
||||||
message(FATAL_ERROR "Machete generation failed."
|
message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
|
||||||
" Result: \"${machete_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
|
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
|
||||||
|
execute_process(
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E env
|
||||||
|
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
||||||
|
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
|
||||||
|
RESULT_VARIABLE machete_generation_result
|
||||||
|
OUTPUT_VARIABLE machete_generation_output
|
||||||
|
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
||||||
|
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
||||||
|
)
|
||||||
|
|
||||||
|
if (NOT machete_generation_result EQUAL 0)
|
||||||
|
message(FATAL_ERROR "Machete generation failed."
|
||||||
|
" Result: \"${machete_generation_result}\""
|
||||||
|
"\nCheck the log for details: "
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
|
||||||
|
else()
|
||||||
|
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
|
||||||
|
CACHE STRING "Last run machete generate script hash" FORCE)
|
||||||
|
message(STATUS "Machete generation completed successfully.")
|
||||||
|
endif()
|
||||||
else()
|
else()
|
||||||
message(STATUS "Machete generation completed successfully.")
|
message(STATUS "Machete generation script has not changed, skipping generation.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Add machete generated sources
|
# Add machete generated sources
|
||||||
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
|
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
|
||||||
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
|
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
|
||||||
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
|
|
||||||
|
|
||||||
set_source_files_properties(
|
# forward compatible
|
||||||
${MACHETE_GEN_SOURCES}
|
set_gencode_flags_for_srcs(
|
||||||
PROPERTIES
|
SRCS "${MACHETE_GEN_SOURCES}"
|
||||||
COMPILE_FLAGS
|
CUDA_ARCHS "${MACHETE_ARCHS}")
|
||||||
"-gencode arch=compute_90a,code=sm_90a")
|
|
||||||
|
list(APPEND VLLM_EXT_SRC
|
||||||
|
csrc/quantization/machete/machete_pytorch.cu)
|
||||||
|
|
||||||
|
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
|
||||||
|
AND MACHETE_ARCHS)
|
||||||
|
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||||
|
"later if you intend on running w4a16 quantized models on "
|
||||||
|
"Hopper.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building Machete kernels as no compatible archs "
|
||||||
|
"found in CUDA target architectures")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
# if CUDA endif
|
||||||
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
|
|
||||||
# raise an error if the user that this was built with an incompatible
|
|
||||||
# CUDA version)
|
|
||||||
list(APPEND VLLM_EXT_SRC
|
|
||||||
csrc/quantization/machete/machete_pytorch.cu)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
message(STATUS "Enabling C extension.")
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_C
|
_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
@ -290,6 +411,12 @@ define_gpu_extension_target(
|
|||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
|
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
|
||||||
|
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
|
||||||
|
# driver API. This causes problems when linking with earlier versions of CUDA.
|
||||||
|
# Setting this variable sidesteps the issue by calling the driver directly.
|
||||||
|
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||||
|
|
||||||
#
|
#
|
||||||
# _moe_C extension
|
# _moe_C extension
|
||||||
#
|
#
|
||||||
@ -298,11 +425,36 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
"csrc/moe/torch_bindings.cpp"
|
"csrc/moe/torch_bindings.cpp"
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${VLLM_MOE_EXT_SRC}"
|
||||||
|
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
list(APPEND VLLM_MOE_EXT_SRC
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
"csrc/moe/marlin_moe_ops.cu")
|
if (MARLIN_MOE_ARCHS)
|
||||||
|
set(MARLIN_MOE_SRC
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||||
|
"csrc/moe/marlin_moe_ops.cu")
|
||||||
|
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${MARLIN_MOE_SRC}"
|
||||||
|
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||||
|
|
||||||
|
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||||
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
message(STATUS "Enabling moe extension.")
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_moe_C
|
_moe_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
@ -313,13 +465,96 @@ define_gpu_extension_target(
|
|||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||||
|
#
|
||||||
|
# _rocm_C extension
|
||||||
|
#
|
||||||
|
set(VLLM_ROCM_EXT_SRC
|
||||||
|
"csrc/rocm/torch_bindings.cpp"
|
||||||
|
"csrc/rocm/attention.cu")
|
||||||
|
|
||||||
|
define_gpu_extension_target(
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
_rocm_C
|
||||||
message(STATUS "Enabling C extension.")
|
DESTINATION vllm
|
||||||
add_dependencies(default _C)
|
LANGUAGE ${VLLM_GPU_LANG}
|
||||||
|
SOURCES ${VLLM_ROCM_EXT_SRC}
|
||||||
message(STATUS "Enabling moe extension.")
|
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||||
add_dependencies(default _moe_C)
|
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||||
|
USE_SABI 3
|
||||||
|
WITH_SOABI)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# vllm-flash-attn currently only supported on CUDA
|
||||||
|
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
|
||||||
|
return()
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||||
|
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||||
|
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||||
|
# we need to manually set VLLM_GPU_ARCHES here.
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
foreach(_ARCH ${CUDA_ARCHS})
|
||||||
|
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||||
|
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Build vLLM flash attention from source
|
||||||
|
#
|
||||||
|
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||||
|
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
|
||||||
|
# They should be identical but if they aren't, this is a massive footgun.
|
||||||
|
#
|
||||||
|
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||||
|
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
|
||||||
|
# If no component is specified, vllm-flash-attn is still installed.
|
||||||
|
|
||||||
|
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||||
|
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||||
|
# It can be set as an environment variable or passed as a cmake argument.
|
||||||
|
# The environment variable takes precedence.
|
||||||
|
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||||
|
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
else()
|
||||||
|
FetchContent_Declare(
|
||||||
|
vllm-flash-attn
|
||||||
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
|
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
|
||||||
|
set(VLLM_PARENT_BUILD ON)
|
||||||
|
|
||||||
|
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||||
|
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
|
||||||
|
|
||||||
|
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
|
||||||
|
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
|
||||||
|
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
|
||||||
|
|
||||||
|
# Fetch the vllm-flash-attn library
|
||||||
|
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||||
|
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||||
|
|
||||||
|
# Restore the install prefix
|
||||||
|
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
|
||||||
|
|
||||||
|
# Copy over the vllm-flash-attn python files
|
||||||
|
install(
|
||||||
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
|
DESTINATION vllm/vllm_flash_attn
|
||||||
|
COMPONENT vllm_flash_attn_c
|
||||||
|
FILES_MATCHING PATTERN "*.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Nothing after vllm-flash-attn, see comment about macros above
|
||||||
|
128 CODE_OF_CONDUCT.md (new file)
@@ -0,0 +1,128 @@
+# vLLM Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socioeconomic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official email address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline/IRL event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement in the #code-of-conduct
+channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
+version 2.1, available at
+[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
+
+For answers to common questions about this code of conduct, see the
+[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
+[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
+
CONTRIBUTING.md
@@ -1,30 +1,23 @@
 # Contributing to vLLM

-Thank you for your interest in contributing to vLLM!
-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
-There are several ways you can contribute to the project:
+Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:

 - Identify and report any issues or bugs.
-- Request or add a new model.
+- Request or add support for a new model.
 - Suggest or implement new features.
+- Improve documentation or contribute a how-to guide.

-However, remember that contributions aren't just about code.
-We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.

-Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
-Talk about it in your blog posts, highlighting how it's driving your incredible projects.
-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!

-## Setup for development
+## Developing

-### Build from source
-
-```bash
-pip install -e . # This may take several minutes.
-```
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.

-### Testing
+## Testing

 ```bash
 pip install -r requirements-dev.txt
@@ -36,15 +29,16 @@ mypy
 # Unit tests
 pytest tests/
 ```
-**Note:** Currently, the repository does not pass the mypy tests.
+**Note:** Currently, the repository does not pass the ``mypy`` tests.

-## Contributing Guidelines
+## Contribution Guidelines
+
+### Issues

-### Issue Reporting
-
-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
-If not, please file a new issue, providing as much relevant information as possible.
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
+
+> [!IMPORTANT]
+> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).

 ### Pull Requests & Code Reviews

@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
 ### Thank You

 Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-Your contributions make vLLM a great tool for everyone!
+All of your contributions help make vLLM a great tool and community for everyone!
39 Dockerfile
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
@@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
 && python3 --version && python3 -m pip --version

+# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
+# as it was causing spam when compiling the CUTLASS kernels
+RUN apt-get install -y gcc-10 g++-10
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
+RUN <<EOF
+gcc --version
+EOF
+
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
@@ -37,7 +45,6 @@ WORKDIR /workspace

 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
 python3 -m pip install -r requirements-cuda.txt
@@ -49,6 +56,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+# Override the arch list for flash-attn to reduce the binary size
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################

 #################### WHEEL BUILD IMAGE ####################
@@ -61,15 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 python3 -m pip install -r requirements-build.txt

 # files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
+COPY . .

 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -78,14 +80,13 @@ ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

-ARG buildkite_commit
-ENV BUILDKITE_COMMIT=${buildkite_commit}
-
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
+--mount=type=bind,source=.git,target=.git \
 if [ "$USE_SCCACHE" = "1" ]; then \
 echo "Installing sccache..." \
 && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@@ -94,6 +95,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
 && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
 && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
+&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
 && export SCCACHE_IDLE_TIMEOUT=0 \
 && export CMAKE_BUILD_TYPE=Release \
 && sccache --show-stats \
@@ -104,6 +106,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
 --mount=type=cache,target=/root/.cache/pip \
+--mount=type=bind,source=.git,target=.git \
 if [ "$USE_SCCACHE" != "1" ]; then \
 python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
 fi
@@ -133,9 +136,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive

@@ -147,6 +150,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
 && apt-get update -y \
 && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 && add-apt-repository ppa:deadsnakes/ppa \
 && apt-get update -y \
 && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
@@ -170,6 +174,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+COPY examples examples
 #################### vLLM installation IMAGE ####################


@@ -199,7 +204,7 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-pip install accelerate hf_transfer 'modelscope!=1.15.0'
+pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10

 ENV VLLM_USAGE_SOURCE production-docker-image

Dockerfile.cpu
@@ -2,9 +2,14 @@

 FROM ubuntu:22.04 AS cpu-test-1

+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
 RUN --mount=type=cache,target=/var/cache/apt \
 apt-get update -y \
 && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@@ -17,14 +22,30 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch==2.4.0

-ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
 --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
 pip install --upgrade pip && \
 pip install -r requirements-build.txt
+
+# install oneDNN
+RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+
+RUN --mount=type=cache,target=/root/.cache/ccache \
+cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
+-DONEDNN_BUILD_DOC=OFF \
+-DONEDNN_BUILD_EXAMPLES=OFF \
+-DONEDNN_BUILD_TESTS=OFF \
+-DONEDNN_BUILD_GRAPH=OFF \
+-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
+-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
+cmake --build ./oneDNN/build --target install --config Release

 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm
@@ -40,11 +61,12 @@ COPY ./ ./
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/pip \
 --mount=type=cache,target=/root/.cache/ccache \
+--mount=type=bind,source=.git,target=.git \
 VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-pip install dist/*.whl
+pip install dist/*.whl && \
+rm -rf dist

 WORKDIR /workspace/

Dockerfile.neuron
@@ -1,12 +1,17 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

 FROM $BASE_IMAGE

 RUN echo "Base image is $BASE_IMAGE"

 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update && \
+apt-get install -y \
+git \
+python3 \
+python3-pip \
+ffmpeg libsm6 libxext6 libgl1

 ### Mount Point ###
 # When launching the container, mount the code directory to /app
@@ -18,19 +23,19 @@ RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

-COPY ./vllm /app/vllm/vllm
-COPY ./setup.py /app/vllm/setup.py
-COPY ./requirements-common.txt /app/vllm/requirements-common.txt
-COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+COPY . /app/vllm

 RUN cd /app/vllm \
-&& python3 -m pip install -U -r requirements-neuron.txt
+&& python3 -m pip install -U \
+cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+-r requirements-neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
-RUN cd /app/vllm \
-&& pip install -e . \
+RUN --mount=type=bind,source=.git,target=.git \
+cd /app/vllm \
+&& pip install --no-build-isolation -v -e . \
 && cd ..

 CMD ["/bin/bash"]
Dockerfile.openvino
@@ -4,19 +4,12 @@
 FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
-apt-get install -y python3-pip git
+apt-get install -y \
+git python3-pip \
+ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
-COPY setup.py /workspace/vllm/
+COPY . .

 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
Dockerfile.ppc64le
@@ -2,21 +2,32 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root

-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
+
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN --mount=type=cache,target=/root/.cache/pip \
+pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+torch==2.3.1 \
+-r requirements-cpu.txt \
+xformers uvloop==0.20.0

-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+RUN --mount=type=bind,source=.git,target=.git \
+VLLM_TARGET_DEVICE=cpu python3 setup.py install

-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,5 +1,5 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+# Default ROCm 6.2 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"

 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
@@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 # Whether to install CK-based flash-attention
 # If 0, will not install flash-attention
 ARG BUILD_FA="1"
-# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
-# If this succeeds, we use the downloaded wheel and skip building flash-attention.
-# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
-# architectures specified in `FA_GFX_ARCHS`
-ARG TRY_FA_WHEEL="1"
-ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="23a2b1c2"
+ARG FA_BRANCH="3cea2fb"

 # Whether to build triton on rocm
 ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e0fc12c"
+ARG TRITON_BRANCH="e192dba"

 ### Base image build stage
 FROM $BASE_IMAGE AS base
@@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
 # Remove sccache so it doesn't interfere with ccache
 # TODO: implement sccache support across components
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.5.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
+# Install torch == 2.6.0 on ROCm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.2"*) \
             python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240726 \
-                torchvision==0.20.0.dev20240726 \
-                --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+            && python3 -m pip install --pre \
+                torch==2.6.0.dev20240918 \
+                setuptools-scm>=8 \
+                torchvision==0.20.0.dev20240918 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
         *) ;; esac

 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
 ### Flash-Attention wheel build stage
 FROM base AS build_fa
 ARG BUILD_FA
-ARG TRY_FA_WHEEL
-ARG FA_WHEEL_URL
 ARG FA_GFX_ARCHS
 ARG FA_BRANCH
 # Build ROCm flash-attention wheel if `BUILD_FA = 1`
 RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_FA" = "1" ]; then \
-        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
-            # If a suitable wheel exists, we download it instead of building FA
-            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
-        else \
-            mkdir -p libs \
-            && cd libs \
-            && git clone https://github.com/ROCm/flash-attention.git \
-            && cd flash-attention \
-            && git checkout "${FA_BRANCH}" \
-            && git submodule update --init \
-            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-        fi; \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/ROCm/flash-attention.git \
+        && cd flash-attention \
+        && git checkout "${FA_BRANCH}" \
+        && git submodule update --init \
+        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
     # Create an empty directory otherwise as later build stages expect one
     else mkdir -p /install; \
     fi
@@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_TRITON" = "1" ]; then \
         mkdir -p libs \
         && cd libs \
+        && python3 -m pip install ninja cmake wheel pybind11 \
         && git clone https://github.com/OpenAI/triton.git \
         && cd triton \
         && git checkout "${TRITON_BRANCH}" \
@@ -129,7 +120,7 @@ COPY . .

 # Package upgrades for useful functionality or to avoid dependency issues
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard

 # Workaround for ray >= 2.10.0
@@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

 RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=bind,source=.git,target=.git \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -Ur requirements-rocm.txt \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
-            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
-            # Prevent interference if torch bundles its own HIP runtime
-            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
-        *) ;; esac \
     && python3 setup.py clean --all \
     && python3 setup.py develop
@@ -4,14 +4,26 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
 FROM $BASE_IMAGE
 WORKDIR /workspace

+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg libsm6 libxext6 libgl1
+
 # Install the TPU and Pallas dependencies.
-RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
-RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    cd /workspace/vllm && \
+    python3 -m pip install \
+        cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+        -r requirements-tpu.txt
 RUN cd /workspace/vllm && python3 setup.py develop

 CMD ["/bin/bash"]
@@ -1,22 +1,55 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
     chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
     wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

-RUN apt-get update -y \
-    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+RUN apt-get update -y && \
+    apt-get install -y --no-install-recommends --fix-missing \
+    curl \
+    ffmpeg \
+    git \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    libgl1 \
+    lsb-release \
+    numactl \
+    python3 \
+    python3-dev \
+    python3-pip \
+    # vim \
+    wget
+
+WORKDIR /workspace/vllm
+COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
+COPY requirements-common.txt /workspace/vllm/requirements-common.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+    -r requirements-xpu.txt

 COPY ./ /workspace/vllm

-WORKDIR /workspace/vllm
+ENV VLLM_TARGET_DEVICE=xpu

-RUN pip install -v -r requirements-xpu.txt
-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    python3 setup.py install

 CMD ["/bin/bash"]
+
+FROM vllm-base AS vllm-openai
+
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+
+ENV VLLM_USAGE_SOURCE production-docker-image \
+    TRITON_XPU_PROFILE 1
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,5 +1,4 @@
 include LICENSE
-include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt
README.md (25 changes)
@@ -10,22 +10,14 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

----
-
-**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
-
-We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
-Join us to hear the vLLM's recent update about performance.
-Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
-
----
-
 *Latest News* 🔥
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
+- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
 - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
 - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
 - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
@@ -50,7 +42,7 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

-**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.

 vLLM is flexible and easy to use with:

@@ -130,3 +122,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
   year={2023}
 }
 ```
+
+## Contact Us
+
+* For technical questions and feature requests, please use Github issues or discussions.
+* For discussing with fellow users, please use Discord.
+* For security disclosures, please use Github's security advisory feature.
+* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
SECURITY.md (new file, 11 lines)
@@ -0,0 +1,11 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
+
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+
+---
+
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@@ -23,7 +23,9 @@ class RequestFuncInput:
     output_len: int
     model: str
     best_of: int = 1
-    use_beam_search: bool = False
+    logprobs: Optional[int] = None
+    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False


 @dataclass
@@ -46,13 +48,13 @@ async def async_request_tgi(
     assert api_url.endswith("generate_stream")

     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
             "do_sample": True,
             "temperature": 0.01,  # TGI does not accept 0.0 temperature.
             "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            # TGI does not accept ignore_eos flag.
         }
         payload = {
             "inputs": request_func_input.prompt,
@@ -117,7 +119,6 @@ async def async_request_trt_llm(
     assert api_url.endswith("generate_stream")

     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -127,6 +128,8 @@ async def async_request_trt_llm(
             "max_tokens": request_func_input.output_len,
             "stream": True,
         }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len

@@ -181,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search

         payload = {
             "prompt": request_func_input.prompt,
@@ -229,14 +231,15 @@ async def async_request_openai_completions(
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         payload = {
             "model": request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
             "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@@ -309,18 +312,21 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
         payload = {
             "model": request_func_input.model,
             "messages": [
                 {
                     "role": "user",
-                    "content": request_func_input.prompt,
+                    "content": content
                 },
             ],
             "temperature": 0.0,
             "max_tokens": request_func_input.output_len,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Content-Type": "application/json",
@@ -424,4 +430,5 @@ ASYNC_REQUEST_FUNCS = {
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
 }
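
The new RequestFuncInput fields above (logprobs, multi_modal_content, ignore_eos) are what the serving benchmark uses to drive multi-modal requests through the OpenAI-compatible chat backend. Below is a minimal sketch of how such a request object is assembled and dispatched; the prompt, model name, image bytes, and server URL are illustrative placeholders, not values from this change.

```python
import base64

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

# Placeholder JPEG bytes; the real benchmark derives these from a HF dataset image.
image_base64 = base64.b64encode(b"<jpeg bytes>").decode("utf-8")

request = RequestFuncInput(
    prompt="Describe this image.",                        # placeholder prompt
    api_url="http://localhost:8000/v1/chat/completions",  # assumed local server
    prompt_len=4,
    output_len=64,
    model="my-model",                                      # placeholder model name
    logprobs=None,
    ignore_eos=True,
    multi_modal_content={
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    },
)

# Only the "openai-chat" backend accepts multi_modal_content; it appends the
# dict to the text content list, as the diff above shows.
request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
# Inside an event loop: output = await request_func(request_func_input=request)
```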
@@ -10,8 +10,8 @@ import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptInputs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
+from vllm.inputs import PromptType
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):

     sampling_params = SamplingParams(
         n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
         top_p=1.0,
-        use_beam_search=args.use_beam_search,
         ignore_eos=True,
         max_tokens=args.output_len,
     )
@@ -61,7 +60,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_inputs: List[PromptInputs] = [{
+    dummy_prompts: List[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]

@@ -74,13 +73,13 @@ def main(args: argparse.Namespace):
                 ],
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
                     str(profile_dir))) as p:
-            llm.generate(dummy_inputs,
+            llm.generate(dummy_prompts,
                          sampling_params=sampling_params,
                          use_tqdm=False)
         print(p.key_averages())
     else:
         start_time = time.perf_counter()
-        llm.generate(dummy_inputs,
+        llm.generate(dummy_prompts,
                      sampling_params=sampling_params,
                      use_tqdm=False)
         end_time = time.perf_counter()
@@ -205,13 +204,11 @@ if __name__ == '__main__':
         default=None,
         help=('path to save the pytorch profiler output. Can be visualized '
               'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-        'CPU.')
+    parser.add_argument("--device",
+                        type=str,
+                        default="auto",
+                        choices=DEVICE_OPTIONS,
+                        help='device type for vLLM execution')
     parser.add_argument('--block-size',
                         type=int,
                         default=16,
@@ -224,7 +221,9 @@ if __name__ == '__main__':
     parser.add_argument("--enable-prefix-caching",
                         action='store_true',
                         help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        default=EngineArgs.use_v2_block_manager)
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
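
The PromptInputs to PromptType rename above tracks a vLLM input-API change; the latency benchmark keeps passing token-id prompts as plain dicts. A minimal sketch of that calling pattern is below, assuming a small model purely for illustration (the token ids are arbitrary):

```python
from vllm import LLM, SamplingParams

# Token-id prompts in the dict form benchmark_latency.py builds.
dummy_prompts = [
    {"prompt_token_ids": [101, 2009, 2003, 1037, 3231]},
    {"prompt_token_ids": [101, 2178, 6251, 2005, 12324]},
]

sampling_params = SamplingParams(n=1,
                                 temperature=1.0,
                                 top_p=1.0,
                                 ignore_eos=True,
                                 max_tokens=16)

llm = LLM(model="facebook/opt-125m")  # placeholder model choice
outputs = llm.generate(dummy_prompts,
                       sampling_params=sampling_params,
                       use_tqdm=False)
# Each output carries the generated token ids for its request.
print([len(o.outputs[0].token_ids) for o in outputs])
```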
@@ -33,6 +33,7 @@ from typing import List, Optional, Tuple
 from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser

 try:
@@ -113,7 +114,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
 def main(args):
     tokenizer = get_tokenizer(args.model, trust_remote_code=True)
     input_length_range = tuple(map(int, args.input_length_range.split(':')))
+    random.seed(args.seed)
     if args.dataset_path is not None:
         print(f"Start to sample {args.num_prompts} prompts"
               "from {args.dataset_path}")
@@ -177,6 +178,7 @@ if __name__ == "__main__":
                         help='enable prefix caching')
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
+                        default=EngineArgs.use_v2_block_manager,
                         help='Use BlockSpaceMangerV2')
     parser.add_argument('--num-prompts',
                         type=int,
@@ -194,5 +196,9 @@ if __name__ == "__main__":
                         default='128:256',
                         help='Range of input lengths for sampling prompts,'
                         'specified as "min:max" (e.g., "128:256").')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=0,
+                        help='Random seed for reproducibility')
     args = parser.parse_args()
     main(args)
benchmarks/benchmark_prioritization.py (new file, 293 lines)
@@ -0,0 +1,293 @@
+"""Benchmark offline prioritization."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+
+        #Select a equi-probable random priority
+        priority = 0 if random.random() < 0.5 else 1
+
+        filtered_dataset.append((prompt, prompt_len, output_len, priority))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
+) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        disable_log_stats=False,
+    )
+
+    # Add the requests to the engine.
+    prompts = []
+    sampling_params = []
+    priority = []
+    for prompt, _, output_len, _priority in requests:
+        prompts.append(prompt)
+        priority.append(_priority)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
+
+    start = time.perf_counter()
+    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+
+    if args.backend == "vllm":
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.trust_remote_code,
+                                args.dtype, args.max_model_len,
+                                args.enforce_eager, args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
+                                args.enable_prefix_caching,
+                                args.enable_chunked_prefill,
+                                args.max_num_batched_tokens,
+                                args.gpu_memory_utilization, args.download_dir)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len, priority in requests)
+    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=200,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+
+    main(args)
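
The heart of the new benchmark file is the `priority` keyword that `run_vllm()` forwards to `LLM.generate`. A stripped-down sketch of that call is shown below; the prompts and model are placeholders, and the `scheduling_policy="priority"` engine argument is an assumption about how priority scheduling is switched on rather than something this file sets.

```python
import random

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]  # placeholders
# Mirror the benchmark: each request gets an equi-probable priority of 0 or 1.
priorities = [0 if random.random() < 0.5 else 1 for _ in prompts]
sampling_params = [
    SamplingParams(n=1, temperature=1.0, top_p=1.0, ignore_eos=True,
                   max_tokens=32) for _ in prompts
]

llm = LLM(model="facebook/opt-125m",      # placeholder model
          scheduling_policy="priority")   # assumed engine flag for priority scheduling
outputs = llm.generate(prompts, sampling_params,
                       priority=priorities, use_tqdm=True)
```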
@@ -1,4 +1,4 @@
-"""Benchmark online serving throughput.
+r"""Benchmark online serving throughput.

 On the server side, run one of the following commands:
     vLLM OpenAI API server
@@ -24,6 +24,8 @@ On the client side, run:
 """
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import random
@@ -31,11 +33,13 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                   RequestFuncOutput)
+from datasets import load_dataset
+from PIL.Image import Image
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

@@ -84,11 +88,9 @@ def sample_sharegpt_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
+) -> List[Tuple[str, int, int, None]]:
     # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
@@ -113,13 +115,13 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+        filtered_dataset.append((prompt, prompt_len, output_len, None))

     return filtered_dataset

@@ -131,13 +133,13 @@ def sample_sonnet_requests(
     output_len: int,
     prefix_len: int,
     tokenizer: PreTrainedTokenizerBase,
-) -> List[Tuple[str, str, int, int]]:
+) -> List[Tuple[str, str, int, int, None]]:
     assert (
         input_len > prefix_len
     ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

     # Load the dataset.
-    with open(dataset_path) as f:
+    with open(dataset_path, encoding='utf-8') as f:
         poem_lines = f.readlines()

     # Tokenize the poem lines.
@@ -174,9 +176,9 @@ def sample_sonnet_requests(
     # Sample the rest of lines per request.
     sampled_requests: List[Tuple[str, int, int]] = []
     for _ in range(num_requests):
-        sampled_lines = "".join(
-            prefix_lines +
-            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+        num_lines_needed = num_input_lines - num_prefix_lines
+        sampled_lines = "".join(prefix_lines +
+                                random.choices(poem_lines, k=num_lines_needed))

         prompt = f"{base_prompt}{sampled_lines}"
         message = [
@@ -189,14 +191,81 @@ def sample_sonnet_requests(
             message, add_generation_prompt=True, tokenize=False)
         prompt_len = len(tokenizer(prompt_formatted).input_ids)
         sampled_requests.append(
-            (prompt, prompt_formatted, prompt_len, output_len))
+            (prompt, prompt_formatted, prompt_len, output_len, None))
+
+    return sampled_requests
+
+
+def sample_hf_requests(
+    dataset_path: str,
+    dataset_subset: str,
+    dataset_split: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+    dataset = load_dataset(dataset_path,
+                           name=dataset_subset,
+                           split=dataset_split,
+                           streaming=True)
+    assert "conversations" in dataset.features, (
+        "HF Dataset must have 'conversations' column.")
+    filtered_dataset = dataset.shuffle().filter(
+        lambda x: len(x["conversations"]) >= 2)
+    sampled_requests: List[Tuple[str, int, int, Dict[str,
+                                                     Collection[str]]]] = []
+    for data in filtered_dataset:
+        if len(sampled_requests) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = data["conversations"][0]["value"]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = data["conversations"][1]["value"]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
+            # Prune too short sequences.
+            continue
+        if fixed_output_len is None and \
+            (prompt_len > 1024 or prompt_len + output_len > 2048):
+            # Prune too long sequences.
+            continue
+
+        if "image" in data and isinstance(data["image"], Image):
+            image: Image = data["image"]
+            image = image.convert("RGB")
+            image_data = io.BytesIO()
+            image.save(image_data, format='JPEG')
+            image_base64 = base64.b64encode(
+                image_data.getvalue()).decode("utf-8")
+            mm_content = {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{image_base64}"
+                },
+            }
+        else:
+            mm_content = None
+
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))

     return sampled_requests


 def sample_random_requests(
-        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
-        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+    prefix_len: int,
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    prefix_token_ids = np.random.randint(0,
+                                         tokenizer.vocab_size,
+                                         size=prefix_len).tolist()

     input_lens = np.random.randint(
         int(input_len * range_ratio),
@@ -211,10 +280,12 @@ def sample_random_requests(
     offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
     input_requests = []
     for i in range(num_prompts):
-        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
-                                   for j in range(input_lens[i])])
-        input_requests.append(
-            (prompt, int(input_lens[i]), int(output_lens[i])))
+        prompt = tokenizer.decode(prefix_token_ids +
+                                  [(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append((prompt, int(prefix_len + input_lens[i]),
+                               int(output_lens[i]), None))

     return input_requests

@@ -318,13 +389,14 @@ async def benchmark(
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
+    logprobs: Optional[int],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: List[str],
     selected_percentiles: List[str],
+    ignore_eos: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -332,15 +404,22 @@ async def benchmark(
         raise ValueError(f"Unknown backend: {backend}")

     print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+        input_requests[0])
+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend.")
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
-        use_beam_search=use_beam_search,
+        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
     )
     test_output = await request_func(request_func_input=test_input)
     if not test_output.success:
@@ -358,8 +437,9 @@ async def benchmark(
             api_url=base_url + "/start_profile",
             prompt_len=test_prompt_len,
             output_len=test_output_len,
+            logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
+            multi_modal_content=test_mm_content,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -372,15 +452,16 @@ async def benchmark(
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
-        prompt, prompt_len, output_len = request
+        prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
+            multi_modal_content=mm_content,
         )
         tasks.append(
             asyncio.create_task(
@@ -396,8 +477,8 @@ async def benchmark(
             api_url=base_url + "/stop_profile",
             prompt_len=test_prompt_len,
             output_len=test_output_len,
+            logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -455,7 +536,7 @@ async def benchmark(
         # E.g., "Time to First Token"
         metric_header: str,
     ):
-        # This function print and add statistics of the specified
+        # This function prints and adds statistics of the specified
         # metric.
         if metric_attribute_name not in selected_percentile_metrics:
             return
@@ -541,9 +622,9 @@ def main(args: argparse.Namespace):
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
             )
-            input_requests = [(prompt, prompt_len, output_len)
+            input_requests = [(prompt, prompt_len, output_len, None)
                               for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
         else:
             assert (
                 tokenizer.chat_template or tokenizer.default_chat_template
@@ -556,12 +637,23 @@ def main(args: argparse.Namespace):
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
             )
-            input_requests = [(prompt_formatted, prompt_len, output_len)
+            input_requests = [(prompt_formatted, prompt_len, output_len, None)
                               for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
+
+    elif args.dataset_name == "hf":
+        input_requests = sample_hf_requests(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.hf_output_len,
+        )

     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
+            prefix_len=args.random_prefix_len,
             input_len=args.random_input_len,
|
input_len=args.random_input_len,
|
||||||
output_len=args.random_output_len,
|
output_len=args.random_output_len,
|
||||||
num_prompts=args.num_prompts,
|
num_prompts=args.num_prompts,
|
||||||
@ -580,8 +672,8 @@ def main(args: argparse.Namespace):
|
|||||||
model_id=model_id,
|
model_id=model_id,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
input_requests=input_requests,
|
input_requests=input_requests,
|
||||||
|
logprobs=args.logprobs,
|
||||||
best_of=args.best_of,
|
best_of=args.best_of,
|
||||||
use_beam_search=args.use_beam_search,
|
|
||||||
request_rate=args.request_rate,
|
request_rate=args.request_rate,
|
||||||
disable_tqdm=args.disable_tqdm,
|
disable_tqdm=args.disable_tqdm,
|
||||||
profile=args.profile,
|
profile=args.profile,
|
||||||
@ -589,6 +681,7 @@ def main(args: argparse.Namespace):
|
|||||||
selected_percentiles=[
|
selected_percentiles=[
|
||||||
float(p) for p in args.metric_percentiles.split(",")
|
float(p) for p in args.metric_percentiles.split(",")
|
||||||
],
|
],
|
||||||
|
ignore_eos=args.ignore_eos,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
@ -602,7 +695,6 @@ def main(args: argparse.Namespace):
|
|||||||
result_json["model_id"] = model_id
|
result_json["model_id"] = model_id
|
||||||
result_json["tokenizer_id"] = tokenizer_id
|
result_json["tokenizer_id"] = tokenizer_id
|
||||||
result_json["best_of"] = args.best_of
|
result_json["best_of"] = args.best_of
|
||||||
result_json["use_beam_search"] = args.use_beam_search
|
|
||||||
result_json["num_prompts"] = args.num_prompts
|
result_json["num_prompts"] = args.num_prompts
|
||||||
|
|
||||||
# Metadata
|
# Metadata
|
||||||
@ -630,7 +722,7 @@ def main(args: argparse.Namespace):
|
|||||||
file_name = args.result_filename
|
file_name = args.result_filename
|
||||||
if args.result_dir:
|
if args.result_dir:
|
||||||
file_name = os.path.join(args.result_dir, file_name)
|
file_name = os.path.join(args.result_dir, file_name)
|
||||||
with open(file_name, "w") as outfile:
|
with open(file_name, "w", encoding='utf-8') as outfile:
|
||||||
json.dump(result_json, outfile)
|
json.dump(result_json, outfile)
|
||||||
|
|
||||||
|
|
||||||
@ -668,13 +760,14 @@ if __name__ == "__main__":
|
|||||||
"--dataset-name",
|
"--dataset-name",
|
||||||
type=str,
|
type=str,
|
||||||
default="sharegpt",
|
default="sharegpt",
|
||||||
choices=["sharegpt", "sonnet", "random"],
|
choices=["sharegpt", "sonnet", "random", "hf"],
|
||||||
help="Name of the dataset to benchmark on.",
|
help="Name of the dataset to benchmark on.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--dataset-path",
|
parser.add_argument("--dataset-path",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Path to the dataset.")
|
help="Path to the sharegpt/sonnet dataset. "
|
||||||
|
"Or the huggingface dataset ID if using HF dataset.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model",
|
"--model",
|
||||||
type=str,
|
type=str,
|
||||||
@ -702,52 +795,14 @@ if __name__ == "__main__":
|
|||||||
help="Number of prompts to process.",
|
help="Number of prompts to process.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--sharegpt-output-len",
|
"--logprobs",
|
||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
help="Output length for each request. Overrides the output length "
|
help=("Number of logprobs-per-token to compute & return as part of "
|
||||||
"from the ShareGPT dataset.")
|
"the request. If unspecified, then either (1) if beam search "
|
||||||
parser.add_argument(
|
"is disabled, no logprobs are computed & a single dummy "
|
||||||
"--sonnet-input-len",
|
"logprob is returned for each token; or (2) if beam search "
|
||||||
type=int,
|
"is enabled 1 logprob per token is computed"),
|
||||||
default=550,
|
|
||||||
help=
|
|
||||||
"Number of input tokens per request, used only for sonnet dataset.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sonnet-output-len",
|
|
||||||
type=int,
|
|
||||||
default=150,
|
|
||||||
help=
|
|
||||||
"Number of output tokens per request, used only for sonnet dataset.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--sonnet-prefix-len",
|
|
||||||
type=int,
|
|
||||||
default=200,
|
|
||||||
help=
|
|
||||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--random-input-len",
|
|
||||||
type=int,
|
|
||||||
default=1024,
|
|
||||||
help=
|
|
||||||
"Number of input tokens per request, used only for random sampling.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--random-output-len",
|
|
||||||
type=int,
|
|
||||||
default=128,
|
|
||||||
help=
|
|
||||||
"Number of output tokens per request, used only for random sampling.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--random-range-ratio",
|
|
||||||
type=float,
|
|
||||||
default=1.0,
|
|
||||||
help="Range of sampled ratio of input/output length, "
|
|
||||||
"used only for random sampling.",
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--request-rate",
|
"--request-rate",
|
||||||
@ -804,6 +859,11 @@ if __name__ == "__main__":
|
|||||||
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||||
" format.",
|
" format.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ignore-eos",
|
||||||
|
action="store_true",
|
||||||
|
help="Set ignore_eos flag when sending the benchmark request."
|
||||||
|
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--percentile-metrics",
|
"--percentile-metrics",
|
||||||
type=str,
|
type=str,
|
||||||
@ -822,5 +882,85 @@ if __name__ == "__main__":
|
|||||||
"Use \"--percentile-metrics\" to select metrics.",
|
"Use \"--percentile-metrics\" to select metrics.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# group for dataset specific arguments
|
||||||
|
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||||
|
sonnet_group.add_argument(
|
||||||
|
"--sonnet-input-len",
|
||||||
|
type=int,
|
||||||
|
default=550,
|
||||||
|
help=
|
||||||
|
"Number of input tokens per request, used only for sonnet dataset.",
|
||||||
|
)
|
||||||
|
sonnet_group.add_argument(
|
||||||
|
"--sonnet-output-len",
|
||||||
|
type=int,
|
||||||
|
default=150,
|
||||||
|
help=
|
||||||
|
"Number of output tokens per request, used only for sonnet dataset.",
|
||||||
|
)
|
||||||
|
sonnet_group.add_argument(
|
||||||
|
"--sonnet-prefix-len",
|
||||||
|
type=int,
|
||||||
|
default=200,
|
||||||
|
help=
|
||||||
|
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||||
|
)
|
||||||
|
|
||||||
|
sharegpt_group = parser.add_argument_group("sharegpt dataset options")
|
||||||
|
sharegpt_group.add_argument(
|
||||||
|
"--sharegpt-output-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Output length for each request. Overrides the output length "
|
||||||
|
"from the ShareGPT dataset.")
|
||||||
|
|
||||||
|
random_group = parser.add_argument_group("random dataset options")
|
||||||
|
random_group.add_argument(
|
||||||
|
"--random-input-len",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help=
|
||||||
|
"Number of input tokens per request, used only for random sampling.",
|
||||||
|
)
|
||||||
|
random_group.add_argument(
|
||||||
|
"--random-output-len",
|
||||||
|
type=int,
|
||||||
|
default=128,
|
||||||
|
help=
|
||||||
|
"Number of output tokens per request, used only for random sampling.",
|
||||||
|
)
|
||||||
|
random_group.add_argument(
|
||||||
|
"--random-range-ratio",
|
||||||
|
type=float,
|
||||||
|
default=1.0,
|
||||||
|
help="Range of sampled ratio of input/output length, "
|
||||||
|
"used only for random sampling.",
|
||||||
|
)
|
||||||
|
random_group.add_argument(
|
||||||
|
"--random-prefix-len",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Number of fixed prefix tokens before random "
|
||||||
|
" context. The length range of context in a random "
|
||||||
|
" request is [random-prefix-len, "
|
||||||
|
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||||
|
|
||||||
|
hf_group = parser.add_argument_group("hf dataset options")
|
||||||
|
hf_group.add_argument("--hf-subset",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Subset of the HF dataset.")
|
||||||
|
hf_group.add_argument("--hf-split",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Split of the HF dataset.")
|
||||||
|
hf_group.add_argument(
|
||||||
|
"--hf-output-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Output length for each request. Overrides the output lengths "
|
||||||
|
"from the sampled HF dataset.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
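A note on the serving-benchmark changes above: sampled requests now carry a fourth element (an optional multi-modal payload), and the per-request options gain `logprobs` and `ignore_eos` while dropping `use_beam_search`. The following is a minimal sketch of how such a 4-tuple would be turned into a request object; `RequestFuncInput` below is a stand-in dataclass with the same field names as in the diff, not the real class from the benchmark, and the prompt, URL, and image payload are placeholders.

    # Hedged sketch: how the new (prompt, prompt_len, output_len, mm_content)
    # tuples flow into a request object. Names marked "stand-in" are assumptions.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class RequestFuncInput:  # stand-in for the benchmark's real class
        model: str
        prompt: str
        api_url: str
        prompt_len: int
        output_len: int
        logprobs: Optional[int] = None
        best_of: int = 1
        multi_modal_content: Optional[dict] = None
        ignore_eos: bool = False

    # One sampled request: text-only datasets put None in the last slot.
    request = ("Describe this image.", 5, 64,
               {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}})
    prompt, prompt_len, output_len, mm_content = request

    backend = "openai-chat"
    if backend != "openai-chat" and mm_content is not None:
        # mirrors the guard added in the diff
        raise ValueError("Multi-modal content is only supported on 'openai-chat' backend.")

    req = RequestFuncInput(model="my-model", prompt=prompt,
                           api_url="http://localhost:8000/v1/chat/completions",
                           prompt_len=prompt_len, output_len=output_len,
                           logprobs=1, multi_modal_content=mm_content,
                           ignore_eos=True)
    print(req)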
@@ -11,10 +11,11 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)

-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators


@@ -72,7 +73,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -125,16 +125,33 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))

-    start = time.perf_counter()
-    llm.generate(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
+    use_beam_search = False
+
+    if not use_beam_search:
+        start = time.perf_counter()
+        llm.generate(prompts, sampling_params, use_tqdm=True)
+        end = time.perf_counter()
+    else:
+        prompts = [prompt for prompt, _, _ in requests]
+        # output_len should be the same for all requests.
+        output_len = requests[0][2]
+        for prompt, input_len, _output_len in requests:
+            assert _output_len == output_len
+        start = time.perf_counter()
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ))
+        end = time.perf_counter()
     return end - start


@@ -146,7 +163,6 @@ async def run_vllm_async(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -191,7 +207,6 @@ async def run_vllm_async(
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
         worker_use_ray=False,
-        engine_use_ray=False,
         disable_log_requests=True,
     )

@@ -206,9 +221,8 @@ async def run_vllm_async(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -230,11 +244,9 @@ def run_hf(
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
-    use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
@@ -266,7 +278,7 @@ def run_hf(
                 padding=True).input_ids
             llm_outputs = llm.generate(
                 input_ids=input_ids.cuda(),
-                do_sample=not use_beam_search,
+                do_sample=True,
                 num_return_sequences=n,
                 temperature=1.0,
                 top_p=1.0,
@@ -322,7 +334,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         run_args = [
             requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
@@ -341,8 +353,7 @@ def main(args: argparse.Namespace):
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -396,7 +407,6 @@ if __name__ == "__main__":
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
@@ -451,13 +461,11 @@ if __name__ == "__main__":
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
-   parser.add_argument(
-       "--device",
-       type=str,
-       default="auto",
-       choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-       help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-       'CPU.')
+   parser.add_argument("--device",
+                       type=str,
+                       default="auto",
+                       choices=DEVICE_OPTIONS,
+                       help='device type for vLLM execution')
    parser.add_argument(
        "--num-scheduler-steps",
        type=int,
@@ -465,6 +473,7 @@ if __name__ == "__main__":
        help="Maximum number of forward steps per scheduler call.")
    parser.add_argument("--use-v2-block-manager",
                        action='store_true',
+                       default=EngineArgs.use_v2_block_manager,
                        help="Enable block manager v2.")
    parser.add_argument(
        "--enable-prefix-caching",
@@ -553,8 +562,6 @@ if __name__ == "__main__":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
-       if args.use_beam_search:
-           raise ValueError("Beam search is not supported for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
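The throughput-benchmark changes above remove `use_beam_search` from `SamplingParams` and route beam search through a dedicated `llm.beam_search(...)` call with `BeamSearchParams`. A hedged sketch of the two timing paths follows; it assumes a working vLLM install and uses a placeholder model name, so treat it as an illustration of the new API split rather than the benchmark itself.

    # Sketch of the regular-sampling vs. beam-search paths after this change.
    import time
    from vllm import LLM, SamplingParams
    from vllm.sampling_params import BeamSearchParams

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    prompts = ["Hello, my name is", "The capital of France is"]

    # Regular sampling: beam search is no longer a SamplingParams flag.
    params = SamplingParams(n=1, temperature=1.0, top_p=1.0,
                            ignore_eos=True, max_tokens=32)
    start = time.perf_counter()
    llm.generate(prompts, params, use_tqdm=True)
    print("generate:", time.perf_counter() - start)

    # Beam search now goes through its own entry point, mirroring the diff.
    start = time.perf_counter()
    llm.beam_search(prompts,
                    BeamSearchParams(beam_width=4, max_tokens=32, ignore_eos=True))
    print("beam_search:", time.perf_counter() - start)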
@@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -16,10 +16,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")

     layer = RMSNorm(hidden_size).to(dtype=dtype)
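Across the kernel benchmarks in this range, the ad-hoc seeding calls are collapsed into a single `vllm.utils.seed_everything(seed)` helper. The sketch below is a rough approximation of what such a helper does, inferred from the calls it replaces; the actual vLLM implementation may differ (the numpy line in particular is an assumption).

    # Approximate stand-in for vllm.utils.seed_everything, for illustration only.
    import random
    import numpy as np  # assumption: numpy seeding is also covered
    import torch

    def seed_everything(seed: int) -> None:
        random.seed(seed)              # Python RNG
        np.random.seed(seed)           # NumPy RNG (assumed)
        torch.manual_seed(seed)        # CPU torch RNG
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)  # all CUDA devices

    seed_everything(0)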
@@ -4,8 +4,10 @@ import itertools
 import math
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from itertools import product
+from typing import Callable, Iterable, List, Optional, Tuple

+import pandas as pd
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
@@ -84,6 +86,10 @@ def loop_over_weights(
         fn(a, w_ref, w_q, w_s)


+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+
+
 def bench(atype: torch.dtype,
           wtype: ScalarType,
           group_size: int,
@@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
           sub_label: str,
           benchmark_marlinv1: bool = True,
           sweep_schedules: bool = True) -> Iterable[TMeasurement]:
+    global _SWEEP_SCHEDULES_RESULTS
+
     a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
     sub_label += f", L={len(weights)}"

@@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
         best_schedule = None
         schedules = ops.machete_supported_schedules(wtype)
         for schedule in reversed(schedules):
+            schedule_M = int(schedule.split("_")[0].split("x")[1])
+
+            # Prune known bad schedules
+            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
+                continue

             def run(a, _, w_q, w_s, schedule=schedule):
                 ops.machete_gemm(a,
@@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
             res = bench_fn(label, sub_label, "machete_best",
                            lambda: loop_over_weights(a, weights_machete, run))

+            results_row = {
+                "M": m,
+                "K": k,
+                "N": n,
+                "group_size": group_size,
+                "schedule": schedule,
+                "median": res.median,
+            }
+            if _SWEEP_SCHEDULES_RESULTS is None:
+                _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
+                    columns=results_row.keys())
+            _SWEEP_SCHEDULES_RESULTS.\
+                loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
+
             print(f" {res.median:5.5} ", schedule)
             if not best or res.median < best.median:
                 best = res
@@ -235,18 +262,22 @@ def run_square_bench(args):
     dim_sizes = list(
         range(args.dim_start, args.dim_end + 1, args.dim_increment))
     MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))

     data = run(args.dtype, args.sweep_schedules, MKNs)

     make_output(data, MKNs, f"square_bench-{args.dtype}")


 def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
+    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_increment, k_increment, n_increment = \
+        [int(x) for x in args.dim_increment.split(",")]
+    Ms = list(range(m_start, m_end + 1, m_increment))
+    Ks = list(range(k_start, k_end + 1, k_increment))
+    Ns = list(range(n_start, n_end + 1, n_increment))
+    MKNs = list(product(Ms, Ks, Ns))

     data = run(args.dtype, args.sweep_schedules, MKNs)

     make_output(data, MKNs, f"range_bench-{args.dtype}")
@@ -333,6 +364,9 @@ Benchmark Machete GEMM.
         action="store_true",
         help="Run a sweep over all supported schedules",
     )
+    parser.add_argument("--sweep-csv-out",
+                        help="CSV to store sweep results",
+                        default="sch_sweep_results.csv")
     subparsers = parser.add_subparsers(dest="cmd", required=True)

     square_parser = subparsers.add_parser("square_bench")
@@ -342,12 +376,21 @@ Benchmark Machete GEMM.
     square_parser.set_defaults(func=run_square_bench)

     range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.add_argument(
+        "--dim-start",
+        type=str,
+        required=True,
+        help="Start value for M,K,N as common separated list")
+    range_parser.add_argument(
+        "--dim-end",
+        type=str,
+        required=True,
+        help="End value (inclusive) for M,K,N as common separated list")
+    range_parser.add_argument(
+        "--dim-increment",
+        type=str,
+        required=True,
+        help="Increment value for M,K,N as common separated list")
     range_parser.set_defaults(func=run_range_bench)

     model_parser = subparsers.add_parser("model_bench")
@@ -369,4 +412,9 @@ Benchmark Machete GEMM.
     model_parser.set_defaults(func=run_model_bench)

     args = parser.parse_args()
+
+    _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
     args.func(args)
+
+    if _SWEEP_SCHEDULES_RESULTS is not None:
+        _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
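The `range_bench` rework above changes the `--dim-start`, `--dim-end`, and `--dim-increment` flags from single integers to comma-separated "M,K,N" strings and sweeps the full cross product of the three ranges. The standalone sketch below walks through exactly that parsing and expansion; the numeric values are placeholders chosen only to make the example print something.

    # Standalone sketch of the new dimension handling in range_bench.
    from itertools import product

    dim_start, dim_end, dim_increment = "16,4096,4096", "64,8192,8192", "16,4096,4096"
    m_start, k_start, n_start = [int(x) for x in dim_start.split(",")]
    m_end, k_end, n_end = [int(x) for x in dim_end.split(",")]
    m_inc, k_inc, n_inc = [int(x) for x in dim_increment.split(",")]

    Ms = list(range(m_start, m_end + 1, m_inc))
    Ks = list(range(k_start, k_end + 1, k_inc))
    Ns = list(range(n_start, n_end + 1, n_inc))

    # Every (M, K, N) combination is benchmarked, not a zipped triple.
    MKNs = list(product(Ms, Ks, Ns))
    print(len(MKNs), MKNs[:3])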
@@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 class BenchmarkConfig(TypedDict):
@@ -166,7 +166,7 @@ class BenchmarkWorker:

     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
+        seed_everything(seed)
         self.seed = seed

     def benchmark(
@@ -180,7 +180,7 @@ class BenchmarkWorker:
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
     ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
+        seed_everything(self.seed)
         dtype_str = get_config_dtype_str(dtype,
                                          use_int8_w8a16=use_int8_w8a16,
                                          use_fp8_w8a8=use_fp8_w8a8)
@@ -6,7 +6,7 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+                        create_kv_caches_with_random, seed_everything)

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -28,10 +28,7 @@ def main(
     device: str = "cuda",
     kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(num_seqs,
@@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


 @torch.inference_mode()
@@ -17,10 +17,7 @@ def main(num_tokens: int,
          do_profile: bool = False,
          num_warmup_iters: int = 5,
          num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device("cuda")

     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -6,7 +6,7 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                          get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 def benchmark_rope_kernels_multi_lora(
@@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
     max_position: int = 8192,
     base: int = 10000,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
     torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size
@@ -45,8 +45,7 @@ if __name__ == "__main__":
     rows = int(math.ceil(len(results) / 2))
     fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
     axs = axs.flatten()
-    axs_idx = 0
-    for shape, data in results.items():
+    for axs_idx, (shape, data) in enumerate(results.items()):
         plt.sca(axs[axs_idx])
         df = pd.DataFrame(data)
         sns.lineplot(data=df,
@@ -59,6 +58,5 @@ if __name__ == "__main__":
                      palette="Dark2")
         plt.title(f"Shape: {shape}")
         plt.ylabel("time (median, s)")
-        axs_idx += 1
     plt.tight_layout()
     plt.savefig("graph_machete_bench.pdf")
benchmarks/kernels/requirements.txt (new file, 1 line)
@@ -0,0 +1 @@
+pandas
@@ -1,4 +1,5 @@
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CXX_STANDARD 17)

 #
 # Define environment variables for special configurations
@@ -83,12 +84,12 @@ endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

-list(APPEND LIBS "numa")
+list(APPEND LIBS numa)

-#
-# Define extension targets
-#
+# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
+if (AVX2_FOUND OR AVX512_FOUND)
+    list(APPEND LIBS dnnl)
+endif()

 #
 # _C extension
@@ -102,6 +103,16 @@ set(VLLM_EXT_SRC
     "csrc/cpu/pos_encoding.cpp"
     "csrc/cpu/torch_bindings.cpp")

+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/quant.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+
+#
+# Define extension targets
+#
+
 define_gpu_extension_target(
     _C
     DESTINATION vllm
@@ -114,4 +125,3 @@ define_gpu_extension_target(
 )

 message(STATUS "Enabling C extension.")
-add_dependencies(default _C)
@@ -133,10 +133,181 @@ macro(string_to_ver OUT_VER IN_STR)
   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
 endmacro()

+#
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
+# `CUDA_ARCH_FLAGS`.
+#
+# Example:
+#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
+#   clear_cuda_arches(CUDA_ARCH_FLAGS)
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   CMAKE_CUDA_FLAGS="-Wall"
+#
+macro(clear_cuda_arches CUDA_ARCH_FLAGS)
+    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+
+    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
+    # and passed back via the `CUDA_ARCHITECTURES` property.
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+endmacro()
+
+#
+# Extract unique CUDA architectures from a list of compute capabilities codes in
+# the form `<major><minor>[<letter>]`, convert them to the form sort
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and
+# stores them in `OUT_ARCHES`.
+#
+# Example:
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
+#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
+#   OUT_ARCHES="7.5;...;9.0"
+function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
+  set(_CUDA_ARCHES)
+  foreach(_ARCH ${CUDA_ARCH_FLAGS})
+    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
+    if (_COMPUTE)
+      set(_COMPUTE ${CMAKE_MATCH_1})
+    endif()
+
+    string_to_ver(_COMPUTE_VER ${_COMPUTE})
+    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHES)
+  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
+  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
+endfunction()
+
+#
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
+#
+# Example:
+#   set_gencode_flag_for_srcs(
+#     SRCS "foo.cu"
+#     ARCH "compute_75"
+#     CODE "sm_75")
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
+#   `foo.cu` (only for the CUDA language).
+#
+macro(set_gencode_flag_for_srcs)
+  set(options)
+  set(oneValueArgs ARCH CODE)
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
+  set_property(
+    SOURCE ${arg_SRCS}
+    APPEND PROPERTY
+    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
+  )
+
+  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
+endmacro(set_gencode_flag_for_srcs)
+
+#
+# For a list of source files set the `-gencode` flags in the files specific
+# compile options (specifically for the CUDA language).
+#
+# arguments are:
+#  SRCS: list of source files
+#  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
+#  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
+#   for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
+#   that is larger than BUILD_PTX_FOR_ARCH.
+#
+macro(set_gencode_flags_for_srcs)
+  set(options)
+  set(oneValueArgs BUILD_PTX_FOR_ARCH)
+  set(multiValueArgs SRCS CUDA_ARCHS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  foreach(_ARCH ${arg_CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    set_gencode_flag_for_srcs(
+      SRCS ${arg_SRCS}
+      ARCH "compute_${_ARCH}"
+      CODE "sm_${_ARCH}")
+  endforeach()
+
+  if (${arg_BUILD_PTX_FOR_ARCH})
+    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
+    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
+      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_PTX_ARCH}"
+        CODE "compute_${_PTX_ARCH}")
+    endif()
+  endif()
+endmacro()
+
+#
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
+#  `<major>.<minor>[letter]` compute the "loose intersection" with the
+#  `TGT_CUDA_ARCHS` list of gencodes.
+# The loose intersection is defined as:
+#   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
+#  where `<=` is the version comparison operator.
+# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
+#  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
+# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
+#  in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
+#  9.0a to the result.
+# The result is stored in `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
+#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
+#
+function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+
+  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
+  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
+  set(_CUDA_ARCHS)
+  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      set(_CUDA_ARCHS "9.0a")
+    endif()
+  endif()
+
+  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
+  # less or eqault to ARCH
+  foreach(_ARCH ${CUDA_ARCHS})
+    set(_TMP_ARCH)
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        set(_TMP_ARCH ${_SRC_ARCH})
+      else()
+        break()
+      endif()
+    endforeach()
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
+    endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`.
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
+# the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
 #
@@ -174,109 +345,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
       "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
      " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()
-
-  elseif(${GPU_LANG} STREQUAL "CUDA")
-    #
-    # Setup/process CUDA arch flags.
-    #
-    # The torch cmake setup hardcodes the detected architecture flags in
-    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-    # can't modified on a per-target basis.
-    # So, all the `-gencode` flags need to be extracted and removed from
-    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
-    # Since it's not possible to use `target_compiler_options` for adding target
-    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
-    # must be used instead. This requires repackaging the architecture flags
-    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
-    #
-    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
-    # to one of the other nvcc options to specify architectures.
-    #
-    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
-    # detected architectures.
-    #
-    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-
-    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
-    # and passed back via the `CUDA_ARCHITECTURES` property.
-    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # If this error is triggered, it might mean that torch has changed how it sets
-    # up nvcc architecture code generation flags.
-    if (NOT _CUDA_ARCH_FLAGS)
-      message(FATAL_ERROR
-        "Could not find any architecture related code generation flags in "
-        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
-    endif()
-
-    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
-
-    # Initialize the architecture lists to empty.
-    set(${GPU_ARCHES})
-
-    # Process each `gencode` flag.
-    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
-      # For each flag, extract the version number and whether it refers to PTX
-      # or native code.
-      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
-      # for that match.
-
-      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
-      if (_COMPUTE)
-        set(_COMPUTE ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
-      if (_SM)
-        set(_SM ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
-      if (_CODE)
-        set(_CODE ${CMAKE_MATCH_1})
-      endif()
-
-      # Make sure the virtual architecture can be matched.
-      if (NOT _COMPUTE)
-        message(FATAL_ERROR
-          "Could not determine virtual architecture from: ${_ARCH}.")
-      endif()
-
-      # One of sm_ or compute_ must exist.
-      if ((NOT _SM) AND (NOT _CODE))
-        message(FATAL_ERROR
-          "Could not determine a codegen architecture from: ${_ARCH}.")
-      endif()
-
-      if (_SM)
-        # -real suffix let CMake to only generate elf code for the kernels.
-        # we want this, otherwise the added ptx (default) will increase binary size.
-        set(_VIRT "-real")
-        set(_CODE_ARCH ${_SM})
-      else()
-        # -virtual suffix let CMake to generate ptx code for the kernels.
-        set(_VIRT "-virtual")
-        set(_CODE_ARCH ${_CODE})
-      endif()
-
-      # Check if the current version is in the supported arch list.
-      string_to_ver(_CODE_VER ${_CODE_ARCH})
-      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
-        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
-        continue()
-      endif()
-
-      # Add it to the arch list.
-      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
-    endforeach()
   endif()
-  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
 endmacro()

 #
@@ -350,17 +419,19 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
     ${GPU_INCLUDE_DIRECTORIES})

-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
-    ${GPU_LIBRARIES})
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
   if (GPU_LANGUAGE STREQUAL "CUDA")
+    if ("${CUDA_CUDA_LIB}" STREQUAL "")
+      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
+    endif()
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
       ${CUDA_LIBRARIES})
   else()
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
   endif()

-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
+  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
 endfunction()
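The new `cuda_archs_loose_intersection` helper above defines the "loose intersection" in set-builder form, which is easier to follow when spelled out procedurally. The sketch below restates the same selection rule in Python purely for readability (the real helper is CMake): handle the 9.0a special case first, then for each target architecture keep the highest source architecture that does not exceed it.

    # Python restatement of the loose-intersection rule, for illustration only.
    def loose_intersection(src, tgt):
        result = []
        # sort source archs ascending by (major, minor); "9.0a" sorts like "9.0"
        src = sorted(set(src), key=lambda v: [int(p) for p in v.rstrip("a").split(".")])
        if "9.0a" in src:
            src.remove("9.0a")
            if "9.0" in tgt:
                result.append("9.0a")
        for t in tgt:
            tv = [int(p) for p in t.split(".")]
            # all source archs <= the target arch; keep the largest one, if any
            candidates = [s for s in src if [int(p) for p in s.split(".")] <= tv]
            if candidates:
                result.append(candidates[-1])
        # dedupe while preserving order
        return list(dict.fromkeys(result))

    out = loose_intersection(["7.5", "8.0", "8.6", "9.0", "9.0a"],
                             ["8.0", "8.9", "9.0"])
    assert set(out) == {"8.0", "8.6", "9.0", "9.0a"}  # matches the comment's example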
@@ -267,13 +267,16 @@ def get_neuron_sdk_version(run_lambda):


 def get_vllm_version():
-    try:
-        import vllm
-        return vllm.__version__ + "@" + vllm.__commit__
-    except Exception:
-        # old version of vllm does not have __commit__
-        return 'N/A'
+    from vllm import __version__, __version_tuple__
+
+    if __version__ == "dev":
+        return "N/A (dev)"
+
+    if len(__version_tuple__) == 4:  # dev build
+        git_sha = __version_tuple__[-1][1:]  # type: ignore
+        return f"{__version__} (git sha: {git_sha}"
+
+    return __version__

 def summarize_vllm_build_flags():
     # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@@ -285,9 +288,14 @@ def summarize_vllm_build_flags():


 def get_gpu_topo(run_lambda):
+    output = None
+
     if get_platform() == 'linux':
-        return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
-    return None
+        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
+        if output is None:
+            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')
+
+    return output


 # example outputs of CPU infos
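For the version-reporting change above, the new code infers a dev build from a four-element `__version_tuple__` whose last element carries a `g`-prefixed git sha. The sketch below shows that parsing in isolation; the tuple value is made up for illustration and the exact shape of vLLM's version tuple is an assumption based on the `[-1][1:]` slice in the diff.

    # Hedged sketch of the dev-build version formatting; sample values are invented.
    def format_vllm_version(version: str, version_tuple: tuple) -> str:
        if version == "dev":
            return "N/A (dev)"
        if len(version_tuple) == 4:  # dev build: last element assumed to be "g<sha>"
            git_sha = version_tuple[-1][1:]
            return f"{version} (git sha: {git_sha})"
        return version

    print(format_vllm_version("0.6.2.dev5", (0, 6, 2, "g1234abc")))
    # -> 0.6.2.dev5 (git sha: 1234abc)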
csrc/core/exception.hpp (new file, 3 lines)
@@ -0,0 +1,3 @@
+#pragma once
+
+#define VLLM_IMPLIES(p, q) (!(p) || (q))
@@ -12,6 +12,11 @@
 // could be a macro instead of a literal token.
 #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

+// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+
 // REGISTER_EXTENSION allows the shared library to be loaded and initialized
 // via python's import statement.
 #define REGISTER_EXTENSION(NAME) \
@@ -24,8 +24,8 @@ namespace vec_op {
 #define CPU_KERNEL_GUARD_OUT(NAME)
 #else
 #define CPU_KERNEL_GUARD_IN(NAME) \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
+  RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #endif

 #define FORCE_INLINE __attribute__((always_inline)) inline

@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
   explicit BF16Vec16(const FP32Vec16 &);

   void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+
+  void save(void* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm256_mask_storeu_epi16(ptr, mask, reg);
+  }
 };

 #ifdef __AVX512F__

@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     return FP32Vec16(_mm512_div_ps(reg, b.reg));
   }

+  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
+    return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
+  }
+
+  FP32Vec16 max(const FP32Vec16& b) const {
+    return FP32Vec16(_mm512_max_ps(reg, b.reg));
+  }
+
+  FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
+  }
+
+  FP32Vec16 abs() const {
+    return FP32Vec16(_mm512_abs_ps(reg));
+  }
+
   float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

+  float reduce_max() const { return _mm512_reduce_max_ps(reg); }
+
   template <int group_size> float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
     constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));

@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
   }

   void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+
+  void save(float* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm512_mask_storeu_ps(ptr, mask, reg);
+  }
 };
 #else
 struct FP32Vec16 : public Vec<FP32Vec16> {

@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 };
 #endif

+#ifdef __AVX512F__
+struct INT8Vec16: public Vec<INT8Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m128i reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+
+  __m128i reg;
+
+  explicit INT8Vec16(const FP32Vec16& vec) : reg(
+    _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
+  ) {}
+
+  void save(int8_t* ptr) const {
+    _mm_storeu_epi8(ptr, reg);
+  }
+
+  void save(int8_t* ptr, const int elem_num) const {
+    constexpr uint32_t M = 0xFFFFFFFF;
+    __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+    _mm_mask_storeu_epi8(ptr, mask, reg);
+  }
+};
+#endif
+
 template <typename T> struct VecType { using vec_type = void; };

 template <typename T> using vec_t = typename VecType<T>::vec_type;
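The masked save(ptr, elem_num) overloads added above all build their AVX-512 store mask the same way, so here is a minimal standalone sketch (my illustration only; tail_mask is a hypothetical helper) showing why 0xFFFFFFFF >> (32 - elem_num) yields exactly elem_num low bits set before _cvtu32_mask16 narrows it to a 16-lane mask.

```cpp
// Minimal sketch: the shift leaves elem_num low bits set (valid for 1 <= elem_num <= 16),
// which is exactly the per-lane predicate the masked stores need for a partial tail.
#include <cassert>
#include <cstdint>

constexpr uint32_t tail_mask(int elem_num) {
  constexpr uint32_t M = 0xFFFFFFFF;
  return M >> (32 - elem_num);
}

int main() {
  assert(tail_mask(1) == 0b1u);
  assert(tail_mask(3) == 0b111u);
  assert(tail_mask(16) == 0xFFFFu);  // full vector: all 16 lanes enabled
  return 0;
}
```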
csrc/cpu/dnnl_helper.hpp (new file, 168 lines)
@@ -0,0 +1,168 @@
#ifndef DNNL_HELPER_HPP
#define DNNL_HELPER_HPP

#include <c10/util/BFloat16.h>

#include "oneapi/dnnl/dnnl.hpp"

namespace {
template <typename T>
struct DNNLType {
  static constexpr dnnl::memory::data_type type =
      dnnl::memory::data_type::undef;
};

template <>
struct DNNLType<int8_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
};

template <>
struct DNNLType<int32_t> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
};

template <>
struct DNNLType<float> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
};

template <>
struct DNNLType<c10::BFloat16> {
  static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};

template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
}
};  // namespace

template <bool InputNoScale>
class DNNLPrimitiveHelper {
 public:
  // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
  // A: [M, K], row-major
  // B: [K, N], column-major
  // C: [M, N], row-major
  // bias: [N], row-major, optional
  // a_scales: [MS]
  // b_scales: [NS]
  // Note: Due to the limitation of oneDNN
  // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
  // not supported.
  template <typename OutputT, typename BiasT>
  static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
                            const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
                            dnnl_dim_t K, const float* a_scales,
                            const float* b_scales, dnnl_dim_t MS,
                            dnnl_dim_t NS) {
    auto&& OutputType = get_dnnl_type<OutputT>();
    auto&& BiasType = get_dnnl_type<BiasT>();

    dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
    dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
    dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});

    dnnl::primitive_attr attr;
    if constexpr (!InputNoScale) {
      if (MS == 1) {
        // per-tensor
        attr.set_scales_mask(DNNL_ARG_SRC, 0);
      } else {
        // per-token
        TORCH_CHECK(false, "per-token quantization is unsupported.");
      }
    }

    if (NS == 1) {
      // per-tensor
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
    } else {
      // per-channel
      attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
    }

    dnnl::matmul::primitive_desc matmul_pd;
    if (bias) {
      dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               bias_md, c_md, attr);
    } else {
      matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                               c_md, attr);
    }
    dnnl::matmul matmul(matmul_pd);

    auto& engine = default_engine();

    dnnl::memory a_m(a_md, engine, (void*)a);
    dnnl::memory b_m(b_md, engine, (void*)b);
    dnnl::memory c_m(c_md, engine, (void*)c);
    dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)a_scales);
    dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
                            (void*)b_scales);

    auto& stream = default_stream();
    if constexpr (InputNoScale) {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    } else {
      if (bias) {
        dnnl::memory::desc bias_md({N}, BiasType, {1});
        dnnl::memory bias_m(bias_md, engine, (void*)bias);
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_BIAS, bias_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      } else {
        matmul.execute(
            stream, {
                        {DNNL_ARG_SRC, a_m},
                        {DNNL_ARG_WEIGHTS, b_m},
                        {DNNL_ARG_DST, c_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                    });
      }
    }
    stream.wait();
  }

 private:
  static dnnl::engine& default_engine() {
    static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
    return engine;
  }

  static dnnl::stream& default_stream() {
    static dnnl::stream stream(default_engine());
    return stream;
  }
};

#endif
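To make the layout/scale comment on gemm_s8s8_jit concrete, here is a plain scalar reference of the same computation (a sketch of mine, not the oneDNN path; ref_gemm_s8s8 is a hypothetical name). A is row-major [M, K], B is column-major [K, N] (element (k, n) lives at b[n * K + k]), and each scale broadcasts per-tensor when MS or NS is 1.

```cpp
// Scalar reference for C = a_scale * (A @ B) * b_scale + bias, matching the
// layouts documented above. Accumulation is done in int32 before scaling.
#include <cstdint>

void ref_gemm_s8s8(const int8_t* a, const int8_t* b, float* c, const float* bias,
                   int M, int N, int K, const float* a_scales,
                   const float* b_scales, int MS, int NS) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += int32_t(a[m * K + k]) * int32_t(b[n * K + k]);  // B is column-major
      }
      const float a_s = a_scales[MS == 1 ? 0 : m];  // per-tensor or per-row
      const float b_s = b_scales[NS == 1 ? 0 : n];  // per-tensor or per-column
      c[m * N + n] = a_s * b_s * float(acc) + (bias ? float(bias[n]) : 0.0f);
    }
  }
}
```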
csrc/cpu/quant.cpp (new file, 297 lines)
@@ -0,0 +1,297 @@
#include "cpu_types.hpp"
#include "dnnl_helper.hpp"

namespace {
template <typename scalar_t>
struct KernelVecType {
  using load_vec_type = void;
  using cvt_vec_type = void;
};

template <>
struct KernelVecType<float> {
  using load_vec_type = vec_op::FP32Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
};

template <>
struct KernelVecType<c10::BFloat16> {
  using load_vec_type = vec_op::BF16Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
};

#ifdef __AVX512F__
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int num_tokens,
                                   const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;

  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t inv_scale(1.0 / *scale);
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);

  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output + i * hidden_size + j);
    }

    load_vec_t elems(input + i * hidden_size + j);
    cvt_vec_t elems_fp32(elems);
    elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
    vec_op::INT8Vec16 elems_int8(elems_fp32);

    if (j + vec_elem_num == hidden_size) {
      elems_int8.save(output + i * hidden_size + j);
    } else {
      elems_int8.save(output + i * hidden_size + j, hidden_size - j);
    }
  }
}

template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, const int num_tokens,
                                    const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;

  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    cvt_vec_t max_abs(0.0);
    {
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        max_abs = max_abs.max(elems_fp32.abs());
      }

      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);

      if (j + vec_elem_num == hidden_size) {
        max_abs = max_abs.max(elems_fp32.abs());
      } else {
        max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
      }
    }

    float scale_val = max_abs.reduce_max() / 127.0f;
    scale[i] = scale_val;
    const cvt_vec_t inv_scale(1.0 / scale_val);

    {
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input + i * hidden_size + j);
        cvt_vec_t elems_fp32(elems);
        elems_fp32 = (elems_fp32 * inv_scale);
        vec_op::INT8Vec16 elems_int8(elems_fp32);
        elems_int8.save(output + i * hidden_size + j);
      }

      load_vec_t elems(input + i * hidden_size + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale);
      vec_op::INT8Vec16 elems_int8(elems_fp32);

      if (j + vec_elem_num == hidden_size) {
        elems_int8.save(output + i * hidden_size + j);
      } else {
        elems_int8.save(output + i * hidden_size + j, hidden_size - j);
      }
    }
  }
}

template <bool Bias, typename scalar_t>
void dynamic_output_scale_impl(const float* input, scalar_t* output,
                               const float* scale, const scalar_t* bias,
                               const int num_tokens, const int hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;

  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    int j = 0;
    cvt_vec_t token_scale_vec(scale[i]);
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input + i * hidden_size + j);
      elems_fp32 = elems_fp32 * token_scale_vec;

      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }

      load_vec_t elems_out(elems_fp32);
      elems_out.save(output + i * hidden_size + j);
    }

    cvt_vec_t elems_fp32(input + i * hidden_size + j);
    elems_fp32 = elems_fp32 * token_scale_vec;

    if constexpr (Bias) {
      load_vec_t bias_vec(bias + j);
      cvt_vec_t bias_vec_fp32(bias_vec);
      elems_fp32 = elems_fp32 + bias_vec_fp32;
    }

    load_vec_t elems_out(elems_fp32);

    if (j + vec_elem_num == hidden_size) {
      elems_out.save(output + i * hidden_size + j);
    } else {
      elems_out.save(output + i * hidden_size + j, hidden_size - j);
    }
  }
}
#else
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int num_tokens,
                                   const int hidden_size) {
  TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
}

template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, const int num_tokens,
                                    const int hidden_size) {
  TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
}

template <typename scalar_t>
void dynamic_output_scale_impl() {
  TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
}
#endif
}  // namespace

void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
                    const torch::Tensor& a,         // [M, IC], row-major
                    const torch::Tensor& b,         // [IC, OC], column-major
                    const torch::Tensor& a_scales,  // [1] or [M]
                    const torch::Tensor& b_scales,  // [1] or [OC]
                    const c10::optional<torch::Tensor>& bias  // [OC]
) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
  // Checks for conformality
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm only supports INT8 inputs.")
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));

  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());

  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }

  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
    if (a_scales.numel() != 1) {
      // per-token
      // Note: oneDNN doesn't support per-token activation quantization
      torch::Tensor tmp_fp32_out =
          torch::empty_like(c, ::at::ScalarType::Float);
      DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
          tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
          a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
          b_scales.numel());
      if (bias.has_value()) {
        dynamic_output_scale_impl<true>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
            c.size(1));
      } else {
        dynamic_output_scale_impl<false>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
      }
    } else {
      // per-tensor
      if (bias.has_value()) {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      } else {
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            (void*)(0), a.size(0), b.size(1), a.size(1),
            a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      }
    }
  });
}

// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              const torch::Tensor& input,  // [..., hidden_size]
                              const torch::Tensor& scale,
                              c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  const int hidden_size = input.size(-1);
  const int num_tokens = input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
        static_scaled_int8_quant_impl(
            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
            scale.data_ptr<float>(), num_tokens, hidden_size);
      });
}

// dynamic-per-token quantization.
void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    const torch::Tensor& input,  // [..., hidden_size]
    torch::Tensor& scale,        // [..., 1]
    c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
        dynamic_scaled_int8_quant_impl(
            input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
            scale.data_ptr<float>(), num_tokens, hidden_size);
      });
}
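The two quantization entry points above reduce to simple per-token arithmetic once the AVX-512 vectorization is stripped away. This scalar sketch (mine; the helper names are hypothetical) shows the math: static quantization divides by a fixed scale and clamps to the int8 range, while dynamic quantization first derives scale = max|x| / 127 for each token.

```cpp
// Scalar sketch of the quantization math used by the kernels above.
#include <algorithm>
#include <cmath>
#include <cstdint>

void static_quant_token(const float* x, int8_t* q, int n, float scale) {
  for (int j = 0; j < n; ++j) {
    float v = x[j] / scale;
    v = std::min(127.0f, std::max(-128.0f, v));    // clamp to int8 range
    q[j] = static_cast<int8_t>(std::nearbyint(v)); // round to nearest even
  }
}

float dynamic_quant_token(const float* x, int8_t* q, int n) {
  float max_abs = 0.0f;
  for (int j = 0; j < n; ++j) max_abs = std::max(max_abs, std::fabs(x[j]));
  float scale = max_abs / 127.0f;                  // per-token scale
  if (scale == 0.0f) scale = 1.0f;                 // guard added for this sketch only
  for (int j = 0; j < n; ++j)
    q[j] = static_cast<int8_t>(std::nearbyint(x[j] / scale));
  return scale;                                    // caller stores this per token
}
```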
@@ -4,7 +4,12 @@

 #include <torch/library.h>

-void init_cpu_threads_env(const std::string& cpu_ids);
+std::string init_cpu_threads_env(const std::string& cpu_ids);
+
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+                    const torch::Tensor& b, const torch::Tensor& a_scales,
+                    const torch::Tensor& b_scales,
+                    const c10::optional<torch::Tensor>& bias);

 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops

@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // PagedAttention V2.
   ops.def(
       "paged_attention_v2("
-      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      " Tensor tmp_out, Tensor query, Tensor key_cache,"
+      " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      " Tensor! tmp_out, Tensor query, Tensor key_cache,"
       " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"

@@ -84,6 +89,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor! key, int head_size,"
       " Tensor cos_sin_cache, bool is_neox) -> ()");
   ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+
+  // Quantization
+#ifdef __AVX512F__
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "Tensor? azp) -> ()");
+  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+
+  // Compute int8 quantized tensor and scaling factor
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "Tensor!? azp) -> ()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
+           &dynamic_scaled_int8_quant);
+  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
+      " Tensor b, Tensor a_scales,"
+      " Tensor b_scales, Tensor? bias) -> ()");
+  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
+#endif
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

@@ -95,8 +123,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

   // Copy the cache blocks from src to dst.
   cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
   cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

   // Reshape the key and value tensors and cache them.

@@ -111,7 +139,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
   // CPU utils
-  utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
+  utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
 }

 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
@@ -5,7 +5,7 @@

 #include "cpu_types.hpp"

-void init_cpu_threads_env(const std::string& cpu_ids) {
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
   bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
   TORCH_CHECK(omp_cpu_mask->size > 0);
   std::vector<int> omp_cpu_ids;

@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
   torch::set_num_threads((int)omp_cpu_ids.size());
   TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
   TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
+
+  std::vector<std::pair<int, int>> thread_core_mapping;
+  thread_core_mapping.reserve(omp_cpu_ids.size());
+  omp_lock_t writelock;
+  omp_init_lock(&writelock);
+
 #pragma omp parallel for schedule(static, 1)
   for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
-    cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
-    size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
-    CPU_ZERO_S(size, mask);
-    CPU_SET_S(omp_cpu_ids[i], size, mask);
-    sched_setaffinity(0, sizeof(cpu_set_t), mask);
-    CPU_FREE(mask);
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+    if (ret == -1) {
+      TORCH_CHECK(false,
+                  "sched_setaffinity failed. errno: " + std::to_string(errno));
+    }
+
+    omp_set_lock(&writelock);
+    thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
+    omp_unset_lock(&writelock);
   }
+
+  omp_destroy_lock(&writelock);
+
   numa_free_nodemask(omp_cpu_mask);
+
+  std::stringstream ss;
+  ss << "OMP threads binding of Process " << getpid() << ":\n";
+  std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
+            [](auto&& a, auto&& b) { return a.second < b.second; });
+  for (auto&& item : thread_core_mapping) {
+    ss << "\t"
+       << "OMP tid: " << item.first << ", core " << item.second << "\n";
+  }
+
+  return ss.str();
 }
@@ -55,18 +55,6 @@ bool _is_weak_contiguous(torch::Tensor& t) {
              t.numel() * t.element_size());
 }

-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
-                      bool full_nvlink) {
-  auto inp_size = inp.numel() * inp.element_size();
-  // custom allreduce requires input byte size to be multiples of 16
-  if (inp_size % 16 != 0) return false;
-  if (!_is_weak_contiguous(inp)) return false;
-  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
-  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
-  // performance improvement over NCCL.
-  return false;
-}
-
 void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                  cudaStream_t stream) {
   auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
@@ -6,6 +6,7 @@
 #include <cuda_runtime.h>

 #include <iostream>
+#include <array>
 #include <limits>
 #include <map>
 #include <unordered_map>

@@ -23,17 +24,23 @@

 namespace vllm {

-constexpr int kMaxBlocks = 64;
-// note: we don't want to use atomics for signals because peer atomics are no
-// supported on PCIe links
+constexpr int kMaxBlocks = 36;
+// Counter may overflow, but it's fine since unsigned int overflow is
+// well-defined behavior.
+using FlagType = uint32_t;
 struct Signal {
-  alignas(128) uint32_t start[kMaxBlocks][8];
-  alignas(128) uint32_t end[kMaxBlocks][8];
+  alignas(128) FlagType self_counter[kMaxBlocks][8];
+  // Two sets of peer counters are needed for two syncs. The reason is that
+  // it's possible for peer GPU block to arrive at the second sync point while
+  // the current GPU block haven't passed the first sync point. Thus, peer GPU
+  // may write counter+1 while current GPU is busy waiting for counter. We use
+  // alternating counter array to avoid this possibility.
+  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };

 struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };

-struct __align__(16) RankSignals { volatile Signal* signals[8]; };
+struct __align__(16) RankSignals { Signal* signals[8]; };

 // like std::array, but aligned
 template <typename T, int sz>

@@ -123,47 +130,71 @@ DINLINE O downcast(array_t<float, O::size> val) {
   }
 }

-// This function is meant to be used as the first synchronization in the all
-// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
-// prior memory accesses. Note: volatile writes will not be reordered against
-// other volatile writes.
-template <int ngpus>
-DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
-                        int rank) {
-  if (threadIdx.x < ngpus) {
-    // reset flag for next time
-    self_sg->end[blockIdx.x][threadIdx.x] = 0;
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
-    // wait until we got true from all ranks
-    while (!self_sg->start[blockIdx.x][threadIdx.x]);
-  }
-  __syncthreads();
-}
-
-// This function is meant to be used as the second or the final synchronization
-// barrier in the all reduce kernel. If it's the final synchronization barrier,
-// we don't need to make any visibility guarantees for prior memory accesses.
-template <int ngpus, bool final_sync = false>
-DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
-                      int rank) {
-  __syncthreads();
-  // eliminate the case that prior writes are not visible after signals become
-  // visible. Note that I did not managed to make this happen through a lot of
-  // testing. Might be the case that hardware provides stronger guarantee than
-  // the memory model.
-  if constexpr (!final_sync) __threadfence_system();
-  if (threadIdx.x < ngpus) {
-    // reset flag for next time
-    self_sg->start[blockIdx.x][threadIdx.x] = 0;
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
-    // wait until we got true from all ranks
-    while (!self_sg->end[blockIdx.x][threadIdx.x]);
-  }
-  if constexpr (!final_sync) __syncthreads();
-}
+static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#else
+  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#endif
+}
+
+static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
+  FlagType flag;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#else
+  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#endif
+  return flag;
+}
+
+static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
+  asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+}
+
+static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
+  FlagType flag;
+  asm volatile("ld.volatile.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+  return flag;
+}
+
+// is_start: whether this is the very first synchronization barrier.
+// need_fence: whether a memory fence is needed. If true, a release-acquire
+// semantic is used to enforce memory access order before and after this
+// barrier.
+template <int ngpus, bool is_start, bool need_fence = false>
+DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
+                               int rank) {
+  if constexpr (!is_start) __syncthreads();
+  static_assert(
+      !(is_start && need_fence));  // Start barrier shouldn't need fence.
+  if (threadIdx.x < ngpus) {
+    // Increment the counter. Technically we only need one counter, but we use
+    // multiple per block to eliminate the need to share the counter via smem.
+    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
+    // Write the expected counter value to peer and wait for correct value from
+    // peer.
+    auto peer_counter_ptr =
+        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
+    auto self_counter_ptr =
+        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
+    if constexpr (need_fence) {
+      st_flag_release(peer_counter_ptr, val);
+      while (ld_flag_acquire(self_counter_ptr) != val);
+    } else {
+      st_flag_volatile(peer_counter_ptr, val);
+      while (ld_flag_volatile(self_counter_ptr) != val);
+    }
+  }
+  if constexpr (is_start || need_fence) __syncthreads();
+}

 template <typename P, int ngpus, typename A>

@@ -178,33 +209,31 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) {

 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
-    cross_device_reduce_1stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+    cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
   using P = typename packed_t<T>::P;
   using A = typename packed_t<T>::A;
   // note: we don't reorder the address so the accumulation order is the same
   // for all ranks, ensuring bitwise identical results
   auto dp = *_dp;
-  start_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
   // do the actual reduction
   for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
        idx += gridDim.x * blockDim.x) {
     ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
   }
-  end_sync<ngpus, true>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
 }

 template <typename P>
-DINLINE P* get_tmp_buf(volatile Signal* sg) {
+DINLINE P* get_tmp_buf(Signal* sg) {
   return (P*)(((Signal*)sg) + 1);
 }

 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
-    cross_device_reduce_2stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+    cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = gridDim.x * blockDim.x;
   using P = typename packed_t<T>::P;

@@ -222,12 +251,12 @@ __global__ void __launch_bounds__(512, 1)
     tmps[i] = get_tmp_buf<P>(sg.signals[target]);
   }
   auto tmp_out = tmps[0];
-  start_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
   // stage 1: reduce scatter
   for (int idx = start + tid; idx < end; idx += stride) {
     tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
   }
-  end_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);

   // stage 2: allgather. Note: it's important to match the tid between
   // the two stages, because visibility across devices is only guaranteed

@@ -437,6 +466,8 @@ class CustomAllreduce {
 #define KL(ngpus, name) \
   name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                  rank_, size);
+  // TODO(hanzhi713): Threshold is different for A100 and H100.
+  // Add per device threshold.
 #define REDUCE_CASE(ngpus) \
   case ngpus: { \
     if (world_size_ == 2) { \
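The comment about the alternating peer_counter arrays is easiest to see in a CPU analogue. The sketch below (my illustration in plain C++ with std::atomic, not the CUDA code) mimics the protocol: each participant bumps a private counter, publishes it into slot counter % 2 of every peer, and spins on its own slots; because consecutive barriers use different slots, a fast peer that has already moved on to the next barrier can never overwrite the value a slow peer is still waiting for.

```cpp
// CPU analogue of multi_gpu_barrier's alternating-counter scheme (sketch only).
#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

constexpr int kRanks = 4;
// flags[phase][owner][peer]: value that `peer` has published into `owner`'s signal slot.
std::atomic<uint32_t> flags[2][kRanks][kRanks];

void barrier(int rank, uint32_t& counter) {
  const uint32_t val = ++counter;
  const int phase = val % 2;  // alternate between the two flag sets
  for (int peer = 0; peer < kRanks; ++peer)
    flags[phase][peer][rank].store(val, std::memory_order_release);  // publish to peers
  for (int peer = 0; peer < kRanks; ++peer)
    while (flags[phase][rank][peer].load(std::memory_order_acquire) != val) {}  // wait
}

int main() {
  std::vector<std::thread> threads;
  for (int r = 0; r < kRanks; ++r)
    threads.emplace_back([r] {
      uint32_t counter = 0;
      for (int i = 0; i < 10000; ++i) barrier(r, counter);  // back-to-back barriers
    });
  for (auto& t : threads) t.join();
  return 0;
}
```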
@@ -1,15 +1,15 @@
 /**
  * This is a standalone test for custom allreduce.
  * To compile, make sure you have MPI and NCCL installed in your system.
- * export MPI_HOME=XXX
+ * export MPI_HOME=xxx
  * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
- * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
+ * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
  *
  * Warning: this C++ test is not designed to be very readable and was used
  * during the rapid prototyping process.
  *
  * To run:
- * mpirun -np 8 ./custom_all_reduce_test
+ * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test
  */
 #include <cuda.h>
 #include <curand_kernel.h>

@@ -44,7 +44,14 @@
 } while (0)

 __global__ void dummy_kernel() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
   for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
+#else
+  for (int i = 0; i < 100; i++) {
+    long long int start = clock64();
+    while (clock64() - start < 150000000);  // approximately 98.4ms on P40
+  }
+#endif
 }

 template <typename T>

@@ -302,15 +309,19 @@ int main(int argc, char** argv) {

   bool performance_test = true;
   cudaProfilerStart();
-  // for (int threads : {256, 512}) {
+  // Uncomment to scan through different block size configs.
+  // for (int threads : {256, 512, 1024}) {
   //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
-  //     run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
+  //     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
+  //     performance_test);
   //   }
   // }
+  // Scan through different sizes to test performance.
   for (int sz = 512; sz <= (8 << 20); sz *= 2) {
     run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
   }

   cudaProfilerStop();
+  MPICHECK(MPI_Finalize());
   return EXIT_SUCCESS;
 }
@@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
                     name, ".stride(", idx, ") to be ", StrideEle::value);
         return StrideEle{};
       } else {
-        return tensor.stride(idx);
+        if (tensor.size(idx) == 1) {
+          // use 0 stride for dim with size 1, this is easier for
+          // cute/cutlass to optimize (helps the TMA code flatten dims)
+          return StrideEle{0};
+        } else {
+          return tensor.stride(idx);
+        }
       }
     } else {
       // Extra strides are assumed to be 0 or 1
@ -39,8 +39,6 @@
|
|||||||
|
|
||||||
template<typename input_t, typename weight_t>
|
template<typename input_t, typename weight_t>
|
||||||
void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream);
|
void causal_conv1d_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream);
|
||||||
template <typename input_t, typename weight_t>
|
|
||||||
void causal_conv1d_channellast_fwd_cuda(ConvParamsBase ¶ms, cudaStream_t stream);
|
|
||||||
|
|
||||||
template<typename input_t, typename weight_t>
|
template<typename input_t, typename weight_t>
|
||||||
void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream);
|
void causal_conv1d_update_cuda(ConvParamsBase ¶ms, cudaStream_t stream);
|
||||||
@ -55,8 +53,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
|
|||||||
const at::Tensor x,
|
const at::Tensor x,
|
||||||
const at::Tensor weight,
|
const at::Tensor weight,
|
||||||
const at::Tensor out,
|
const at::Tensor out,
|
||||||
void* bias_ptr,
|
const c10::optional<at::Tensor>& bias,
|
||||||
bool silu_activation) {
|
bool silu_activation,
|
||||||
|
const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
|
||||||
|
const c10::optional<at::Tensor>& cache_indices = std::nullopt,
|
||||||
|
const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
|
||||||
|
|
||||||
// Reset the parameters
|
// Reset the parameters
|
||||||
memset(¶ms, 0, sizeof(params));
|
memset(¶ms, 0, sizeof(params));
|
||||||
@ -71,26 +72,31 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
|
|||||||
// Set the pointers and strides.
|
// Set the pointers and strides.
|
||||||
params.x_ptr = x.data_ptr();
|
params.x_ptr = x.data_ptr();
|
||||||
params.weight_ptr = weight.data_ptr();
|
params.weight_ptr = weight.data_ptr();
|
||||||
params.bias_ptr = bias_ptr;
|
params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr;
|
||||||
params.out_ptr = out.data_ptr();
|
params.out_ptr = out.data_ptr();
|
||||||
// All stride are in elements, not bytes.
|
// All stride are in elements, not bytes.
|
||||||
params.x_batch_stride = x.stride(0);
|
params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
|
||||||
params.x_c_stride = x.stride(1);
|
params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
|
||||||
params.x_l_stride = x.stride(-1);
|
params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
|
||||||
|
const bool varlen = params.query_start_loc_ptr != nullptr;
|
||||||
|
params.x_batch_stride = x.stride(varlen ? 1 : 0);
|
||||||
|
params.x_c_stride = x.stride(varlen ? 0 : 1);
|
||||||
|
params.x_l_stride = x.stride(varlen ? 1 : -1);
|
||||||
params.weight_c_stride = weight.stride(0);
|
params.weight_c_stride = weight.stride(0);
|
||||||
params.weight_width_stride = weight.stride(1);
|
params.weight_width_stride = weight.stride(1);
|
||||||
params.out_batch_stride = out.stride(0);
|
params.out_batch_stride = out.stride(varlen ? 1 : 0);
|
||||||
params.out_c_stride = out.stride(1);
|
params.out_c_stride = out.stride(varlen ? 0 : 1);
|
||||||
params.out_l_stride = out.stride(-1);
|
params.out_l_stride = out.stride(varlen ? 1 : -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
at::Tensor
|
at::Tensor
|
||||||
causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||||
const c10::optional<at::Tensor> &bias_,
|
const c10::optional<at::Tensor> &bias_,
|
||||||
const c10::optional<at::Tensor> &seq_idx_,
|
const c10::optional<at::Tensor> &conv_states,
|
||||||
const c10::optional<at::Tensor> &initial_states_,
|
const c10::optional<at::Tensor> &query_start_loc,
|
||||||
const c10::optional<at::Tensor> &final_states_out_,
|
const c10::optional<at::Tensor> &cache_indices,
|
||||||
|
const c10::optional<at::Tensor> &has_initial_state,
|
||||||
bool silu_activation) {
|
bool silu_activation) {
|
||||||
auto input_type = x.scalar_type();
|
auto input_type = x.scalar_type();
|
||||||
auto weight_type = weight.scalar_type();
|
auto weight_type = weight.scalar_type();
|
||||||
@ -99,24 +105,22 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
|||||||
|
|
||||||
TORCH_CHECK(x.is_cuda());
|
TORCH_CHECK(x.is_cuda());
|
||||||
TORCH_CHECK(weight.is_cuda());
|
TORCH_CHECK(weight.is_cuda());
|
||||||
|
|
||||||
|
const bool varlen = query_start_loc.has_value() ? true : false;
|
||||||
const auto sizes = x.sizes();
|
const auto sizes = x.sizes();
|
||||||
const int batch_size = sizes[0];
|
const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
|
||||||
const int dim = sizes[1];
|
const int dim = varlen ? sizes[0] : sizes[1];
|
||||||
const int seqlen = sizes[2];
|
const int seqlen = varlen ? sizes[1] : sizes[2];
|
||||||
const int width = weight.size(-1);
|
const int width = weight.size(-1);
|
||||||
|
if (varlen){
|
||||||
CHECK_SHAPE(x, batch_size, dim, seqlen);
|
CHECK_SHAPE(x, dim, seqlen);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
CHECK_SHAPE(x, batch_size, dim, seqlen);
|
||||||
|
}
|
||||||
CHECK_SHAPE(weight, dim, width);
|
CHECK_SHAPE(weight, dim, width);
|
||||||
|
|
||||||
TORCH_CHECK(x.stride(2) == 1 || x.stride(1) == 1);
|
|
||||||
const bool is_channel_last = x.stride(1) == 1 && x.stride(2) > 1;
|
|
||||||
|
|
||||||
if (is_channel_last) {
|
|
||||||
TORCH_CHECK(dim % 8 == 0, "causal_conv1d only supports channel dimension divisible by 8 for now");
|
|
||||||
TORCH_CHECK(x.stride(2) % 8 == 0 and x.stride(0) % 8 == 0, "causal_conv1d with channel last layout requires strides (x.stride(0) and x.stride(2)) to be multiples of 8");
|
|
||||||
}
|
|
||||||
TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
|
|
||||||
|
|
||||||
if (bias_.has_value()) {
|
if (bias_.has_value()) {
|
||||||
auto bias = bias_.value();
|
auto bias = bias_.value();
|
||||||
@ -126,56 +130,50 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
|||||||
CHECK_SHAPE(bias, dim);
|
CHECK_SHAPE(bias, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (seq_idx_.has_value()) {
|
|
||||||
TORCH_CHECK(is_channel_last, "seq_idx is only supported for channel last layout");
|
if (has_initial_state.has_value()) {
|
||||||
auto seq_idx = seq_idx_.value();
|
auto has_initial_state_ = has_initial_state.value();
|
||||||
TORCH_CHECK(seq_idx.scalar_type() == torch::kInt32);
|
TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
|
||||||
-        TORCH_CHECK(seq_idx.is_cuda());
-        TORCH_CHECK(seq_idx.is_contiguous());
-        CHECK_SHAPE(seq_idx, batch_size, seqlen);
+        TORCH_CHECK(has_initial_state_.is_cuda());
+        CHECK_SHAPE(has_initial_state_, batch_size);
+    }

+    if (query_start_loc.has_value()) {
+        auto query_start_loc_ = query_start_loc.value();
+        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(query_start_loc_.is_cuda());
+    }

+    if (cache_indices.has_value()) {
+        auto cache_indices_ = cache_indices.value();
+        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(cache_indices_.is_cuda());
+        CHECK_SHAPE(cache_indices_, batch_size);
     }

     at::Tensor out = torch::empty_like(x);

     ConvParamsBase params;
     set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
-                        bias_.has_value() ? bias_.value().data_ptr() : nullptr,
-                        silu_activation);
+                        bias_,
+                        silu_activation,
+                        query_start_loc,
+                        cache_indices,
+                        has_initial_state
+                        );

-    if (seq_idx_.has_value()) {
-        params.seq_idx_ptr = seq_idx_.value().data_ptr();
+    if (conv_states.has_value()) {
+        auto conv_states_ = conv_states.value();
+        TORCH_CHECK(conv_states_.scalar_type() == input_type);
+        TORCH_CHECK(conv_states_.is_cuda());
+        params.conv_states_ptr = conv_states_.data_ptr();
+        params.conv_states_batch_stride = conv_states_.stride(0);
+        params.conv_states_c_stride = conv_states_.stride(1);
+        params.conv_states_l_stride = conv_states_.stride(2);
     } else {
-        params.seq_idx_ptr = nullptr;
-    }
-
-    if (initial_states_.has_value()) {
-        TORCH_CHECK(is_channel_last, "initial_states is only supported for channel last layout");
-        auto initial_states = initial_states_.value();
-        TORCH_CHECK(initial_states.scalar_type() == input_type);
-        TORCH_CHECK(initial_states.is_cuda());
-        CHECK_SHAPE(initial_states, batch_size, dim, width - 1);
-        TORCH_CHECK(initial_states.stride(1) == 1);
-        params.initial_states_ptr = initial_states.data_ptr();
-        params.initial_states_batch_stride = initial_states.stride(0);
-        params.initial_states_c_stride = initial_states.stride(1);
-        params.initial_states_l_stride = initial_states.stride(2);
-    } else {
-        params.initial_states_ptr = nullptr;
-    }
-
-    if (final_states_out_.has_value()) {
-        TORCH_CHECK(is_channel_last, "final_states is only supported for channel last layout");
-        auto final_states = final_states_out_.value();
-        TORCH_CHECK(final_states.scalar_type() == input_type);
-        TORCH_CHECK(final_states.is_cuda());
-        CHECK_SHAPE(final_states, batch_size, dim, width - 1);
-        TORCH_CHECK(final_states.stride(1) == 1);
-        params.final_states_ptr = final_states.data_ptr();
-        params.final_states_batch_stride = final_states.stride(0);
-        params.final_states_c_stride = final_states.stride(1);
-        params.final_states_l_stride = final_states.stride(2);
-    } else {
-        params.final_states_ptr = nullptr;
+        params.conv_states_ptr = nullptr;
     }

     // Otherwise the kernel will be launched from cuda:0 device
@@ -183,11 +181,7 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
     at::cuda::CUDAGuard device_guard{(char)x.get_device()};
     auto stream = at::cuda::getCurrentCUDAStream().stream();
     DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
-        if (!is_channel_last) {
-            causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
-        } else {
-            causal_conv1d_channellast_fwd_cuda<input_t, weight_t>(params, stream);
-        }
+        causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
     });
     return out;
 }
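The query_start_loc, cache_indices and has_initial_state arguments threaded through above describe a packed variable-length batch: sequences are laid out back to back along the token axis and query_start_loc holds cumulative start offsets, so sequence i occupies [query_start_loc[i], query_start_loc[i + 1]). A minimal host-side sketch of that convention (illustrative values and names, not code from the diff):

    #include <cstdio>
    #include <vector>

    // Sketch only: recover per-sequence start and length the same way the
    // kernels do with query_start_loc[batch_id] / query_start_loc[batch_id + 1].
    int main() {
        // Three sequences of lengths 5, 2 and 7 packed along one token axis.
        std::vector<int> query_start_loc = {0, 5, 7, 14};
        int batch_size = static_cast<int>(query_start_loc.size()) - 1;
        for (int b = 0; b < batch_size; ++b) {
            int start = query_start_loc[b];
            int seqlen = query_start_loc[b + 1] - start;
            std::printf("sequence %d: start=%d seqlen=%d\n", b, start, seqlen);
        }
        return 0;
    }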
@@ -198,7 +192,9 @@ causal_conv1d_update(const at::Tensor &x,
                      const at::Tensor &conv_state,
                      const at::Tensor &weight,
                      const c10::optional<at::Tensor> &bias_,
-                     bool silu_activation) {
+                     bool silu_activation,
+                     const c10::optional<at::Tensor> &cache_seqlens_,
+                     const c10::optional<at::Tensor> &conv_state_indices_) {
     auto input_type = x.scalar_type();
     auto weight_type = weight.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -213,10 +209,12 @@ causal_conv1d_update(const at::Tensor &x,
     const auto sizes = x.sizes();
     const int batch_size = sizes[0];
     const int dim = sizes[1];
+    const int seqlen = sizes[2];
     const int width = weight.size(-1);
+    const int conv_state_len = conv_state.size(2);
+    TORCH_CHECK(conv_state_len >= width - 1);

-    CHECK_SHAPE(x, batch_size, dim);
-    CHECK_SHAPE(conv_state, batch_size, dim, width);
+    CHECK_SHAPE(x, batch_size, dim, seqlen);
     CHECK_SHAPE(weight, dim, width);

     TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
@@ -232,15 +230,43 @@ causal_conv1d_update(const at::Tensor &x,
     at::Tensor out = torch::empty_like(x);

     ConvParamsBase params;
-    set_conv_params_fwd(params, batch_size, dim, /*seqlen=*/1, width, x, weight, out,
-                        bias_.has_value() ? bias_.value().data_ptr() : nullptr,
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                        bias_,
                         silu_activation);
     params.conv_state_ptr = conv_state.data_ptr();
+    params.conv_state_len = conv_state_len;
     // All stride are in elements, not bytes.
     params.conv_state_batch_stride = conv_state.stride(0);
     params.conv_state_c_stride = conv_state.stride(1);
     params.conv_state_l_stride = conv_state.stride(2);

+    if (cache_seqlens_.has_value()) {
+        auto cache_seqlens = cache_seqlens_.value();
+        TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
+        TORCH_CHECK(cache_seqlens.is_cuda());
+        TORCH_CHECK(cache_seqlens.stride(-1) == 1);
+        CHECK_SHAPE(cache_seqlens, batch_size);
+        params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
+    } else {
+        params.cache_seqlens = nullptr;
+    }
+
+    if (conv_state_indices_.has_value()) {
+        auto conv_state_indices = conv_state_indices_.value();
+        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32)
+        TORCH_CHECK(conv_state_indices.is_cuda());
+        TORCH_CHECK(conv_state_indices.stride(0) == 1)
+        CHECK_SHAPE(conv_state_indices, batch_size);
+
+        int conv_state_entries = conv_state.size(0);
+        CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);
+
+        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
+    } else {
+        CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
+        params.conv_state_indices_ptr = nullptr;
+    }
+
     // Otherwise the kernel will be launched from cuda:0 device
     // Cast to char to avoid compiler warning about narrowing
     at::cuda::CUDAGuard device_guard{(char)x.get_device()};
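When conv_state_indices_ is supplied, conv_state is treated as a pool of cache slots rather than a batch-shaped tensor: batch row b reads and writes slot conv_state_indices[b], which is what lets a running request keep its state in place under continuous batching. A small host-side illustration of that indirection (hypothetical values, not code from the diff):

    #include <cstdio>
    #include <vector>

    // Sketch only: with conv_state_indices, batch row b touches cache slot
    // conv_state_indices[b] instead of slot b, so a request's state slot stays
    // fixed even as its position in the batch changes between steps.
    int main() {
        int conv_state_entries = 8;                           // total cache slots allocated
        std::vector<int> conv_state(conv_state_entries, 0);   // one counter per slot, for illustration
        std::vector<int> conv_state_indices = {6, 1, 3};      // a batch of 3 requests
        for (int b = 0; b < static_cast<int>(conv_state_indices.size()); ++b) {
            int slot = conv_state_indices[b];
            conv_state[slot] += 1;                            // "update" this request's state in place
            std::printf("batch %d -> cache slot %d\n", b, slot);
        }
        return 0;
    }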
@@ -280,7 +306,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     constexpr int kWidth = Ktraits::kWidth;
     constexpr int kNThreads = Ktraits::kNThreads;
     constexpr int kNElts = Ktraits::kNElts;
-    static constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
+    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
     using input_t = typename Ktraits::input_t;
     using vec_t = typename Ktraits::vec_t;
     using weight_t = typename Ktraits::weight_t;
@@ -293,20 +319,39 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
     vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);

+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
     const int tidx = threadIdx.x;
     const int batch_id = blockIdx.x;
     const int channel_id = blockIdx.y;
-    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
+    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
+    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
         + channel_id * params.x_c_stride;
     weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
-    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
         + channel_id * params.out_c_stride;
     float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);

+    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+
+    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+
+    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
+        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;

     // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
     if (tidx == 0) {
-        input_t zeros[kNElts] = {0};
-        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(zeros)[0];
+        input_t initial_state[kNElts] = {0};
+        if (has_initial_state) {
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
+        }
+        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
     }

     float weight_vals[kWidth];
@@ -314,14 +359,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }

     constexpr int kChunkSize = kNThreads * kNElts;
-    const int n_chunks = (params.seqlen + kChunkSize - 1) / kChunkSize;
+    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
     for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t x_vals_load[2 * kNElts] = {0};
         if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
         } else {
             __syncthreads();
-            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
         }
         x += kChunkSize;
         __syncthreads();
@@ -359,19 +404,57 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
         #pragma unroll
         for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
         if constexpr(kIsVecLoad) {
-            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (params.seqlen - chunk * kChunkSize) / kNElts);
+            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
         } else {
-            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, params.seqlen - chunk * kChunkSize);
+            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
         }
         out += kChunkSize;
     }
+    // Final state is stored in the smem_exchange last token slot,
+    // in case seqlen < kWidth, we would need to take the final state from the
+    // initial state which is stored in conv_states
+    // in case seqlen > kWidth, we would need to load the last kWidth - 1 data
+    // and load it into conv_state accordingly
+    int last_thread = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
+    if (conv_states != nullptr && tidx == last_thread) {
+        input_t x_vals_load[kNElts * 2] = {0};
+        // in case we are on the first kWidth tokens
+        if (last_thread == 0 && seqlen < kWidth){
+            // Need to take the initial state
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
+            const int offset = seqlen - (kWidth - 1);
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                // pad the existing state
+                if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
+                else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
+            }
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                if (offset + w >= 0)
+                    conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+        else {
+            // in case the final state is in between the threads data
+            reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
+            const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+
+    }
 }

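The kernel above computes, per (sequence, channel) pair, a width-kWidth causal convolution over the packed tokens, seeds the first kWidth - 1 taps from conv_states when has_initial_state is set, applies SiLU when silu_activation is on, and writes the last kWidth - 1 inputs back as the next state. A plain scalar reference of that contract (a sketch under those semantics, not the CUDA code; names are illustrative):

    #include <cmath>
    #include <vector>

    // Reference for one channel: y[t] = silu(bias + sum_w weight[w] * x[t - (W-1) + w]),
    // where positions before t = 0 come from the cached initial state (or zero).
    std::vector<float> causal_conv1d_ref(const std::vector<float>& x,
                                         const std::vector<float>& weight,  // size W
                                         const std::vector<float>& init,    // size W-1 cached inputs, may be empty
                                         float bias, bool silu) {
        const int W = static_cast<int>(weight.size());
        std::vector<float> y(x.size());
        for (int t = 0; t < static_cast<int>(x.size()); ++t) {
            float acc = bias;
            for (int w = 0; w < W; ++w) {
                int idx = t - (W - 1) + w;
                float xv = idx >= 0 ? x[idx]
                                    : (init.empty() ? 0.f : init[idx + (W - 1)]);
                acc += weight[w] * xv;
            }
            y[t] = silu ? acc / (1.f + std::exp(-acc)) : acc;
        }
        return y;
    }

The last W - 1 elements of x (zero-padded on the left if the sequence is shorter than W - 1) are what the kernel stores back into conv_states for the next call.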
 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
     static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
-    BOOL_SWITCH(params.seqlen % kNElts == 0, kIsVecLoad, [&] {
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
         using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
         constexpr int kSmemSize = Ktraits::kSmemSize;
         dim3 grid(params.batch, params.dim);
@@ -406,220 +489,11 @@ void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
     }
 }

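The launch path above only enables vectorized loads when params.seqlen is a multiple of kNElts and the batch is not variable length, and it turns that runtime condition into a compile-time constant for the kernel. A generic sketch of that dispatch pattern (illustrative only; the repository's BOOL_SWITCH macro serves the same purpose and is not reproduced here):

    #include <type_traits>
    #include <utility>

    // Call f with a std::integral_constant<bool, ...> so the flag is usable in
    // if constexpr inside the callee.
    template <typename F>
    void bool_switch(bool cond, F&& f) {
        if (cond) {
            std::forward<F>(f)(std::integral_constant<bool, true>{});
        } else {
            std::forward<F>(f)(std::integral_constant<bool, false>{});
        }
    }

    // Usage sketch: only take the vectorized path when the sequence length is a
    // multiple of the vector width and the batch is not variable-length.
    void launch_example(int seqlen, bool varlen) {
        constexpr int kNElts = 8;
        bool_switch(seqlen % kNElts == 0 && !varlen, [&](auto kIsVecLoad) {
            // kIsVecLoad is a compile-time constant here, e.g.
            // if constexpr (kIsVecLoad) { /* vector loads */ } else { /* guarded loads */ }
            (void)kIsVecLoad;
        });
    }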
-template<int kNThreads_, int kWidth_, int kChunkSizeL_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
-struct Causal_conv1d_channellast_fwd_kernel_traits {
-    // The cache line is 128 bytes, and we try to read 16 bytes per thread.
-    // So we have 8 threads per "row", so 32 or 64 elements in the channel dimension.
-    // That leaves 4 columns per warp, and so 16 columns per block (assuming each block has 128
-    // threads). Each each load is 16 x 32|64 elements in the L x C dimensions.
-    using input_t = input_t_;
-    using weight_t = weight_t_;
-    static constexpr int kNThreads = kNThreads_;
-    static_assert(kNThreads % 32 == 0);
-    static constexpr int kNWarps = kNThreads / 32;
-    static constexpr int kWidth = kWidth_;
-    static constexpr int kChunkSizeL = kChunkSizeL_;
-    static constexpr int kNBytes = sizeof(input_t);
-    static_assert(kNBytes == 2 || kNBytes == 4);
-    static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
-    static constexpr int kNEltsPerRow = 128 / kNBytes;
-    static constexpr int kNThreadsPerRow = kNEltsPerRow / kNElts;  // Always 8 for now
-    static_assert(kNThreadsPerRow * kNBytes * kNElts == 128);
-    static constexpr int kNColsPerWarp = 32 / kNThreadsPerRow;  // Always 4 for now
-    static_assert(kNColsPerWarp * kNThreadsPerRow == 32);
-    static constexpr int kNColsPerLoad = kNColsPerWarp * kNWarps;
-    static constexpr int kNLoads = kChunkSizeL / kNColsPerLoad;
-    static_assert(kNLoads * kNColsPerLoad == kChunkSizeL);
-    static constexpr bool kIsVecLoad = kIsVecLoad_;
-    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
-    // using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    // using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNItems, cub::BLOCK_STORE_WARP_TRANSPOSE>;
-    // static constexpr int kSmemSize = std::max({sizeof(typename BlockLoadT::TempStorage),
-    //                                            sizeof(typename BlockStoreT::TempStorage)});
-    // static constexpr int kSmemSize = kChunkSizeL * kNEltsPerRow * kNBytes;
-};
-
-template<typename Ktraits, bool kHasSeqIdx>
-__global__ __launch_bounds__(Ktraits::kNThreads)
-void causal_conv1d_channellast_fwd_kernel(ConvParamsBase params) {
-    constexpr int kWidth = Ktraits::kWidth;
-    constexpr int kNThreads = Ktraits::kNThreads;
-    constexpr int kNElts = Ktraits::kNElts;
-    constexpr int kNThreadsPerC = Ktraits::kNThreadsPerRow;
-    constexpr int kLPerLoad = Ktraits::kNColsPerLoad;
-    constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
-    constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
-    using input_t = typename Ktraits::input_t;
-    using vec_t = typename Ktraits::vec_t;
-    using weight_t = typename Ktraits::weight_t;
-
-    // Shared memory.
-    __shared__ input_t x_smem[kWidth - 1 + kChunkSizeL][kChunkSizeC + kNElts];
-
-    const int batch_id = blockIdx.x;
-    const int chunk_l_id = blockIdx.y;
-    const int chunk_c_id = blockIdx.z;
-    const int tid = threadIdx.x;
-    const int l_idx = tid / kNThreadsPerC;
-    const int c_idx = tid % kNThreadsPerC;
-    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
-        + (chunk_l_id * kChunkSizeL + l_idx) * params.x_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr)
-        + chunk_c_id * kChunkSizeC * params.weight_c_stride;
-    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
-        + (chunk_l_id * kChunkSizeL + l_idx) * params.out_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    int *seq_idx = !kHasSeqIdx ? nullptr : reinterpret_cast<int *>(params.seq_idx_ptr)
-        + batch_id * params.seqlen + chunk_l_id * kChunkSizeL;
-    input_t *initial_states = params.initial_states_ptr == nullptr || chunk_l_id > 0 ? nullptr
-        : reinterpret_cast<input_t *>(params.initial_states_ptr) + batch_id * params.initial_states_batch_stride + l_idx * params.initial_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-    // The last L-chunk will also have enough info to write to final states, since it also contain a few x values
-    // from the previous L-chunk.
-    input_t *final_states = params.final_states_ptr == nullptr || chunk_l_id < gridDim.y - 1 ? nullptr
-        : reinterpret_cast<input_t *>(params.final_states_ptr) + batch_id * params.final_states_batch_stride + l_idx * params.final_states_l_stride + chunk_c_id * kChunkSizeC + c_idx * kNElts;
-
-    #pragma unroll
-    for (int l = 0; l < Ktraits::kNLoads; ++l) {
-        input_t x_vals_load[kNElts] = {0};
-        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x + l * kLPerLoad * params.x_l_stride);
-        }
-        reinterpret_cast<vec_t *>(x_smem[kWidth - 1 + l * kLPerLoad + l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
-    }
-    // Load the elements from the previous chunk that are needed for convolution.
-    if (l_idx < kWidth - 1) {
-        input_t x_vals_load[kNElts] = {0};
-        if (chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) >= 0
-            && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(x - (kWidth - 1) * params.x_l_stride);
-        } else if (initial_states != nullptr
-                   && chunk_l_id * kChunkSizeL + l_idx - (kWidth - 1) < 0
-                   && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            reinterpret_cast<vec_t *>(x_vals_load)[0] = *reinterpret_cast<vec_t *>(initial_states);
-        }
-        reinterpret_cast<vec_t *>(x_smem[l_idx])[c_idx] = reinterpret_cast<vec_t *>(x_vals_load)[0];
-    }
-
-    __syncthreads();
-
-    if (final_states != nullptr
-        && l_idx < kWidth - 1
-        && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-        // x_smem[0] contains element at index chunk_l_id * kChunkSizeL - (kWidth - 1)
-        // So last few elements (index params.seqlen - kWidth + 1 + l_idx) are stored in x_smem[params.seqlen - kWidth + 1 + l_idx - (chunk_l_id * kChunkSizeL - kWidth + 1)][c_idx]
-        *reinterpret_cast<vec_t *>(final_states) = reinterpret_cast<vec_t *>(x_smem[params.seqlen + l_idx - chunk_l_id * kChunkSizeL])[c_idx];
-    }
-
-    constexpr int kLPerThread = constexpr_min(kChunkSizeL * kChunkSizeC / kNThreads, kChunkSizeL);
-    static_assert(kLPerThread * kNThreads == kChunkSizeL * kChunkSizeC);
-    constexpr int kNThreadsPerRow = kChunkSizeL / kLPerThread;
-    static_assert(kNThreadsPerRow * kLPerThread == kChunkSizeL);
-    // kChunkSizeL, kLPerThread, kNThreadsPerRow should be powers of 2 for simplicity
-    static_assert((kChunkSizeL & (kChunkSizeL - 1)) == 0);
-    static_assert((kLPerThread & (kLPerThread - 1)) == 0);
-    static_assert((kNThreadsPerRow & (kNThreadsPerRow - 1)) == 0);
-    static_assert(kNThreadsPerRow <= 32);
-
-    const int row_idx = tid / kNThreadsPerRow;
-    const int col_idx = tid % kNThreadsPerRow;
-
-    float bias_val = params.bias_ptr == nullptr || chunk_c_id * kChunkSizeC + row_idx >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[chunk_c_id * kChunkSizeC + row_idx]);
-    float weight_vals[kWidth] = {0};
-    if (chunk_c_id * kChunkSizeC + row_idx < params.dim) {
-        #pragma unroll
-        for (int w = 0; w < kWidth; ++w) {
-            weight_vals[w] = weight[row_idx * params.weight_c_stride + w * params.weight_width_stride];
-        }
-    }
-    float x_vals[kWidth - 1 + kLPerThread];
-    #pragma unroll
-    for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
-        x_vals[i] = float(x_smem[col_idx * kLPerThread + i][row_idx]);
-    }
-    int seq_idx_thread[kWidth - 1 + kLPerThread];
-    if constexpr (kHasSeqIdx) {
-        #pragma unroll
-        for (int i = 0; i < kWidth - 1 + kLPerThread; ++i) {
-            seq_idx_thread[i] = chunk_l_id * kChunkSizeL + col_idx * kLPerThread + i - (kWidth - 1) >= 0 ? seq_idx[col_idx * kLPerThread + i - (kWidth - 1)] : -1;
-        }
-    }
-
-    float out_vals[kLPerThread];
-    #pragma unroll
-    for (int i = 0; i < kLPerThread; ++i) {
-        out_vals[i] = bias_val;
-        const int seq_idx_cur = !kHasSeqIdx ? 0 : seq_idx_thread[i + kWidth - 1];
-        #pragma unroll
-        for (int w = 0; w < kWidth; ++w) {
-            if constexpr (!kHasSeqIdx) {
-                out_vals[i] += weight_vals[w] * x_vals[i + w];
-            } else {
-                out_vals[i] += seq_idx_thread[i + w] == seq_idx_cur ? weight_vals[w] * x_vals[i + w] : 0.f;
-            }
-        }
-        if (params.silu_activation) {out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i])); }
-    }
-
-    __syncthreads();
-    #pragma unroll
-    for (int i = 0; i < kLPerThread; ++i) { x_smem[col_idx * kLPerThread + i][row_idx] = out_vals[i]; }
-    __syncthreads();
-
-    #pragma unroll
-    for (int l = 0; l < Ktraits::kNLoads; ++l) {
-        input_t out_vals_store[kNElts];
-        reinterpret_cast<vec_t *>(out_vals_store)[0] = reinterpret_cast<vec_t *>(x_smem[l * kLPerLoad + l_idx])[c_idx];
-        if (chunk_l_id * kChunkSizeL + l * kLPerLoad + l_idx < params.seqlen
-            && chunk_c_id * kChunkSizeC + c_idx * kNElts < params.dim) {
-            *reinterpret_cast<vec_t *>(out + l * kLPerLoad * params.out_l_stride) = reinterpret_cast<vec_t *>(out_vals_store)[0];
-        }
-    }
-
-}
-
-template<int kNThreads, int kWidth, typename input_t, typename weight_t>
-void causal_conv1d_channellast_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
-    BOOL_SWITCH(params.seq_idx_ptr != nullptr, kHasSeqIdx, [&] {
-        using Ktraits = Causal_conv1d_channellast_fwd_kernel_traits<kNThreads, kWidth, 64, true, input_t, weight_t>;
-        // constexpr int kSmemSize = Ktraits::kSmemSize;
-        constexpr int kChunkSizeL = Ktraits::kChunkSizeL;
-        constexpr int kChunkSizeC = Ktraits::kNEltsPerRow;
-        const int n_chunks_L = (params.seqlen + kChunkSizeL - 1) / kChunkSizeL;
-        const int n_chunks_C = (params.dim + kChunkSizeC - 1) / kChunkSizeC;
-        dim3 grid(params.batch, n_chunks_L, n_chunks_C);
-        dim3 block(Ktraits::kNThreads);
-        auto kernel = &causal_conv1d_channellast_fwd_kernel<Ktraits, kHasSeqIdx>;
-        // if (kSmemSize >= 48 * 1024) {
-        //     C10_CUDA_CHECK(cudaFuncSetAttribute(
-        //         kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-        // }
-        // kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
-        kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-    });
-}
-
-template<typename input_t, typename weight_t>
-void causal_conv1d_channellast_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
-    if (params.width == 2) {
-        causal_conv1d_channellast_fwd_launch<128, 2, input_t, weight_t>(params, stream);
-    } else if (params.width == 3) {
-        causal_conv1d_channellast_fwd_launch<128, 3, input_t, weight_t>(params, stream);
-    } else if (params.width == 4) {
-        causal_conv1d_channellast_fwd_launch<128, 4, input_t, weight_t>(params, stream);
-    }
-}

 template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
 template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);

-template void causal_conv1d_channellast_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
-template void causal_conv1d_channellast_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
-template void causal_conv1d_channellast_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
-///////

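The block removed above tiled the (length, channel) plane so that each thread issued one 16-byte load inside a 128-byte cache line, exactly as its header comment describes. A small standalone recomputation of that thread-to-tile mapping (illustrative only; constants follow the removed traits):

    #include <cstdio>

    // Recompute the removed kernel's mapping: for fp16/bf16, 8 threads cover one
    // 128-byte row of 64 channels, so l_idx = tid / 8 and c_idx = tid % 8.
    int main() {
        const int kNThreads = 128;
        const int kNBytes = 2;                            // fp16/bf16 element
        const int kNElts = 8;                             // 16-byte vector load
        const int kNEltsPerRow = 128 / kNBytes;           // 64 channels per 128-byte line
        const int kNThreadsPerC = kNEltsPerRow / kNElts;  // 8 threads per row
        for (int tid = 0; tid < kNThreads; tid += 37) {   // sample a few threads
            int l_idx = tid / kNThreadsPerC;              // position along the length axis
            int c_idx = tid % kNThreadsPerC;              // which 8-channel group in the row
            std::printf("tid=%3d -> l_idx=%2d, first channel=%d\n", tid, l_idx, c_idx * kNElts);
        }
        return 0;
    }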
@@ -633,7 +507,7 @@ struct Causal_conv1d_update_kernel_traits {
     static_assert(kNBytes == 2 || kNBytes == 4);
 };

-template<typename Ktraits>
+template<typename Ktraits, bool kIsCircularBuffer>
 __global__ __launch_bounds__(Ktraits::kNThreads)
 void causal_conv1d_update_kernel(ConvParamsBase params) {
     constexpr int kWidth = Ktraits::kWidth;
@@ -644,42 +518,87 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
     const int tidx = threadIdx.x;
     const int batch_id = blockIdx.x;
     const int channel_id = blockIdx.y * kNThreads + tidx;
+    if (channel_id >= params.dim) return;
+
     input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
         + channel_id * params.x_c_stride;
-    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr) + batch_id * params.conv_state_batch_stride
+
+    // If params.conv_state_batch_indices is set, then the conv state is gathered from the conv state tensor
+    // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
+    const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
+        ? batch_id
+        : params.conv_state_indices_ptr[batch_id];
+    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr)
+        + conv_state_batch_coord * params.conv_state_batch_stride
         + channel_id * params.conv_state_c_stride;

     weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
     input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
         + channel_id * params.out_c_stride;
-    float bias_val = params.bias_ptr == nullptr || channel_id >= params.dim ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);

+    int state_len = params.conv_state_len;
+    int advance_len = params.seqlen;
+    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
+    int update_idx = cache_seqlen - (kWidth - 1);
+    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
+
     float weight_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
-        #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
-    }
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }

     float x_vals[kWidth] = {0};
-    if (channel_id < params.dim) {
-        #pragma unroll
-        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = float(conv_state[(i + 1) * params.conv_state_l_stride]); }
-        x_vals[kWidth - 1] = float(x[0]);
-        #pragma unroll
-        for (int i = 0; i < kWidth; ++i) { conv_state[i * params.conv_state_l_stride] = input_t(x_vals[i]); }
+    if constexpr (!kIsCircularBuffer) {
+        #pragma unroll 2
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
+        }
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) {
+            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
+            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
+                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
+            }
+            x_vals[i] = float(state_val);
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
+            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
+            x_vals[i] = float(state_val);
+        }
+    }
+    #pragma unroll 2
+    for (int i = 0; i < params.seqlen; ++i) {
+        input_t x_val = x[i * params.x_l_stride];
+        if constexpr (!kIsCircularBuffer) {
+            if (i < advance_len && state_len - advance_len + i >= 0) {
+                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
+            }
+        } else {
+            conv_state[update_idx * params.conv_state_l_stride] = x_val;
+            ++update_idx;
+            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
+        }
+        x_vals[kWidth - 1] = float(x_val);
+        float out_val = bias_val;
+        #pragma unroll
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
+        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
+        out[i * params.out_l_stride] = input_t(out_val);
+        // Shift the input buffer by 1
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
     }

-    float out_val = bias_val;
-    #pragma unroll
-    for (int i = 0; i < kWidth; ++i) { out_val += weight_vals[i] * x_vals[i]; }
-    if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
-    if (channel_id < params.dim) { out[0] = input_t(out_val); }
 }

 template<int kNThreads, int kWidth, typename input_t, typename weight_t>
 void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
     using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
     dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
-    auto kernel = &causal_conv1d_update_kernel<Ktraits>;
+    auto kernel = params.cache_seqlens == nullptr
+        ? &causal_conv1d_update_kernel<Ktraits, false>
+        : &causal_conv1d_update_kernel<Ktraits, true>;
     kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
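With cache_seqlens present, the update kernel above treats conv_state as a ring buffer of length state_len: reads begin kWidth - 1 slots behind cache_seqlens[batch_id], and each step wraps the index by subtracting state_len instead of taking a modulo. A standalone sketch of that wrap arithmetic (illustrative helper, not code from the diff):

    #include <cassert>

    // Branch-based wrap as used above: equivalent to (idx + 1) % state_len for
    // 0 <= idx < state_len, but without an integer division per step.
    int advance_circular(int idx, int state_len) {
        ++idx;
        return idx >= state_len ? idx - state_len : idx;
    }

    int main() {
        const int state_len = 4;
        int idx = 2;
        // Walk six steps around a length-4 ring: 3, 0, 1, 2, 3, 0.
        const int expected[6] = {3, 0, 1, 2, 3, 0};
        for (int step = 0; step < 6; ++step) {
            idx = advance_circular(idx, state_len);
            assert(idx == expected[step]);
        }
        return 0;
    }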
@@ -24,6 +24,7 @@ struct ConvParamsBase {
     index_t out_c_stride;
     index_t out_l_stride;

+    int conv_state_len;
     index_t conv_state_batch_stride;
     index_t conv_state_c_stride;
     index_t conv_state_l_stride;
@@ -35,6 +36,14 @@ struct ConvParamsBase {
     void *__restrict__ out_ptr;

     void *__restrict__ conv_state_ptr;
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ has_initial_state_ptr;
+    void *__restrict__ cache_indices_ptr;
+    int32_t *__restrict__ cache_seqlens;
+
+    // For the continuous batching case. Makes it so that the mamba state for
+    // the current batch doesn't need to be a contiguous tensor.
+    int32_t *__restrict__ conv_state_indices_ptr;
+
     void *__restrict__ seq_idx_ptr;

@@ -48,6 +57,11 @@ struct ConvParamsBase {
     index_t final_states_batch_stride;
     index_t final_states_l_stride;
     index_t final_states_c_stride;

+    void * conv_states_ptr;
+    index_t conv_states_batch_stride;
+    index_t conv_states_l_stride;
+    index_t conv_states_c_stride;
 };

@@ -54,10 +54,14 @@ struct SSMParamsBase {
     void *__restrict__ delta_ptr;
     void *__restrict__ delta_bias_ptr;
     void *__restrict__ out_ptr;
-    void *__restrict__ x_ptr;
+    void *__restrict__ ssm_states_ptr;
     void *__restrict__ z_ptr;
     void *__restrict__ out_z_ptr;
-    void *__restrict__ index_ptr;
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ cache_indices_ptr;
+    void *__restrict__ has_initial_state_ptr;
+
 };

@@ -201,7 +205,7 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
                                   typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
                                   typename Ktraits::BlockLoadT::TempStorage &smem_load,
                                   int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
@@ -217,21 +221,6 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
     }
 }

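load_input above now takes the vectorized path only for even, non-varlen tiles and otherwise falls back to cub::BlockLoad's guarded overload, which accepts a valid item count and an out-of-bounds fill value. A minimal standalone sketch of that guarded load (an illustrative kernel, not the helper in this file):

    #include <cub/cub.cuh>

    // Guarded block load: out-of-range items are filled with 0, so a partial
    // final tile is safe to process.
    template <int kNThreads, int kNItems>
    __global__ void guarded_load_demo(const float* in, float* out, int valid_items) {
        using BlockLoad = cub::BlockLoad<float, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
        __shared__ typename BlockLoad::TempStorage temp_storage;

        float items[kNItems];
        BlockLoad(temp_storage).Load(in, items, valid_items, 0.f);  // guarded, zero-filled tail

        // Write the blocked-arrangement items back out to make the example observable.
        for (int i = 0; i < kNItems; ++i) {
            int idx = threadIdx.x * kNItems + i;
            if (idx < valid_items) { out[idx] = items[i]; }
        }
    }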
-template<typename Ktraits>
-inline __device__ void load_index(int *u,
-                                  int (&u_vals)[Ktraits::kNItems],
-                                  typename Ktraits::BlockLoadIndexT::TempStorage &smem_load_index,
-                                  int seqlen) {
-    if constexpr (Ktraits::kIsEvenLen) {
-        auto& smem_load_index_vec = reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(smem_load_index);
-        Ktraits::BlockLoadIndexVecT(smem_load_index_vec).Load(
-            reinterpret_cast<uint4*>(u),
-            reinterpret_cast<uint4(&)[Ktraits::kNLoadsIndex]>(u_vals)
-        );
-    } else {
-        Ktraits::BlockLoadIndexT(smem_load_index).Load(u, u_vals, seqlen, 0);
-    }
-}

 template<typename Ktraits>
 inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
@@ -240,7 +229,7 @@ inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
                                    int seqlen) {
     constexpr int kNItems = Ktraits::kNItems;
     typename Ktraits::input_t B_vals_load[kNItems];
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
@@ -263,7 +252,7 @@ inline __device__ void store_output(typename Ktraits::input_t *out,
     typename Ktraits::input_t write_vals[Ktraits::kNItems];
     #pragma unroll
     for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
-    if constexpr (Ktraits::kIsEvenLen) {
+    if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
         auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
         using vec_t = typename Ktraits::vec_t;
         typename Ktraits::BlockStoreVecT(smem_store_vec).Store(

@@ -23,7 +23,7 @@

 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
          bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kUseIndex_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
     static_assert(kNItems_ % 4 == 0);
     using input_t = input_t_;
@@ -38,22 +38,19 @@ struct Selective_Scan_fwd_kernel_traits {
     static constexpr int kNElts = kNBytes == 4 ? 4 : constexpr_min(8, kNItems);
     static_assert(kNItems % kNElts == 0);
     static constexpr int kNLoads = kNItems / kNElts;
-    static constexpr bool kIsEvenLen = kIsEvenLen_;
+    static constexpr bool kIsEvenLen = kVarlen_ ? false : kIsEvenLen_;
     static constexpr bool kIsVariableB = kIsVariableB_;
     static constexpr bool kIsVariableC = kIsVariableC_;
     static constexpr bool kHasZ = kHasZ_;
-    static constexpr bool kUseIndex = kUseIndex_;
+    static constexpr bool kVarlen = kVarlen_;

-    static constexpr bool kDirectIO = kIsEvenLen && kNLoads == 1;
+    static constexpr bool kDirectIO = kVarlen_ ? false : kIsEvenLen && kNLoads == 1;
     static constexpr int kNLoadsIndex = kNItems / 4;
     using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
     using scan_t = float2;
     using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads,
                                          !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
-    using BlockLoadIndexT = cub::BlockLoad<int, kNThreads, kNItems, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
-    using BlockLoadIndexVecT = cub::BlockLoad<uint4, kNThreads, kNLoadsIndex,
-                                              !(kIsEvenLen && kNLoadsIndex == 1) ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
     using BlockLoadWeightT = cub::BlockLoad<input_t, kNThreads, kNItems , cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockLoadWeightVecT = cub::BlockLoad<vec_t, kNThreads, kNLoads ,
                                                !kDirectIO ? cub::BLOCK_LOAD_WARP_TRANSPOSE : cub::BLOCK_LOAD_DIRECT>;
@@ -65,8 +62,6 @@ struct Selective_Scan_fwd_kernel_traits {
     using BlockScanT = cub::BlockScan<scan_t, kNThreads, cub::BLOCK_SCAN_WARP_SCANS>;
     static constexpr int kSmemIOSize = custom_max({sizeof(typename BlockLoadT::TempStorage),
                                                    sizeof(typename BlockLoadVecT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexT::TempStorage),
-                                                   sizeof(typename BlockLoadIndexVecT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightT::TempStorage),
                                                    (int(kIsVariableB) + int(kIsVariableC)) * sizeof(typename BlockLoadWeightVecT::TempStorage),
                                                    sizeof(typename BlockStoreT::TempStorage),
@@ -80,7 +75,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     constexpr bool kIsVariableB = Ktraits::kIsVariableB;
     constexpr bool kIsVariableC = Ktraits::kIsVariableC;
     constexpr bool kHasZ = Ktraits::kHasZ;
-    constexpr bool kUseIndex = Ktraits::kUseIndex;
+    constexpr bool kVarlen = Ktraits::kVarlen;
     constexpr int kNThreads = Ktraits::kNThreads;
     constexpr int kNItems = Ktraits::kNItems;
     constexpr int kNRows = Ktraits::kNRows;
@@ -97,7 +92,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // auto& smem_load = reinterpret_cast<typename BlockLoadT::TempStorage&>(smem_loadstorescan);
     auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
     auto& smem_load_weight = reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage&>(smem_);
-    auto& smem_load_index = reinterpret_cast<typename Ktraits::BlockLoadIndexT::TempStorage&>(smem_);
     auto& smem_load_weight1 = *reinterpret_cast<typename Ktraits::BlockLoadWeightT::TempStorage*>(smem_ + sizeof(typename Ktraits::BlockLoadWeightT::TempStorage));
     auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
     auto& smem_scan = *reinterpret_cast<typename Ktraits::BlockScanT::TempStorage*>(smem_ + Ktraits::kSmemIOSize);
@@ -108,17 +102,29 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     const int batch_id = blockIdx.x;
     const int dim_id = blockIdx.y;
     const int group_id = dim_id / (params.dim_ngroups_ratio);
-    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + batch_id * params.u_batch_stride
+    int seqlen = params.seqlen;
+    int sequence_start_index = batch_id;
+    if constexpr (kVarlen){
+        int *query_start_loc = reinterpret_cast<int *>(params.query_start_loc_ptr);
+        sequence_start_index = query_start_loc[batch_id];
+        seqlen = query_start_loc[batch_id + 1] - sequence_start_index;
+    }
+    const bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+
+    const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    input_t *u = reinterpret_cast<input_t *>(params.u_ptr) + sequence_start_index * params.u_batch_stride
         + dim_id * kNRows * params.u_d_stride;
-    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + batch_id * params.delta_batch_stride
+    input_t *delta = reinterpret_cast<input_t *>(params.delta_ptr) + sequence_start_index * params.delta_batch_stride
         + dim_id * kNRows * params.delta_d_stride;
     weight_t *A = reinterpret_cast<weight_t *>(params.A_ptr) + dim_id * kNRows * params.A_d_stride;
     weight_t *B = reinterpret_cast<weight_t *>(params.B_ptr) + dim_id * kNRows * params.B_d_stride;
-    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + batch_id * params.B_batch_stride + group_id * params.B_group_stride;
+    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
-    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
-    scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
-    int *index = !kUseIndex ? nullptr :reinterpret_cast<int *>(params.index_ptr) + batch_id * params.seqlen;
+    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;

     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -142,9 +148,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }

     constexpr int kChunkSize = kNThreads * kNItems;
-    for (int chunk = 0; chunk < params.n_chunks; ++chunk) {
+    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+    for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
-        int index_vals_load[kNRows][kNItems];

         __syncthreads();
         #pragma unroll
@@ -152,15 +158,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (!kDirectIO) {
                 if (r > 0) { __syncthreads(); }
             }
-            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, params.seqlen - chunk * kChunkSize);
+            load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
             if constexpr (!kDirectIO) { __syncthreads(); }
-            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, params.seqlen - chunk * kChunkSize);
-            if constexpr (kUseIndex) {
-                load_index<Ktraits>(index + r * params.delta_d_stride, index_vals_load[r], smem_load_index, params.seqlen - chunk * kChunkSize);
-            }
-        }
-        if constexpr (kUseIndex) {
-            index += kChunkSize;
+            load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
         }
         u += kChunkSize;
         delta += kChunkSize;
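The per-token pairs built in the next hunk, (exp2f(delta * A), the delta-weighted input), feed a block-wide inclusive scan because the first-order recurrence h_t = a_t * h_{t-1} + b_t composes associatively; the running prefix carried across chunks, now seeded from ssm_states when has_initial_state is set, is such a pair with identity (1, 0). A small host-side sketch of that composition (illustrative, mirroring the role the file's SSMScanOp plays):

    #include <cassert>
    #include <cmath>

    // Applying (a2, b2) after (a1, b1) to a state h gives a2*(a1*h + b1) + b2,
    // i.e. the composed pair (a1*a2, a2*b1 + b2), which is associative.
    struct ScanPair { float a; float b; };

    ScanPair combine(ScanPair x, ScanPair y) {
        return {x.a * y.a, y.a * x.b + y.b};
    }

    int main() {
        // Three steps of h_t = a_t * h_{t-1} + b_t starting from h0.
        ScanPair steps[3] = {{0.5f, 1.0f}, {0.25f, 2.0f}, {2.0f, -1.0f}};
        float h0 = 4.0f;

        float h = h0;
        for (const ScanPair& s : steps) { h = s.a * h + s.b; }   // sequential reference

        ScanPair total = {1.0f, 0.0f};                           // identity, like make_float2(1.f, 0.f)
        for (const ScanPair& s : steps) { total = combine(total, s); }
        float h_scan = total.a * h0 + total.b;

        assert(std::fabs(h - h_scan) < 1e-6f);
        return 0;
    }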
@ -195,9 +195,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
|||||||
// If both B and C vary, this is unused.
|
// If both B and C vary, this is unused.
|
||||||
weight_t BC_val[kNRows];
|
weight_t BC_val[kNRows];
|
||||||
weight_t B_vals[kNItems], C_vals[kNItems];
|
weight_t B_vals[kNItems], C_vals[kNItems];
|
||||||
if constexpr (kIsVariableB) {
|
if constexpr (kIsVariableB) {
|
||||||
load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
|
load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
|
||||||
smem_load_weight, (params.seqlen - chunk * kChunkSize) * (1));
|
smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
|
||||||
if constexpr (!kIsVariableC) {
|
if constexpr (!kIsVariableC) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int r = 0; r < kNRows; ++r) {
|
for (int r = 0; r < kNRows; ++r) {
|
||||||
@@ -208,7 +208,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 if constexpr (kIsVariableC) {
 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-smem_load_weight_C, (params.seqlen - chunk * kChunkSize) * (1 ));
+smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 ));
 if constexpr (!kIsVariableB) {
 #pragma unroll
 for (int r = 0; r < kNRows; ++r) {
@@ -232,24 +232,16 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
 !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);

-// Reset A bar for cumulative sequences (Real)
-if constexpr (kUseIndex) {
-if (index_vals_load[r][i] == 0) {
-thread_data[i].x = 0.f;
-}
-}
-
-if constexpr (!Ktraits::kIsEvenLen) { // So that the last state is correct
-if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
+if (seqlen % (kNItems * kNThreads) != 0) { // So that the last state is correct
+if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
 thread_data[i] = make_float2(1.f, 0.f);
 }
 }
 }
 // Initialize running total
-scan_t running_prefix;
-// If we use WARP_SCAN then all lane 0 of all warps (not just thread 0) needs to read
-running_prefix = chunk == 0 ? x[(r * params.n_chunks) * params.dstate + state_idx] : ( threadIdx.x % 32 == 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.f, 0.f));
-// running_prefix = chunk > 0 && threadIdx.x == 0 ? smem_running_prefix[state_idx] : make_float2(1.f, 0.f);
+scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0);
 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
 thread_data, thread_data, SSMScanOp<weight_t>(), prefix_op
@@ -258,7 +250,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
 if (threadIdx.x == 0) {
 smem_running_prefix[state_idx] = prefix_op.running_prefix;
-x[(r * params.n_chunks + chunk) * params.dstate + state_idx] = prefix_op.running_prefix;
+if (chunk == n_chunks - 1) {
+ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
+}
 }
 #pragma unroll
 for (int i = 0; i < kNItems; ++i) {
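The two hunks above are the core of the state-handling change: the chunk count now comes from the per-sequence length (`n_chunks = (seqlen + 2048 - 1) / 2048`), the running prefix is seeded from `ssm_states` when `has_initial_state` is set, and the final prefix is written back to `ssm_states` on the last chunk instead of into the old `x` scratch tensor. Below is a minimal host-side C++ sketch of that chunked-scan bookkeeping, not the CUDA kernel itself; the `(a, b)` pair semantics of the recurrence `h = a*h + b`, the serialized inner loop, and all concrete values are illustrative assumptions.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Minimal sketch of the chunked-scan bookkeeping in the hunks above
// (assumption: the scan realizes the first-order recurrence h = a*h + b,
// with 2048 mirroring the kernel's kNThreads * kNItems chunk bound).
struct ScanPair { float a, b; };  // analogous to make_float2(exp2f(delta*A), B*delta*u)

int main() {
  const int seqlen = 5000;      // length of one sequence in a varlen batch
  const int kChunkSize = 2048;
  const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;  // same ceil-div as the kernel

  std::vector<ScanPair> seq(seqlen, ScanPair{0.9f, 0.1f});

  const bool has_initial_state = true;
  float ssm_state = 0.5f;       // state carried over from this sequence's earlier tokens

  // running_prefix.y plays the role of the hidden state h; chunk 0 seeds it
  // from ssm_state exactly when has_initial_state is set.
  float h = has_initial_state ? ssm_state : 0.0f;

  for (int chunk = 0; chunk < n_chunks; ++chunk) {
    const int begin = chunk * kChunkSize;
    const int end = std::min(seqlen, begin + kChunkSize);
    for (int i = begin; i < end; ++i) {
      h = seq[i].a * h + seq[i].b;  // the InclusiveScan, serialized for clarity
    }
    if (chunk == n_chunks - 1) {
      ssm_state = h;                // final state written back, as in ssm_states[state_idx]
    }
  }
  std::printf("n_chunks=%d final_state=%f\n", n_chunks, ssm_state);
  return 0;
}
```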
@@ -270,7 +264,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 }
 }

-input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
 + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
 __syncthreads();
 #pragma unroll
@@ -278,26 +272,26 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 if constexpr (!kDirectIO) {
 if (r > 0) { __syncthreads(); }
 }
-store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
 }

 if constexpr (kHasZ) {
-input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + batch_id * params.z_batch_stride
+input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
 + dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
-input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + batch_id * params.out_z_batch_stride
+input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
 + dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
 #pragma unroll
 for (int r = 0; r < kNRows; ++r) {
 input_t z_vals[kNItems];
 __syncthreads();
-load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, params.seqlen - chunk * kChunkSize);
+load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
 #pragma unroll
 for (int i = 0; i < kNItems; ++i) {
 float z_val = z_vals[i];
 out_vals[r][i] *= z_val / (1 + expf(-z_val));
 }
 __syncthreads();
-store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, params.seqlen - chunk * kChunkSize);
+store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
 }
 }

@@ -316,8 +310,8 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
 constexpr bool kIsVariableC = true;
 constexpr bool kHasZ = true;
 BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
-BOOL_SWITCH(params.index_ptr != nullptr , kUseIndex, [&] {
+BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kUseIndex, input_t, weight_t>;
+using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t>;
 constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
 dim3 grid(params.batch, params.dim / kNRows);
 auto kernel = &selective_scan_fwd_kernel<Ktraits>;
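This launch hunk swaps the `kUseIndex` template switch for `kVarlen`, driven by whether `params.query_start_loc_ptr` is set. `BOOL_SWITCH` itself is defined elsewhere in the tree and is not part of this diff; the sketch below is a hedged stand-in for the runtime-bool-to-`constexpr` dispatch pattern it implements, with the macro body and the `launch_sketch` function being assumptions for illustration only.

```cpp
#include <cstdio>

// Assumed shape of a BOOL_SWITCH-style macro: turn a runtime bool into a
// constexpr template parameter by instantiating both branches.
#define BOOL_SWITCH_SKETCH(COND, CONST_NAME, ...) \
  if (COND) {                                     \
    constexpr bool CONST_NAME = true;             \
    __VA_ARGS__();                                \
  } else {                                        \
    constexpr bool CONST_NAME = false;            \
    __VA_ARGS__();                                \
  }

template <bool kVarlen>
void launch_sketch() {  // hypothetical stand-in for selective_scan_fwd_kernel<Ktraits>
  if constexpr (kVarlen) {
    std::printf("varlen instantiation\n");
  } else {
    std::printf("fixed-length instantiation\n");
  }
}

int main() {
  const void* query_start_loc_ptr = nullptr;  // would come from params
  BOOL_SWITCH_SKETCH(query_start_loc_ptr != nullptr, kVarlen, [&] {
    launch_sketch<kVarlen>();
  });
  return 0;
}
```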
@@ -405,12 +399,15 @@ void set_ssm_params_fwd(SSMParamsBase &params,
 const torch::Tensor out,
 const torch::Tensor z,
 const torch::Tensor out_z,
-void* D_ptr,
+const c10::optional<at::Tensor>& D,
-void* delta_bias_ptr,
+const c10::optional<at::Tensor>& delta_bias,
-void* x_ptr,
+const torch::Tensor ssm_states,
 bool has_z,
 bool delta_softplus,
-void* index_ptr) {
+const c10::optional<at::Tensor>& query_start_loc,
+const c10::optional<at::Tensor>& cache_indices,
+const c10::optional<at::Tensor>& has_initial_state,
+bool varlen) {

 // Reset the parameters
 memset(&params, 0, sizeof(params));
@@ -434,55 +431,83 @@ void set_ssm_params_fwd(SSMParamsBase &params,
 params.A_ptr = A.data_ptr();
 params.B_ptr = B.data_ptr();
 params.C_ptr = C.data_ptr();
-params.D_ptr = D_ptr;
+params.D_ptr = D.has_value() ? D.value().data_ptr() : nullptr;
-params.delta_bias_ptr = delta_bias_ptr;
+params.delta_bias_ptr = delta_bias.has_value() ? delta_bias.value().data_ptr() : nullptr;
 params.out_ptr = out.data_ptr();
-params.x_ptr = x_ptr;
+params.ssm_states_ptr = ssm_states.data_ptr();
 params.z_ptr = has_z ? z.data_ptr() : nullptr;
 params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
+params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
+params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
+params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;

-params.index_ptr = index_ptr;

 // All stride are in elements, not bytes.
 params.A_d_stride = A.stride(0);
 params.A_dstate_stride = A.stride(1);
-if (!is_variable_B) {
-params.B_d_stride = B.stride(0);
+if (varlen){
-} else {
+params.B_batch_stride = B.stride(2);
-params.B_batch_stride = B.stride(0);
+params.B_group_stride = B.stride(0);
-params.B_group_stride = B.stride(1);
+params.B_dstate_stride = B.stride(1);
+params.C_batch_stride = C.stride(2);
+params.C_group_stride = C.stride(0);
+params.C_dstate_stride = C.stride(1);
+
+params.u_batch_stride = u.stride(1);
+params.u_d_stride = u.stride(0);
+params.delta_batch_stride = delta.stride(1);
+params.delta_d_stride = delta.stride(0);
+if (has_z) {
+params.z_batch_stride = z.stride(1);
+params.z_d_stride = z.stride(0);
+params.out_z_batch_stride = out_z.stride(1);
+params.out_z_d_stride = out_z.stride(0);
+}
+params.out_batch_stride = out.stride(1);
+params.out_d_stride = out.stride(0);
+
 }
-params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
+else{
-if (!is_variable_C) {
+if (!is_variable_B) {
-params.C_d_stride = C.stride(0);
+params.B_d_stride = B.stride(0);
 } else {
-params.C_batch_stride = C.stride(0);
+params.B_batch_stride = B.stride(0);
-params.C_group_stride = C.stride(1);
+params.B_group_stride = B.stride(1);
+}
+params.B_dstate_stride = !is_variable_B ? B.stride(1) : B.stride(2);
+if (!is_variable_C) {
+params.C_d_stride = C.stride(0);
+} else {
+params.C_batch_stride = C.stride(0);
+params.C_group_stride = C.stride(1);
+}
+params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
+params.u_batch_stride = u.stride(0);
+params.u_d_stride = u.stride(1);
+params.delta_batch_stride = delta.stride(0);
+params.delta_d_stride = delta.stride(1);
+if (has_z) {
+params.z_batch_stride = z.stride(0);
+params.z_d_stride = z.stride(1);
+params.out_z_batch_stride = out_z.stride(0);
+params.out_z_d_stride = out_z.stride(1);
+}
+params.out_batch_stride = out.stride(0);
+params.out_d_stride = out.stride(1);
 }
-params.C_dstate_stride = !is_variable_C ? C.stride(1) : C.stride(2);
-params.u_batch_stride = u.stride(0);
-params.u_d_stride = u.stride(1);
-params.delta_batch_stride = delta.stride(0);
-params.delta_d_stride = delta.stride(1);
-if (has_z) {
-params.z_batch_stride = z.stride(0);
-params.z_d_stride = z.stride(1);
-params.out_z_batch_stride = out_z.stride(0);
-params.out_z_d_stride = out_z.stride(1);
-}
-params.out_batch_stride = out.stride(0);
-params.out_d_stride = out.stride(1);
 }

-std::vector<torch::Tensor>
+void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
-selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
 const c10::optional<torch::Tensor> &D_,
 const c10::optional<torch::Tensor> &z_,
 const c10::optional<torch::Tensor> &delta_bias_,
 bool delta_softplus,
-const c10::optional<torch::Tensor> &index_,
+const c10::optional<torch::Tensor> &query_start_loc,
-const c10::optional<torch::Tensor> &x) {
+const c10::optional<torch::Tensor> &cache_indices,
+const c10::optional<torch::Tensor> &has_initial_state,
+const torch::Tensor &ssm_states) {
 auto input_type = u.scalar_type();
 auto weight_type = A.scalar_type();
 TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
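The stride setup above encodes two layouts: in the varlen branch, tensors such as `u` and `out` are 2-D `(dim, total_tokens)`, so the "batch" stride becomes the per-token stride (`u.stride(1)`) and the channel stride is the row stride (`u.stride(0)`); the else branch keeps the original 3-D `(batch, dim, seqlen)` strides. The short C++ sketch below illustrates the resulting pointer arithmetic, mirroring expressions like `out_ptr + sequence_start_index * params.out_batch_stride + dim_id * kNRows * params.out_d_stride` from the kernel hunks; all concrete sizes here are made up.

```cpp
#include <cstdio>
#include <vector>

// Hedged illustration of the varlen pointer arithmetic implied by the stride
// setup above. Shapes and values are invented; only the indexing pattern matters.
int main() {
  const int dim = 4;
  const int total_tokens = 10;                  // sum of all sequence lengths
  std::vector<float> u(dim * total_tokens);     // logical layout: (dim, total_tokens), row-major
  for (int i = 0; i < (int)u.size(); ++i) u[i] = (float)i;

  // Strides as set in the varlen branch: u_d_stride = u.stride(0),
  // u_batch_stride = u.stride(1).
  const long u_d_stride = total_tokens;         // step between channels (rows)
  const long u_batch_stride = 1;                // step between tokens

  // query_start_loc-style offset: suppose the second sequence starts at token 6.
  const int sequence_start_index = 6;
  const int dim_id = 2;

  const float* seq_ptr = u.data()
      + sequence_start_index * u_batch_stride   // jump to the sequence's first token
      + dim_id * u_d_stride;                    // jump to the channel row
  std::printf("u[dim=%d, token=%d] = %f\n", dim_id, sequence_start_index, *seq_ptr);
  return 0;
}
```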
@@ -505,23 +530,37 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 TORCH_CHECK(delta.stride(-1) == 1 || delta.size(-1) == 1);

 const auto sizes = u.sizes();
-const int batch_size = sizes[0];
+const bool varlen = query_start_loc.has_value();
-const int dim = sizes[1];
+const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
-const int seqlen = sizes[2];
+const int dim = varlen ? sizes[0] : sizes[1];
+const int seqlen = varlen ? sizes[1] : sizes[2];
 const int dstate = A.size(1);
-const int n_groups = is_variable_B ? B.size(1) : 1;
+const int n_groups = varlen ? B.size(0) : B.size(1);

 TORCH_CHECK(dstate <= 256, "selective_scan only supports state dimension <= 256");

-CHECK_SHAPE(u, batch_size, dim, seqlen);
+if (varlen) {
-CHECK_SHAPE(delta, batch_size, dim, seqlen);
+CHECK_SHAPE(u, dim, seqlen);
+CHECK_SHAPE(delta, dim, seqlen);
+} else {
+CHECK_SHAPE(u, batch_size, dim, seqlen);
+CHECK_SHAPE(delta, batch_size, dim, seqlen);
+}
 CHECK_SHAPE(A, dim, dstate);
 TORCH_CHECK(is_variable_B, "is_variable_B = False is disabled in favor of reduced binary size")
-CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen );
+if (varlen) {
+CHECK_SHAPE(B, n_groups, dstate, seqlen);
+} else {
+CHECK_SHAPE(B, batch_size, n_groups, dstate, seqlen);
+}
 TORCH_CHECK(B.stride(-1) == 1 || B.size(-1) == 1);

 TORCH_CHECK(is_variable_C, "is_variable_C = False is disabled in favor of reduced binary size")
-CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+if (varlen) {
+CHECK_SHAPE(C, n_groups, dstate, seqlen);
+} else {
+CHECK_SHAPE(C, batch_size, n_groups, dstate, seqlen);
+}
 TORCH_CHECK(C.stride(-1) == 1 || C.size(-1) == 1);

 if (D_.has_value()) {
@@ -539,13 +578,31 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 TORCH_CHECK(delta_bias.stride(-1) == 1 || delta_bias.size(-1) == 1);
 CHECK_SHAPE(delta_bias, dim);
 }
-if (index_.has_value()) {
-auto index = index_.value();
-TORCH_CHECK(index.scalar_type() == at::ScalarType::Int);
+if (has_initial_state.has_value()) {
-TORCH_CHECK(index.is_cuda());
+auto has_initial_state_ = has_initial_state.value();
-CHECK_SHAPE(index, batch_size, seqlen);
+TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+TORCH_CHECK(has_initial_state_.is_cuda());
+CHECK_SHAPE(has_initial_state_, batch_size);
 }

+if (query_start_loc.has_value()) {
+auto query_start_loc_ = query_start_loc.value();
+TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+TORCH_CHECK(query_start_loc_.is_cuda());
+}
+
+if (cache_indices.has_value()) {
+auto cache_indices_ = cache_indices.value();
+TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+TORCH_CHECK(cache_indices_.is_cuda());
+CHECK_SHAPE(cache_indices_, batch_size);
+}
+
 at::Tensor z, out_z;
 const bool has_z = z_.has_value();
 TORCH_CHECK(has_z, "has_z = False is disabled in favor of reduced binary size")
@@ -553,32 +610,39 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 TORCH_CHECK(z.scalar_type() == input_type);
 TORCH_CHECK(z.is_cuda());
 TORCH_CHECK(z.stride(-1) == 1 || z.size(-1) == 1);
-CHECK_SHAPE(z, batch_size, dim, seqlen);
+if (varlen){
-out_z = torch::empty_like(z);
+CHECK_SHAPE(z, dim, seqlen);
+} else {
+CHECK_SHAPE(z, batch_size, dim, seqlen);
+}
+
+out_z = z;

 const int n_chunks = (seqlen + 2048 - 1) / 2048;
 // const int n_chunks = (seqlen + 1024 - 1) / 1024;
 // at::Tensor out = torch::empty_like(u);
 // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
-at::Tensor out = torch::empty_like(delta);
+at::Tensor out = delta;
-if (x.has_value()){
+TORCH_CHECK(ssm_states.scalar_type() == input_type);
-auto _x = x.value();
+TORCH_CHECK(ssm_states.is_cuda());
-TORCH_CHECK(_x.scalar_type() == weight_type);
+TORCH_CHECK(ssm_states.stride(-1) == 1);
-TORCH_CHECK(_x.is_cuda());
+CHECK_SHAPE(ssm_states, batch_size, dim, dstate);
-TORCH_CHECK(_x.stride(-1) == 1);
-CHECK_SHAPE(_x, batch_size, dim, n_chunks, dstate * 2);
-}

 SSMParamsBase params;
 set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
 u, delta, A, B, C, out, z, out_z,
-D_.has_value() ? D_.value().data_ptr() : nullptr,
+D_,
-delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
+delta_bias_,
-x.value().data_ptr(),
+ssm_states,
 has_z,
 delta_softplus,
-index_.has_value() ? index_.value().data_ptr() : nullptr);
+query_start_loc,
+cache_indices,
+has_initial_state,
+varlen
+);

 // Otherwise the kernel will be launched from cuda:0 device
 // Cast to char to avoid compiler warning about narrowing
 at::cuda::CUDAGuard device_guard{(char)u.get_device()};
@@ -586,8 +650,5 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
 selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
 });
-std::vector<at::Tensor> result = {out, x.value()};
-if (has_z) { result.push_back(out_z); }
-return result;
 }

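Taken together, the host-side hunks change `selective_scan_fwd` from returning `{out, x[, out_z]}` to a void function that writes its results into `delta`/`z` and the caller-provided `ssm_states`. Below is a hedged C++ (libtorch) sketch of a varlen call against the new signature; the tensor shapes follow the `CHECK_SHAPE` calls above, while the concrete sizes, dtypes, and the standalone prototype/build setup are assumptions rather than vLLM's actual Python binding path.

```cpp
#include <torch/torch.h>

// Prototype of the new entry point defined in this file (see the signature hunk above).
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C,
                        const c10::optional<torch::Tensor>& D_,
                        const c10::optional<torch::Tensor>& z_,
                        const c10::optional<torch::Tensor>& delta_bias_,
                        bool delta_softplus,
                        const c10::optional<torch::Tensor>& query_start_loc,
                        const c10::optional<torch::Tensor>& cache_indices,
                        const c10::optional<torch::Tensor>& has_initial_state,
                        const torch::Tensor& ssm_states);

int main() {
  // Two packed sequences of length 5 and 7; sizes are illustrative only.
  const int64_t batch = 2, dim = 8, dstate = 16, n_groups = 1;
  const int64_t total = 5 + 7;

  auto f32 = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto i32 = torch::dtype(torch::kInt32).device(torch::kCUDA);
  auto b8  = torch::dtype(torch::kBool).device(torch::kCUDA);

  auto u      = torch::randn({dim, total}, f32);          // varlen layout: (dim, total_tokens)
  auto delta  = torch::randn({dim, total}, f32);          // also receives out (out = delta)
  auto A      = torch::randn({dim, dstate}, f32);
  auto B      = torch::randn({n_groups, dstate, total}, f32);
  auto C      = torch::randn({n_groups, dstate, total}, f32);
  auto D      = torch::randn({dim}, f32);
  auto z      = torch::randn({dim, total}, f32);          // also receives out_z
  auto dbias  = torch::zeros({dim}, f32);
  auto states = torch::zeros({batch, dim, dstate}, f32);  // final SSM state per sequence

  auto query_start_loc   = torch::tensor({0, 5, 12}, i32);  // batch + 1 token boundaries
  auto cache_indices     = torch::tensor({0, 1}, i32);      // state slot used by each sequence
  auto has_initial_state = torch::zeros({batch}, b8);       // no carried-in state in this sketch

  selective_scan_fwd(u, delta, A, B, C, D, z, dbias, /*delta_softplus=*/true,
                     query_start_loc, cache_indices, has_initial_state, states);

  // The gated outputs now live in z, and `states` holds each sequence's final state.
  return 0;
}
```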
csrc/moe/marlin_kernels/marlin_moe_kernel.h (new file, 1616 lines)
File diff suppressed because it is too large.

csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu (new file, 31 lines)
@@ -0,0 +1,31 @@
+#include "marlin_moe_kernel_ku4.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks) {
+  bool has_zp = true;
+
+  if (false) {
+  }
+  AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
+  AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
+  AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
+  AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
+  else {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace marlin_moe
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h (new file, 20 lines)
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks);
+
+}  // namespace marlin_moe
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu (new file, 31 lines)
@@ -0,0 +1,31 @@
+#include "marlin_moe_kernel_ku4b8.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4b8(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks) {
+  bool has_zp = false;
+
+  if (false) {
+  }
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
+  else {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace marlin_moe
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h (new file, 20 lines)
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4b8(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks);
+
+}  // namespace marlin_moe
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu (new file, 31 lines)
@@ -0,0 +1,31 @@
+#include "marlin_moe_kernel_ku8b128.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku8b128(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks) {
+  bool has_zp = false;
+
+  if (false) {
+  }
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
+  else {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace marlin_moe
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h (new file, 18 lines)
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+bool call_marlin_moe_kernel_ku8b128(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
+    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
+    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
+    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
+    int m_block, int max_par, int cfg_max_m_blocks);
+
+}
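Each new `call_marlin_moe_kernel_ku*` translation unit returns `bool` so that, as the comment in these files says, a caller can try them as a sequence of if/else-if's until one handles the requested quantization type. The sketch below shows only that chaining pattern; the `try_*` functions are hypothetical stand-ins (the real callers take the long argument list declared in the headers above), and the string-based `q_type` replaces `vllm::ScalarType` purely to keep the example self-contained.

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-ins for call_marlin_moe_kernel_ku4b8 / ku8b128 / ku4:
// each returns true only if it handled the requested quantization type.
bool try_ku4b8(const std::string& q_type)   { return q_type == "ku4b8"; }
bool try_ku8b128(const std::string& q_type) { return q_type == "ku8b128"; }
bool try_ku4(const std::string& q_type)     { return q_type == "ku4"; }

int main() {
  const std::string q_type = "ku8b128";  // would be a vllm::ScalarType in the real code

  // The if/else-if chain the bool return values are designed for: the first
  // caller that matches runs its kernel and short-circuits the rest.
  if (try_ku4b8(q_type)) {
    std::printf("dispatched GPTQ-style 4-bit kernel\n");
  } else if (try_ku8b128(q_type)) {
    std::printf("dispatched GPTQ-style 8-bit kernel\n");
  } else if (try_ku4(q_type)) {
    std::printf("dispatched AWQ-style 4-bit kernel\n");
  } else {
    std::printf("no marlin_moe kernel for this configuration\n");
  }
  return 0;
}
```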
Some files were not shown because too many files have changed in this diff.