Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-21 07:13:52 +08:00)
Compare commits
202 Commits
Author | SHA1 | Date | |
---|---|---|---|
bb2fc08072 | |||
3eda4ec780 | |||
22fa2e35cb | |||
c5201240a4 | |||
97234be0ec | |||
c051bfe4eb | |||
9e0b558a09 | |||
e519ae097a | |||
7c2749a4fd | |||
729171ae58 | |||
c5e8330997 | |||
e0c15758b8 | |||
bdf5fd1386 | |||
5a96ee52a3 | |||
42c7f66a38 | |||
69d5ae38dc | |||
fea59c7712 | |||
739b61a348 | |||
89c1c6a196 | |||
42de2cefcb | |||
c9eef37f32 | |||
396d92d5e0 | |||
25e778aa16 | |||
b6df37f943 | |||
14f91fe67c | |||
d7f4178dd9 | |||
082ecd80d5 | |||
f952bbc8ff | |||
9364f74eee | |||
06d6c5fe9f | |||
683e3cb9c4 | |||
9042d68362 | |||
3f8d42c81f | |||
7bd82002ae | |||
2e26564259 | |||
e81522e879 | |||
45ceb85a0c | |||
4cc24f01b1 | |||
07eb6f19f3 | |||
f0bbfaf917 | |||
30efe41532 | |||
9ed82e7074 | |||
51f8aa90ad | |||
a5314e8698 | |||
a921e86392 | |||
6366efc67b | |||
dbe5588554 | |||
d4201e06d5 | |||
b5672a112c | |||
c5df56f88b | |||
1689219ebf | |||
4ffffccb7e | |||
f53b8f0d05 | |||
2d4733ba2d | |||
15c6a079b1 | |||
ecdb462c24 | |||
58ca663224 | |||
4634c8728b | |||
c8a7d51c49 | |||
e2fbaee725 | |||
8a74c68bd1 | |||
61e592747c | |||
d25877dd9b | |||
1c27d25fb5 | |||
18fecc3559 | |||
b5af8c223c | |||
b5241e41d9 | |||
e76466dde2 | |||
5f0b9933e6 | |||
a38524f338 | |||
2fa4623d9e | |||
a9a2e74d21 | |||
e09ce759aa | |||
5fa6e9876e | |||
5bf35a91e4 | |||
a19e8d3726 | |||
10383887e0 | |||
1d094fd7c0 | |||
ce37be7ba0 | |||
7f62077af5 | |||
09c2eb85dd | |||
978aed5300 | |||
160e1d8c99 | |||
94162beb9f | |||
c467dff24f | |||
9f4ccec761 | |||
38ef94888a | |||
2bb0489cb3 | |||
7508a3dc34 | |||
7a3d2a5b95 | |||
d97011512e | |||
37d776606f | |||
d92b3c5cde | |||
9ad32dacd9 | |||
d6f3b3d5c4 | |||
4552e37b55 | |||
ec9933f4a5 | |||
3dee97b05f | |||
4cf256ae7f | |||
64fdc08c72 | |||
4ef95b0f06 | |||
eaec4b9153 | |||
a63a4c6341 | |||
c8fd97f26d | |||
94b82e8c18 | |||
6ae1597ddf | |||
22e79ee8f3 | |||
de19916314 | |||
69672f116c | |||
44874a0bf9 | |||
b47008b4d2 | |||
9bfece89fd | |||
32c9d7f765 | |||
ccb20db8bd | |||
a754dc2cb9 | |||
61e85dbad8 | |||
dbfe254eda | |||
73030b7dae | |||
ccd3c04571 | |||
9dad5cc859 | |||
6ef3bf912c | |||
540c0368b1 | |||
fb6af8bc08 | |||
eeceadaecc | |||
babf52dade | |||
9da4aad44b | |||
41708e5034 | |||
d80aef3776 | |||
e1684a766a | |||
a27f87da34 | |||
16ff6bd58c | |||
f8f9ff57ee | |||
6bc9710f6e | |||
111fc6e7ec | |||
75f64d8b94 | |||
21b2dcedab | |||
07b35af86d | |||
bb1a784b05 | |||
d719ba24c5 | |||
aa48e502fb | |||
4dbebd03cc | |||
b75bce1008 | |||
b039cbbce3 | |||
f9d25c2519 | |||
024ad87cdc | |||
aea19f0989 | |||
f7160d946a | |||
6047187cd8 | |||
b6c16cf8ff | |||
d26a8b3f1f | |||
d59eb98489 | |||
adf32e0a0f | |||
2b0fb53481 | |||
d6ab528997 | |||
7ed6a4f0e1 | |||
a4feba929b | |||
2d23b42d92 | |||
1df43de9bb | |||
52b7fcb35a | |||
b675069d74 | |||
55f692b46e | |||
8a1415cf77 | |||
546b101fa0 | |||
3963a5335b | |||
c4774eb841 | |||
fc17110bbe | |||
439c84581a | |||
99ded1e1c4 | |||
997df46a32 | |||
ae151d73be | |||
44cc76610d | |||
b422d4961a | |||
c38eba3046 | |||
e72ae80b06 | |||
8a924d2248 | |||
5ed3505d82 | |||
da78caecfa | |||
2416b26e11 | |||
d3a245138a | |||
673dd4cae9 | |||
4d6ada947c | |||
a0550cbc80 | |||
08c5bdecae | |||
5d5b4c5fe5 | |||
70c232f85a | |||
a3c9435d93 | |||
4f0e0ea131 | |||
ddc369fba1 | |||
185ad31f37 | |||
543aa48573 | |||
f7a8fa39d8 | |||
717f4bcea0 | |||
16620f439d | |||
3b08fe2b13 | |||
abfe705a02 | |||
333306a252 | |||
6206dcb29e | |||
9389380015 | |||
175c43eca4 | |||
bc96d5c330 | |||
f0250620dd | |||
2de490d60f |
@@ -1,14 +0,0 @@
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
@@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.756
+    value: 0.753
  - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
+    value: 0.753
+limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5
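The YAML files above encode per-model GSM8K baselines that the CI compares against freshly measured lm-eval results. A minimal sketch of such a comparison, assuming a measured-results dict and a relative tolerance; the names `check_config` and `RTOL` are illustrative, not taken from this diff:

```python
# Minimal sketch (hypothetical helper, not part of this diff): compare measured
# lm-eval metrics against one of the YAML baselines above with a relative tolerance.
import yaml
import numpy as np

RTOL = 0.02  # assumed tolerance, not taken from the diff


def check_config(config_path, measured):
    # `measured` maps metric name -> measured value,
    # e.g. {"exact_match,strict-match": 0.59, "exact_match,flexible-extract": 0.58}
    with open(config_path) as f:
        config = yaml.safe_load(f)
    for task in config["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[metric["name"]]
            assert np.isclose(expected, got, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: expected {expected}, got {got}")
```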
@@ -1,3 +1,5 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
@@ -1,2 +1,7 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.2
+# pip install lm-eval==0.4.3

usage() {
    echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
+  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
@@ -24,7 +24,8 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)

def launch_lm_eval(eval_config):
    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}"
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true"

    results = lm_eval.simple_evaluate(
        model="vllm",
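The hunk above ends mid-call. For context, a hedged sketch of how the surrounding `launch_lm_eval` helper plausibly completes the `lm_eval.simple_evaluate` call; everything beyond the lines shown in the hunk (task list, `num_fewshot`, `limit`, `batch_size`) is an assumption based on the YAML configs earlier in this diff, not a verbatim copy of the repo code:

```python
# Sketch only: a plausible completion of the call shown in the hunk.
# TP_SIZE and eval_config mirror the surrounding test file; the remaining
# keyword arguments are assumptions, not copied from the repository.
import lm_eval


def launch_lm_eval(eval_config, TP_SIZE=1):
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
    return results
```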
@@ -1,5 +1,6 @@
# vLLM benchmark suite


## Introduction

This directory contains the performance benchmarking CI for vllm.
@@ -11,7 +11,7 @@ steps:
            - sh
            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
-  - label: "A100 Benchmark"
+  - label: "A100"
    agents:
      queue: A100
    plugins:
@@ -42,21 +42,20 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       propagate-uid-gid: false
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN
+  - label: "H100"
+    agents:
+      queue: H100
+    plugins:
+    - docker#v5.11.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN

@@ -1,27 +0,0 @@
#!/usr/bin/env bash

# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
set -euo pipefail

# Install system packages
apt update
apt install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
    PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

    if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
        echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
    else
        echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
        exit 0
    fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
45
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
@@ -0,0 +1,45 @@

# Nightly benchmark

The main goal of this benchmarking is two-fold:
- Performance clarity: provide clarity on which of vllm, tensorrt-llm, lmdeploy and tgi leads in performance for a given workload.
- Reproducibility: anyone can run the exact same set of benchmarking commands inside the exact same docker image by following the reproducing instructions in [reproduce.md]().


## Docker images

We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
- vllm/vllm-openai:v0.5.0.post1
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
- openmmlab/lmdeploy:v0.5.0
- ghcr.io/huggingface/text-generation-inference:2.1

<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->


## Hardware

One AWS node with 8x NVIDIA A100 GPUs.


## Workload description

We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:

- Input length: randomly sample 500 prompts from the ShareGPT dataset (with a fixed random seed).
- Output length: the corresponding output length of these 500 prompts.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with a fixed random seed).
- Evaluation metrics: throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).

<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->

## Plots

In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.

<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >

## Results

{nightly_results_benchmarking_table}
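To make the "Poisson process with fixed random seed" workload description above concrete, here is a small illustrative sketch (not code from the benchmark suite) of drawing request arrival times for a given average QPS:

```python
# Illustrative only (not part of the benchmark suite): draw request arrival
# times for a Poisson process with a given average QPS and a fixed seed.
import numpy as np


def poisson_arrival_times(num_requests, qps, seed=0):
    rng = np.random.default_rng(seed)
    # Inter-arrival gaps of a Poisson process are exponentially distributed
    # with mean 1 / qps, so the cumulative sum gives the arrival times.
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


# Example: 500 prompts at an average of 4 queries per second.
arrivals = poisson_arrival_times(500, qps=4.0, seed=0)
print(arrivals[:5])
```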
120
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
Normal file
@@ -0,0 +1,120 @@
common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory

common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
  - label: "A100 trt benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.5.0
                <<: *common_container_settings


  - label: "A100 vllm benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:latest
                <<: *common_container_settings

  - label: "A100 tgi benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: ghcr.io/huggingface/text-generation-inference:2.1
                <<: *common_container_settings

  - wait

  - label: "Plot"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.5.0.post1
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token

  - wait
@@ -54,7 +54,7 @@ wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
}
@@ -73,8 +73,17 @@ kill_gpu_processes() {
      echo "All GPU processes have been killed."
  fi

+  # Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3
+  # since we are in container anyway
+  pkill -9 -f python
+  pkill -9 -f python3
+
  # waiting for GPU processes to be fully killed
  sleep 10
+  # loop while nvidia-smi returns any processes
+  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+    sleep 1
+    echo "Waiting for GPU processes to be killed"
+  done

  # remove vllm config file
  rm -rf ~/.config/vllm
@@ -90,12 +99,19 @@ upload_to_buildkite() {
  # upload the benchmarking results to buildkite

-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &
+    server_pid=$!

    # wait until the server is alive
    wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
    done

    # clean up
+    kill -9 $server_pid
    kill_gpu_processes
  done
}
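The wait_for_server change above switches the readiness probe to a POST request; the OpenAI-compatible /v1/completions route is a POST endpoint. An equivalent illustrative readiness probe in Python (not part of the suite):

```python
# Illustrative readiness probe (not part of the benchmark suite): poll the
# OpenAI-compatible completions route with POST until the server answers
# or the timeout expires.
import time

import requests


def wait_for_server(url="http://localhost:8000/v1/completions", timeout_s=1200):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            # Any HTTP response (even an error status) means the server is up;
            # the route itself only accepts POST requests.
            requests.post(url, json={}, timeout=5)
            return True
        except requests.RequestException:
            time.sleep(1)
    return False
```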
76
.buildkite/nightly-benchmarks/run-nightly-suite.sh
Normal file
@@ -0,0 +1,76 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}

main() {

    check_gpus
    check_hf_token

    df -h

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json


    # run lmdeploy
    if which lmdeploy >/dev/null; then
        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
        exit 0
    fi

    # run tgi
    if [ -e /tgi-entrypoint.sh ]; then
        echo "tgi is available, redirect to run-tgi-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
        exit 0
    fi

    # run trt
    if which trtllm-build >/dev/null; then
        echo "trtllm is available, redirect to run-trt-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
        exit 0
    fi

    # run vllm
    if [ -e /vllm-workspace ]; then
        echo "vllm is available, redirect to run-vllm-nightly.sh"
        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
        exit 0
    fi

}

main "$@"
26
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
Normal file
@@ -0,0 +1,26 @@
import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
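For reference, the nightly run scripts later in this diff invoke this helper from the benchmarks directory as `python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py --model "$model" --cachedir /tokenizer_cache` before launching each serving backend.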
@@ -0,0 +1,6 @@
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
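This helper is consumed by run-lmdeploy-nightly.sh further down in this diff, which captures its output as `model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)` and passes the value to benchmark_serving.py via `--model`.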
102
.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
Normal file
@ -0,0 +1,102 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
server_params=$1
|
||||
common_params=$2
|
||||
|
||||
|
||||
|
||||
model_path=$(echo "$common_params" | jq -r '.model')
|
||||
model_name="${model_path#*/}"
|
||||
model_type=$(echo "$server_params" | jq -r '.model_type')
|
||||
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
|
||||
model_tp_size=$(echo "$common_params" | jq -r '.tp')
|
||||
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
|
||||
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
|
||||
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
|
||||
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
|
||||
|
||||
cd ~
|
||||
rm -rf models
|
||||
mkdir -p models
|
||||
cd models
|
||||
models_dir=$(pwd)
|
||||
trt_model_path=${models_dir}/${model_name}-trt-ckpt
|
||||
trt_engine_path=${models_dir}/${model_name}-trt-engine
|
||||
|
||||
cd ~
|
||||
rm -rf tensorrt-demo
|
||||
git clone https://github.com/neuralmagic/tensorrt-demo.git
|
||||
cd tensorrt-demo
|
||||
tensorrt_demo_dir=$(pwd)
|
||||
|
||||
# make sure the parameter inside tensorrt_demo is consistent to envvar
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
|
||||
sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
|
||||
|
||||
|
||||
cd /
|
||||
rm -rf tensorrtllm_backend
|
||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||
git lfs install
|
||||
cd tensorrtllm_backend
|
||||
git checkout $trt_llm_version
|
||||
tensorrtllm_backend_dir=$(pwd)
|
||||
git submodule update --init --recursive
|
||||
cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
|
||||
|
||||
cd /tensorrtllm_backend
|
||||
cd ./tensorrt_llm/examples/${model_type}
|
||||
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
|
||||
echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
|
||||
echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
|
||||
python ../quantization/quantize.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path} \
|
||||
--qformat fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--calib_size 2
|
||||
|
||||
else
|
||||
|
||||
echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
|
||||
python3 convert_checkpoint.py \
|
||||
--model_dir ${model_path} \
|
||||
--dtype ${model_dtype} \
|
||||
--tp_size ${model_tp_size} \
|
||||
--output_dir ${trt_model_path}
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
trtllm-build \
|
||||
--checkpoint_dir=${trt_model_path} \
|
||||
--gpt_attention_plugin=${model_dtype} \
|
||||
--gemm_plugin=${model_dtype} \
|
||||
--remove_input_padding=enable \
|
||||
--paged_kv_cache=enable \
|
||||
--tp_size=${model_tp_size} \
|
||||
--max_batch_size=${max_batch_size} \
|
||||
--max_input_len=${max_input_len} \
|
||||
--max_output_len=${max_output_len} \
|
||||
--max_num_tokens=${max_output_len} \
|
||||
--opt_num_tokens=${max_output_len} \
|
||||
--output_dir=${trt_engine_path}
|
||||
|
||||
cd /tensorrtllm_backend/triton_model_repo
|
||||
rm -rf ./tensorrt_llm/1/*
|
||||
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
|
||||
cd /tensorrtllm_backend
|
||||
python3 scripts/launch_triton_server.py \
|
||||
--world_size=${model_tp_size} \
|
||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
40
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Normal file
@@ -0,0 +1,40 @@
#!/bin/bash

set -ex
set -o pipefail


main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi

    # initial annotation
    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
    cd $VLLM_SOURCE_CODE_LOC/benchmarks
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

    # generate figures
    python3 -m pip install tabulate pandas matplotlib
    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
        --description $description \
        --results-folder results/

    # upload results and figures
    /workspace/buildkite-agent artifact upload "nightly_results.png"
    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
135
.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
Normal file
@ -0,0 +1,135 @@
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file, "r") as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description, "r") as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
plt.rcParams.update({'font.size': 20})
|
||||
|
||||
# plot results
|
||||
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
|
||||
fig.subplots_adjust(hspace=1)
|
||||
methods = ["vllm", "trt", "lmdeploy", "tgi"]
|
||||
for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
|
||||
for j, metric in enumerate(["TTFT", "ITL"]):
|
||||
means, stds = [], []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.)
|
||||
stds.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
|
||||
std = filtered_df[f"Std {metric} (ms)"].values[0]
|
||||
success = filtered_df["Successful req."].values[0]
|
||||
stds.append(std / math.sqrt(success))
|
||||
|
||||
print(model, metric)
|
||||
print(means, stds)
|
||||
|
||||
ax = axes[i, j + 1]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
means,
|
||||
yerr=stds,
|
||||
capsize=10,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel(f"{metric} (ms)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
metric = "Tput"
|
||||
j = 0
|
||||
if True:
|
||||
tputs = []
|
||||
for method in methods:
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
tputs.append(0.)
|
||||
else:
|
||||
input_tput = filtered_df["Input Tput (tok/s)"].values[0]
|
||||
output_tput = filtered_df["Output Tput (tok/s)"].values[0]
|
||||
tputs.append(input_tput + output_tput)
|
||||
|
||||
print(model, metric)
|
||||
print(tputs)
|
||||
|
||||
ax = axes[i, j]
|
||||
|
||||
bars = ax.bar(
|
||||
["vllm", "trt", "lmdeploy", "tgi"],
|
||||
tputs,
|
||||
)
|
||||
for idx, bar in enumerate(bars):
|
||||
bar.set_color(bar_colors[idx])
|
||||
|
||||
ax.set_ylim(bottom=0)
|
||||
|
||||
ax.set_ylabel("Tput (token/s)")
|
||||
ax.set_title(f"{model} {metric}")
|
||||
ax.grid(axis='y')
|
||||
|
||||
fig.tight_layout()
|
||||
fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
218
.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
Normal file
@ -0,0 +1,218 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill lmdeploy || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append lmdeploy to the test name
|
||||
test_name=lmdeploy_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
|
||||
server_command="lmdeploy serve api_server $model \
|
||||
--tp $tp \
|
||||
--server-port $port \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
bash -c "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "lmdeploy server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "lmdeploy failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# get model name
|
||||
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend lmdeploy \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--model \"$model_name\" \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "lmdeploy" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
python -m pip install transformers==4.41.2
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
216
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
Normal file
@ -0,0 +1,216 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill text-generation || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 10
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append tgi to the test name
|
||||
test_name=tgi_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
--quantize fp8 \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="/tgi-entrypoint.sh \
|
||||
--model-id $model \
|
||||
--num-shard $tp \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "tgi server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "tgi failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tgi \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "tgi" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
214
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
Normal file
@ -0,0 +1,214 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill tritonserver || true
|
||||
# waiting for GPU processes to be fully killed
|
||||
sleep 20
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/generate_stream > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append trt to the test name
|
||||
test_name=trt_$test_name
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
echo "Running test case $test_name"
|
||||
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "trt server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "trt failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend tensorrt-llm \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command=""
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "trt" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
|
||||
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
# update transformers package, to make sure mixtral tokenizer is available
|
||||
python -m pip install transformers -U
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
python -m pip install tabulate pandas
|
||||
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh (new file, 221 lines)
@@ -0,0 +1,221 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
# kill all processes on GPU.
|
||||
pkill pt_main_thread
|
||||
sleep 10
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
# Print the GPU memory usage
|
||||
# so that we know if all GPU processes are killed.
|
||||
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
|
||||
# The memory usage should be 0 MB.
|
||||
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
|
||||
}
|
||||
|
||||
json2args() {
    # transforms the JSON string to command-line args; '_' is replaced with '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}
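# Hypothetical usage sketch (not part of the diff), mirroring the example given
# in the comment inside json2args above:
#
#   json2args '{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'
#   # prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1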
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `benchmark_serving.py`
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# append vllm to the test name
|
||||
test_name=vllm_$test_name
|
||||
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
|
||||
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
|
||||
server_args=$(json2args "$server_params")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
else
|
||||
echo "Key 'fp8' does not exist in common params."
|
||||
server_command="python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
-tp $tp \
|
||||
--model $model \
|
||||
--port $port \
|
||||
$server_args"
|
||||
fi
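# Hedged illustration (not part of the diff): the fp8 branch above is taken only
# when a test's common_parameters carry an "fp8" key plus the quantized model to
# substitute; the model name below is an assumption for illustration only.
#
#   echo '{"fp8": true, "neuralmagic_quantized_model": "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"}' \
#       | jq -e 'has("fp8")' > /dev/null && echo "fp8 branch would be taken"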
|
||||
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
|
||||
# wait until the server is alive
|
||||
wait_for_server
|
||||
if [ $? -eq 0 ]; then
|
||||
echo ""
|
||||
echo "vllm server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "vllm failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
client_command="python3 benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
$client_args"
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "vllm" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
# clean up
|
||||
kill_gpu_processes
|
||||
rm -rf /root/.cache/huggingface/*
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
check_gpus
|
||||
# enter vllm directory
|
||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
||||
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
@@ -0,0 +1,76 @@
import datetime
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "input_throughput": "Input Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "engine": "Engine",
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to serving_results
        serving_results.append(raw_result)
        continue

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)

    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header,
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write('\n')

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
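A minimal follow-up sketch (not part of the diff, assuming the file layout above): the JSON written by this script is a list of records keyed by the renamed columns, so it can be inspected straight from the shell, e.g.

    # print test name, engine and request throughput for each summarized run
    jq -r '.[] | [."Test name", .Engine, ."Tput (req/s)"] | @tsv' results/*_nightly_results.json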
.buildkite/nightly-benchmarks/tests/nightly-tests.json (new file, 116 lines)
@@ -0,0 +1,116 @@
|
||||
[
|
||||
{
|
||||
"test_name": "llama8B_tp1",
|
||||
"qps_list": [4],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tp": 1,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "llama70B_tp4",
|
||||
"qps_list": [2],
|
||||
"common_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||
"tp": 4,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "mixtral8x7B_tp2",
|
||||
"qps_list": [2],
|
||||
"common_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tp": 2,
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 500,
|
||||
"port": 8000
|
||||
},
|
||||
"lmdeploy_server_parameters": {
|
||||
},
|
||||
"lmdeploy_client_parameters": {
|
||||
},
|
||||
"tgi_server_parameters": {
|
||||
},
|
||||
"tgi_client_parameters": {
|
||||
"endpoint": "/generate_stream"
|
||||
},
|
||||
"trt_server_parameters": {
|
||||
"model_type": "llama",
|
||||
"model_dtype": "float16",
|
||||
"max_batch_size": 256,
|
||||
"max_input_len": 4096,
|
||||
"max_output_len": 4096,
|
||||
"trt_llm_version": "r24.04"
|
||||
},
|
||||
"trt_client_parameters": {
|
||||
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||
},
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"disable_log_requests": ""
|
||||
},
|
||||
"vllm_client_parameters": {
|
||||
}
|
||||
}
|
||||
]
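A minimal sketch (not part of the diff) of how the run-*-nightly.sh scripts above consume one entry of this file; each engine reads only its own "<engine>_server_parameters" / "<engine>_client_parameters" blocks:

    params=$(jq -c '.[0]' .buildkite/nightly-benchmarks/tests/nightly-tests.json)
    echo "$params" | jq -r '.test_name'                 # llama8B_tp1
    echo "$params" | jq -r '.common_parameters.model'   # meta-llama/Meta-Llama-3-8B
    echo "$params" | jq -r '.vllm_server_parameters'    # flags expanded by json2args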
|
@ -1,21 +1,19 @@
|
||||
steps:
|
||||
- block: "Build wheels"
|
||||
|
||||
- label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
|
||||
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
|
||||
agents:
|
||||
queue: cpu_queue
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
# rename the files to change linux -> manylinux1
|
||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
matrix:
|
||||
setup:
|
||||
cuda_version:
|
||||
- "11.8.0"
|
||||
- "12.1.0"
|
||||
python_version:
|
||||
- "3.8"
|
||||
- "3.9"
|
||||
- "3.10"
|
||||
- "3.11"
|
||||
|
@ -2,6 +2,15 @@
|
||||
set -ex
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
sleep 3
|
||||
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||
echo "GPUs state is \"clean\""
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
@ -45,15 +54,10 @@ while true; do
|
||||
fi
|
||||
done
|
||||
|
||||
echo "--- Building container"
|
||||
sha=$(git rev-parse --short HEAD)
|
||||
image_name=rocm_${sha}
|
||||
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
|
||||
docker build \
|
||||
-t ${image_name} \
|
||||
-f Dockerfile.rocm \
|
||||
--progress plain \
|
||||
.
|
||||
echo "--- Pulling container"
|
||||
image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
docker pull ${image_name}
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||
@ -62,11 +66,18 @@ trap remove_docker_container EXIT
|
||||
|
||||
echo "--- Running container"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p ${HF_CACHE}
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
docker run \
|
||||
--device /dev/kfd --device /dev/dri \
|
||||
--network host \
|
||||
--shm-size=16gb \
|
||||
--rm \
|
||||
-e HF_TOKEN \
|
||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
||||
-e HF_HOME=${HF_MOUNT} \
|
||||
--name ${container_name} \
|
||||
${image_name} \
|
||||
/bin/bash -c "${@}"
|
||||
|
.buildkite/run-multi-node-test.sh (new executable file, 105 lines)
@@ -0,0 +1,105 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WORKING_DIR=$1
|
||||
NUM_NODES=$2
|
||||
NUM_GPUS=$3
|
||||
DOCKER_IMAGE=$4
|
||||
|
||||
shift 4
|
||||
COMMANDS=("$@")
|
||||
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
||||
echo "The number of commands must be equal to the number of nodes."
|
||||
echo "Number of nodes: $NUM_NODES"
|
||||
echo "Number of commands: ${#COMMANDS[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "List of commands"
|
||||
for command in "${COMMANDS[@]}"; do
|
||||
echo $command
|
||||
done
|
||||
|
||||
start_network() {
|
||||
docker network create --subnet=192.168.10.0/24 docker-net
|
||||
}
|
||||
|
||||
start_nodes() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
|
||||
# start the container in detached mode
|
||||
# things to note:
|
||||
# 1. --shm-size=10.24gb is required. don't use --ipc=host
|
||||
# 2. pass HF_TOKEN to the container
|
||||
# 3. map the huggingface cache directory to the container
|
||||
# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||
# starting from 192.168.10.11)
|
||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
|
||||
|
||||
# organize containers into a ray cluster
|
||||
if [ $node -eq 0 ]; then
|
||||
# start the ray head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
|
||||
# wait for the head node to be ready
|
||||
sleep 10
|
||||
else
|
||||
# start the ray worker nodes, and connect them to the head node
|
||||
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||
fi
|
||||
done
|
||||
|
||||
# wait for the cluster to be ready
|
||||
sleep 10
|
||||
|
||||
# print the cluster status
|
||||
docker exec node0 /bin/bash -c "ray status"
|
||||
}
|
||||
|
||||
run_nodes() {
|
||||
# important: iterate in reverse order to start the head node last
|
||||
# we start the worker nodes first, in detached mode, and then start the head node
|
||||
# in the foreground, so that the output of the head node is visible in the buildkite logs
|
||||
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||
if [ $node -ne 0 ]; then
|
||||
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
else
|
||||
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
cleanup() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
docker stop node$node
|
||||
done
|
||||
docker network rm docker-net
|
||||
}
|
||||
trap cleanup EXIT
|
||||
start_network
|
||||
start_nodes
|
||||
run_nodes
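# Hypothetical invocation sketch (not part of the diff), matching the usage string
# at the top of this script: two nodes with two GPUs each, running the same
# pipeline-parallel test on both nodes (the image name is an assumption).
#
#   .buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 vllm-ci:latest \
#       "VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" \
#       "VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py"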
|
||||
|
.buildkite/run-tpu-test.sh (new file, 16 lines)
@@ -0,0 +1,16 @@
set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
    python3 /workspace/vllm/examples/offline_inference_tpu.py
@ -7,8 +7,32 @@
|
||||
|
||||
|
||||
steps:
|
||||
- label: Async Engine, Inputs, Utils, Worker Test
|
||||
fast_check: true
|
||||
fast_check_only: true
|
||||
commands:
|
||||
- pytest -v -s async_engine # Async Engine
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Tensorizer, Metrics, Tracing Test
|
||||
fast_check: true
|
||||
fast_check_only: true
|
||||
commands:
|
||||
- apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
|
||||
- pytest -v -s metrics # Metrics
|
||||
- "pip install \
|
||||
opentelemetry-sdk \
|
||||
opentelemetry-api \
|
||||
opentelemetry-exporter-otlp \
|
||||
opentelemetry-semantic-conventions-ai" # Tracing
|
||||
- pytest -v -s tracing
|
||||
|
||||
- label: Regression Test
|
||||
mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
command: pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
@ -18,16 +42,20 @@ steps:
|
||||
|
||||
- label: Basic Correctness Test
|
||||
mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
commands:
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Core Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
fast_check: true
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
- pytest -v -s distributed/test_parallel_state.py
|
||||
|
||||
@ -39,15 +67,27 @@ steps:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total)
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)
|
||||
mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
@ -66,11 +106,13 @@ steps:
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
fast_check: true
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
|
||||
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
|
||||
@ -78,18 +120,17 @@ steps:
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
commands:
|
||||
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: Engine Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: Entrypoints Test
|
||||
fast_check: true
|
||||
mirror_hardwares: [amd]
|
||||
|
||||
commands:
|
||||
@ -104,6 +145,7 @@ steps:
|
||||
# install tensorizer for tensorize_vllm_model.py
|
||||
- pip install awscli tensorizer
|
||||
- python3 offline_inference.py
|
||||
- python3 cpu_offload.py
|
||||
- python3 offline_inference_with_prefix.py
|
||||
- python3 llm_engine_example.py
|
||||
- python3 llava_example.py
|
||||
@ -112,27 +154,25 @@ steps:
|
||||
- label: Inputs Test
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
|
||||
- label: Kernels Test %N
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
|
||||
- label: Models Test
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pytest -v -s models -m \"not vlm\"
|
||||
|
||||
- label: Vision Language Models Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s models -m vlm
|
||||
|
||||
- label: Prefix Caching Test
|
||||
@ -149,7 +189,9 @@ steps:
|
||||
command: pytest -v -s test_logits_processor.py
|
||||
|
||||
- label: Utils Test
|
||||
command: pytest -v -s test_utils.py
|
||||
commands:
|
||||
- pytest -v -s test_utils.py
|
||||
- pytest -v -s test_embedded_commit.py
|
||||
|
||||
- label: Worker Test
|
||||
mirror_hardwares: [amd]
|
||||
@ -179,7 +221,10 @@ steps:
|
||||
|
||||
- label: Tensorizer Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
|
||||
commands:
|
||||
- apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
|
||||
- label: Metrics Test
|
||||
mirror_hardwares: [amd]
|
||||
@ -223,6 +268,7 @@ steps:
|
||||
|
||||
- label: Documentation Build
|
||||
working_dir: "/vllm-workspace/test_docs/docs"
|
||||
fast_check: true
|
||||
no_gpu: True
|
||||
commands:
|
||||
- pip install -r requirements-docs.txt
|
||||
@ -237,7 +283,7 @@ steps:
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
.github/FUNDING.yml (new file, 2 lines)
@@ -0,0 +1,2 @@
github: [vllm-project]
open_collective: [vllm]
.github/workflows/add_label_automerge.yml (new file, 21 lines)
@@ -0,0 +1,21 @@
name: Add label on auto-merge enabled
on:
  pull_request_target:
    types:
      - auto_merge_enabled
jobs:
  add-label-on-auto-merge:
    runs-on: ubuntu-latest
    steps:
      - name: Add label
        uses: actions/github-script@v5
        with:
          script: |
            github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              labels: ['ready']
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/add_label_ready_comment.yml (new file, 23 lines)
@@ -0,0 +1,23 @@
name: Add Ready Label on Ready Comment

on:
  issue_comment:
    types: [created]

jobs:
  add-ready-label:
    runs-on: ubuntu-latest
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
    steps:
      - name: Add label
        uses: actions/github-script@v5
        with:
          script: |
            github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              labels: ['ready']
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/mypy.yaml (22 changed lines)
@@ -32,20 +32,22 @@ jobs:
|
||||
pip install types-setuptools
|
||||
- name: Mypy
|
||||
run: |
|
||||
mypy tests --config-file pyproject.toml
|
||||
mypy vllm/*.py --config-file pyproject.toml
|
||||
mypy vllm/attention --config-file pyproject.toml
|
||||
mypy vllm/core --config-file pyproject.toml
|
||||
mypy vllm/distributed --config-file pyproject.toml
|
||||
mypy vllm/engine --config-file pyproject.toml
|
||||
mypy vllm/entrypoints --config-file pyproject.toml
|
||||
mypy vllm/executor --config-file pyproject.toml
|
||||
mypy vllm/multimodal --config-file pyproject.toml
|
||||
mypy vllm/usage --config-file pyproject.toml
|
||||
mypy vllm/*.py --config-file pyproject.toml
|
||||
mypy vllm/transformers_utils --config-file pyproject.toml
|
||||
mypy vllm/engine --config-file pyproject.toml
|
||||
mypy vllm/worker --config-file pyproject.toml
|
||||
mypy vllm/spec_decode --config-file pyproject.toml
|
||||
mypy vllm/model_executor --config-file pyproject.toml
|
||||
mypy vllm/lora --config-file pyproject.toml
|
||||
mypy vllm/inputs --config-file pyproject.toml
|
||||
mypy vllm/logging --config-file pyproject.toml
|
||||
mypy tests --config-file pyproject.toml
|
||||
mypy vllm/lora --config-file pyproject.toml
|
||||
mypy vllm/model_executor --config-file pyproject.toml
|
||||
mypy vllm/multimodal --config-file pyproject.toml
|
||||
mypy vllm/platforms --config-file pyproject.toml
|
||||
mypy vllm/spec_decode --config-file pyproject.toml
|
||||
mypy vllm/transformers_utils --config-file pyproject.toml
|
||||
mypy vllm/usage --config-file pyproject.toml
|
||||
mypy vllm/worker --config-file pyproject.toml
|
||||
|
||||
|
.github/workflows/publish.yml (2 changed lines)
@@ -49,7 +49,7 @@ jobs:
|
||||
matrix:
|
||||
os: ['ubuntu-20.04']
|
||||
python-version: ['3.8', '3.9', '3.10', '3.11']
|
||||
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||
pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||
cuda-version: ['11.8', '12.1']
|
||||
|
||||
steps:
|
||||
|
.github/workflows/reminder_comment.yml (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
name: PR Reminder Comment Bot
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
jobs:
|
||||
pr_reminder:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Remind to run full CI on PR
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
|
||||
})
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
.gitignore (3 changed lines)
@@ -1,3 +1,6 @@
|
||||
# vllm commit id, generated by setup.py
|
||||
vllm/commit_id.py
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
@ -32,8 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -101,7 +101,7 @@ elseif(HIP_FOUND)
|
||||
# ROCm 5.X and 6.X
|
||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
else()
|
||||
@ -151,6 +151,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/moe_align_block_size_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@ -171,6 +172,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
|
||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
|
Dockerfile (53 changed lines)
@@ -8,10 +8,10 @@
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# prepare basic build environment
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG PYTHON_VERSION=3
|
||||
ARG PYTHON_VERSION=3.10
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
@ -21,13 +21,16 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& apt-get install -y ccache software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||
&& python3 --version \
|
||||
&& python3 -m pip --version
|
||||
&& python3 --version
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y python3-pip git curl sudo
|
||||
&& apt-get install -y git curl sudo
|
||||
|
||||
# Install pip s.t. it will be compatible with our PYTHON_VERSION
|
||||
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
|
||||
RUN python3 -m pip --version
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
@ -58,7 +61,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
|
||||
ARG PYTHON_VERSION=3
|
||||
ARG PYTHON_VERSION=3.10
|
||||
|
||||
# install build dependencies
|
||||
COPY requirements-build.txt requirements-build.txt
|
||||
@ -88,6 +91,9 @@ ENV NVCC_THREADS=$nvcc_threads
|
||||
# make sure punica kernels are built (for LoRA)
|
||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||
|
||||
ARG buildkite_commit
|
||||
ENV BUILDKITE_COMMIT=${buildkite_commit}
|
||||
|
||||
ARG USE_SCCACHE
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
@ -97,10 +103,15 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& export SCCACHE_BUCKET=vllm-build-sccache \
|
||||
&& if [ "$CUDA_VERSION" = "11.8.0" ]; then \
|
||||
export SCCACHE_BUCKET=vllm-build-sccache-2; \
|
||||
else \
|
||||
export SCCACHE_BUCKET=vllm-build-sccache; \
|
||||
fi \
|
||||
&& export SCCACHE_REGION=us-west-2 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
@ -108,7 +119,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist; \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
||||
@ -145,12 +156,27 @@ RUN pip --verbose wheel -r requirements-mamba.txt \
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG PYTHON_VERSION=3.10
|
||||
WORKDIR /vllm-workspace
|
||||
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||
&& python3 --version
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y python3-pip git vim
|
||||
&& apt-get install -y python3-pip git vim curl libibverbs-dev
|
||||
|
||||
# Install pip s.t. it will be compatible with our PYTHON_VERSION
|
||||
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
|
||||
RUN python3 -m pip --version
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
@ -166,6 +192,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
FROM ubuntu:20.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git
|
||||
|
@ -1,26 +1,24 @@
|
||||
# Default ROCm 6.1 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
|
||||
# Tested and supported base rocm/pytorch images
|
||||
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
|
||||
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
|
||||
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
|
||||
# Default ROCm ARCHes to build vLLM for.
|
||||
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||
|
||||
# Whether to build CK-based flash-attention
|
||||
# If 0, will not build flash attention
|
||||
# This is useful for gfx target where flash-attention is not supported
|
||||
# (i.e. those that do not appear in `FA_GFX_ARCHS`)
|
||||
# Triton FA is used by default on ROCm now so this is unnecessary.
|
||||
# Whether to install CK-based flash-attention
|
||||
# If 0, will not install flash-attention
|
||||
ARG BUILD_FA="1"
|
||||
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
|
||||
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
|
||||
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
|
||||
# architectures specified in `FA_GFX_ARCHS`
|
||||
ARG TRY_FA_WHEEL="1"
|
||||
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
|
||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
||||
ARG FA_BRANCH="ae7928c"
|
||||
ARG FA_BRANCH="23a2b1c2"
|
||||
|
||||
# Whether to build triton on rocm
|
||||
ARG BUILD_TRITON="1"
|
||||
ARG TRITON_BRANCH="0ef1848"
|
||||
ARG TRITON_BRANCH="e0fc12c"
|
||||
|
||||
### Base image build stage
|
||||
FROM $BASE_IMAGE AS base
|
||||
@ -48,29 +46,17 @@ RUN apt-get update && apt-get install -y \
|
||||
ARG APP_MOUNT=/vllm-workspace
|
||||
WORKDIR ${APP_MOUNT}
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
# Remove sccache so it doesn't interfere with ccache
|
||||
# TODO: implement sccache support across components
|
||||
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Install torch == 2.4.0 on ROCm
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Install torch == 2.5.0 on ROCm
|
||||
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-5.7"*) \
|
||||
pip uninstall -y torch torchaudio torchvision \
|
||||
&& pip install --no-cache-dir --pre \
|
||||
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||
torchvision==0.19.0.dev20240612 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
|
||||
*"rocm-6.0"*) \
|
||||
pip uninstall -y torch torchaudio torchvision \
|
||||
&& pip install --no-cache-dir --pre \
|
||||
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||
torchvision==0.19.0.dev20240612 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
|
||||
*"rocm-6.1"*) \
|
||||
pip uninstall -y torch torchaudio torchvision \
|
||||
&& pip install --no-cache-dir --pre \
|
||||
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||
torchvision==0.19.0.dev20240612 \
|
||||
python3 -m pip uninstall -y torch torchaudio torchvision \
|
||||
&& python3 -m pip install --no-cache-dir --pre \
|
||||
torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
|
||||
torchvision==0.20.0.dev20240710 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
|
||||
*) ;; esac
|
||||
|
||||
@ -87,29 +73,31 @@ ENV CCACHE_DIR=/root/.cache/ccache
|
||||
FROM base AS build_amdsmi
|
||||
# Build amdsmi wheel always
|
||||
RUN cd /opt/rocm/share/amd_smi \
|
||||
&& pip wheel . --wheel-dir=/install
|
||||
&& python3 -m pip wheel . --wheel-dir=/install
|
||||
|
||||
|
||||
### Flash-Attention wheel build stage
|
||||
FROM base AS build_fa
|
||||
ARG BUILD_FA
|
||||
ARG TRY_FA_WHEEL
|
||||
ARG FA_WHEEL_URL
|
||||
ARG FA_GFX_ARCHS
|
||||
ARG FA_BRANCH
|
||||
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
if [ "$BUILD_FA" = "1" ]; then \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-5.7"*) \
|
||||
export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
|
||||
&& patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
|
||||
*) ;; esac \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
|
||||
# If a suitable wheel exists, we download it instead of building FA
|
||||
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
|
||||
else \
|
||||
mkdir -p libs \
|
||||
&& cd libs \
|
||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||
&& cd flash-attention \
|
||||
&& git checkout "${FA_BRANCH}" \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||
fi; \
|
||||
# Create an empty directory otherwise as later build stages expect one
|
||||
else mkdir -p /install; \
|
||||
fi
|
||||
@ -148,7 +136,7 @@ RUN case "$(which python3)" in \
|
||||
|
||||
# Package upgrades for useful functionality or to avoid dependency issues
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install --upgrade numba scipy huggingface-hub[cli]
|
||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
|
||||
|
||||
# Make sure punica kernels are built (for LoRA)
|
||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||
@ -159,14 +147,11 @@ ENV TOKENIZERS_PARALLELISM=false
|
||||
|
||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -U -r requirements-rocm.txt \
|
||||
python3 -m pip install -Ur requirements-rocm.txt \
|
||||
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||
*"rocm-6.0"*) \
|
||||
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
|
||||
*"rocm-6.1"*) \
|
||||
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
|
||||
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
|
||||
&& cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
|
||||
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
|
||||
# Prevent interference if torch bundles its own HIP runtime
|
||||
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
|
||||
*) ;; esac \
|
||||
@ -178,7 +163,7 @@ RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
|
||||
mkdir -p libs \
|
||||
&& cp /install/*.whl libs \
|
||||
# Preemptively uninstall to avoid same-version no-installs
|
||||
&& pip uninstall -y amdsmi;
|
||||
&& python3 -m pip uninstall -y amdsmi;
|
||||
|
||||
# Copy triton wheel(s) into final image if they were built
|
||||
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
||||
@ -186,7 +171,7 @@ RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
||||
&& if ls /install/*.whl; then \
|
||||
cp /install/*.whl libs \
|
||||
# Preemptively uninstall to avoid same-version no-installs
|
||||
&& pip uninstall -y triton; fi
|
||||
&& python3 -m pip uninstall -y triton; fi
|
||||
|
||||
# Copy flash-attn wheel(s) into final image if they were built
|
||||
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
||||
@ -194,11 +179,11 @@ RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
||||
&& if ls /install/*.whl; then \
|
||||
cp /install/*.whl libs \
|
||||
# Preemptively uninstall to avoid same-version no-installs
|
||||
&& pip uninstall -y flash-attn; fi
|
||||
&& python3 -m pip uninstall -y flash-attn; fi
|
||||
|
||||
# Install wheels that were built to the final image
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
if ls libs/*.whl; then \
|
||||
pip install libs/*.whl; fi
|
||||
python3 -m pip install libs/*.whl; fi
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
@ -1,19 +1,20 @@
|
||||
ARG NIGHTLY_DATE="20240601"
|
||||
ARG NIGHTLY_DATE="20240713"
|
||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
WORKDIR /workspace
|
||||
COPY . /workspace/vllm
|
||||
|
||||
ENV VLLM_TARGET_DEVICE="tpu"
|
||||
# Install aiohttp separately to avoid build errors.
|
||||
RUN pip install aiohttp
|
||||
# Install NumPy 1 instead of NumPy 2.
|
||||
RUN pip install "numpy<2"
|
||||
# Install the TPU and Pallas dependencies.
|
||||
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
|
||||
# Build vLLM.
|
||||
COPY . /workspace/vllm
|
||||
ENV VLLM_TARGET_DEVICE="tpu"
|
||||
RUN cd /workspace/vllm && python setup.py develop
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
|
||||
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
|
README.md (27 changed lines)
@@ -16,27 +16,21 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
|
||||
---
|
||||
|
||||
**Ray Summit CPF is Open (June 4th to June 20th)!**
|
||||
**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)**
|
||||
|
||||
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
|
||||
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
|
||||
This will be a great chance for everyone in the community to get together and learn.
|
||||
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
|
||||
We are excited to announce our fifth vLLM Meetup!
|
||||
Join us to hear the vLLM's recent updates and the upcoming roadmap.
|
||||
Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM.
|
||||
Register now [here](https://lu.ma/lp0gyjqr) and be part of the event!
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||
- [2024/01] Added ROCm 6.0 support to vLLM.
|
||||
- [2023/12] Added ROCm 5.7 support to vLLM.
|
||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
|
||||
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
|
||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
|
||||
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
|
||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||
|
||||
---
|
||||
@ -52,14 +46,16 @@ vLLM is fast with:
|
||||
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
|
||||
- Optimized CUDA kernels
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
- Seamless integration with popular Hugging Face models
|
||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||
- Tensor parallelism support for distributed inference
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
|
||||
- (Experimental) Prefix caching support
|
||||
- (Experimental) Multi-lora support
|
||||
|
||||
@ -103,6 +99,7 @@ vLLM is a community project. Our compute resources for development and testing a
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Dropbox
|
||||
- Google Cloud
|
||||
- Lambda Lab
|
||||
- NVIDIA
|
||||
- Replicate
|
||||
|
@@ -390,17 +390,17 @@ def remove_prefix(text: str, prefix: str) -> str:
return text


def get_model(pretrained_model_name_or_path: str):
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
else:
from huggingface_hub import snapshot_download

model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

return model_path
return pretrained_model_name_or_path

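A hedged usage sketch of the updated get_model() above. The module path and model id are placeholders, not taken from the diff; the point is that with VLLM_USE_MODELSCOPE enabled the weights appear to be snapshotted locally via ModelScope, while otherwise the name is returned unchanged and resolved later through Hugging Face.

```python
import os

# Hypothetical illustration only: module path and model id are assumptions.
os.environ["VLLM_USE_MODELSCOPE"] = "true"

from backend_request_func import get_model  # assumed import location

path = get_model("qwen/Qwen-7B-Chat")  # placeholder model id
# With ModelScope enabled this is a local snapshot directory; without it,
# the model name itself comes back and is resolved later by huggingface_hub.
print(path)
```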
def get_tokenizer(
@@ -11,7 +11,7 @@ from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptStrictInputs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser

@@ -61,7 +61,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_inputs: List[PromptStrictInputs] = [{
dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]

@@ -2,8 +2,8 @@

On the server side, run one of the following commands:
vLLM OpenAI API server
python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests

(TGI backend)
@@ -17,7 +17,7 @@ On the client side, run:
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000

when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
@@ -60,12 +60,15 @@ class BenchmarkMetrics:
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float


@@ -77,7 +80,6 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -185,6 +187,31 @@ def sample_sonnet_requests(
return sampled_requests


def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:

input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i])))

return input_requests


async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
@@ -196,6 +223,7 @@ async def get_request(
if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue

# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval.
@@ -219,7 +247,7 @@ def calculate_metrics(
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note: this may inflate the output token count slightly
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
@@ -249,12 +277,15 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) *
1000,  # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)

@@ -371,12 +402,15 @@ async def benchmark(
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
@@ -456,6 +490,15 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]

elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)

else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")

@@ -549,7 +592,7 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet"],
choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@@ -566,7 +609,7 @@ if __name__ == "__main__":
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.",
"Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
)
parser.add_argument(
"--best-of",
@@ -609,6 +652,27 @@ if __name__ == "__main__":
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument(
"--request-rate",
type=float,
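The --random-range-ratio flag added above controls how tightly the sampled lengths cluster around --random-input-len and --random-output-len. A minimal, self-contained sketch of the same sampling rule (numpy only; the tokenizer-based prompt construction from sample_random_requests is omitted):

```python
import numpy as np

# Lengths are drawn uniformly from [len * range_ratio, len], matching the
# np.random.randint bounds used by sample_random_requests above.
def sample_lengths(input_len=1024, output_len=128, num_prompts=4,
                   range_ratio=0.5):
    input_lens = np.random.randint(int(input_len * range_ratio),
                                   input_len + 1, size=num_prompts)
    output_lens = np.random.randint(int(output_len * range_ratio),
                                    output_len + 1, size=num_prompts)
    return list(zip(input_lens.tolist(), output_lens.tolist()))

# With the default range_ratio=1.0 every request gets exactly
# (input_len, output_len); smaller ratios widen the spread.
print(sample_lengths())
```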
@@ -20,18 +20,18 @@ DEFAULT_TP_SIZES = [1]
# helpers


def to_fp8(tensor: torch.tensor) -> torch.tensor:
def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)


def to_int8(tensor: torch.tensor) -> torch.tensor:
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)


def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.tensor, torch.tensor]:
k: int) -> Tuple[torch.Tensor, torch.Tensor]:

a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@@ -47,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
# impl


def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch.mm(a, b)


def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
@@ -63,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
out_dtype=out_dtype)


def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
scale_a: torch.tensor, scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a,
b,
scale_a=scale_a,
@@ -74,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
use_fast_accum=True)


def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.Tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)


# bench
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
sub_label: str, fn: Callable, description: str) -> TMeasurement:

min_run_time = 1

@@ -5,14 +5,16 @@ import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import FlexibleArgumentParser
@@ -100,7 +100,7 @@ def main(
start_time = time.perf_counter()

# Using default kv_scale
kv_scale = 1.0
k_scale = v_scale = 1.0

for _ in range(num_iters):
if version == "v1":
@@ -117,7 +117,8 @@ def main(
max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
k_scale,
v_scale,
)
elif version == "v2":
ops.paged_attention_v2(
@@ -136,7 +137,8 @@ def main(
max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
k_scale,
v_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
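The hunks above, and the kernel changes that follow, replace the single kv_scale with independent k_scale and v_scale values. A rough PyTorch sketch of the idea, assuming the common store-divided / load-multiplied convention for an FP8 KV cache; the authoritative behaviour is fp8::scaled_convert in the CUDA sources, so treat the scaling direction here as an assumption:

```python
import torch

def quantize_kv(k: torch.Tensor, v: torch.Tensor,
                k_scale: float, v_scale: float):
    # Independent scales for the key and value caches (illustration only).
    fp8 = torch.float8_e4m3fn
    info = torch.finfo(fp8)
    k_q = (k / k_scale).clamp(info.min, info.max).to(fp8)
    v_q = (v / v_scale).clamp(info.min, info.max).to(fp8)
    return k_q, v_q

def dequantize_kv(k_q: torch.Tensor, v_q: torch.Tensor,
                  k_scale: float, v_scale: float, dtype=torch.float16):
    # Scales are applied back separately on load.
    return k_q.to(dtype) * k_scale, v_q.to(dtype) * v_scale
```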
@ -105,9 +105,9 @@ __device__ void paged_attention_kernel(
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
const float k_scale, const float v_scale, const int tp_rank,
|
||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
||||
const int seq_idx = blockIdx.y;
|
||||
const int partition_idx = blockIdx.z;
|
||||
const int max_num_partitions = gridDim.z;
|
||||
@ -285,7 +285,7 @@ __device__ void paged_attention_kernel(
|
||||
Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
|
||||
k_ptr + offset1 * BLOCK_SIZE * x + offset2);
|
||||
k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
|
||||
k_vec_quant, kv_scale);
|
||||
k_vec_quant, k_scale);
|
||||
}
|
||||
}
|
||||
|
||||
@ -415,7 +415,7 @@ __device__ void paged_attention_kernel(
|
||||
*reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
|
||||
// Vector conversion from V_quant_vec to V_vec.
|
||||
v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
|
||||
kv_scale);
|
||||
v_scale);
|
||||
}
|
||||
if (block_idx == num_seq_blocks - 1) {
|
||||
// NOTE(woosuk): When v_vec contains the tokens that are out of the
|
||||
@ -513,15 +513,15 @@ __global__ void paged_attention_v1_kernel(
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
const float k_scale, const float v_scale, const int tp_rank,
|
||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
||||
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
|
||||
KV_DTYPE, IS_BLOCK_SPARSE>(
|
||||
/* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
|
||||
v_cache, num_kv_heads, scale, block_tables, seq_lens,
|
||||
max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
|
||||
kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
|
||||
kv_head_stride, k_scale, v_scale, tp_rank, blocksparse_local_blocks,
|
||||
blocksparse_vert_stride, blocksparse_block_size,
|
||||
blocksparse_head_sliding_step);
|
||||
}
|
||||
@ -549,14 +549,14 @@ __global__ void paged_attention_v2_kernel(
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
const float k_scale, const float v_scale, const int tp_rank,
|
||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
||||
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
|
||||
KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>(
|
||||
exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
|
||||
block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
|
||||
kv_block_stride, kv_head_stride, kv_scale, tp_rank,
|
||||
kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank,
|
||||
blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
|
||||
blocksparse_head_sliding_step);
|
||||
}
|
||||
@ -682,7 +682,7 @@ __global__ void paged_attention_v2_reduce_kernel(
|
||||
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
|
||||
scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
|
||||
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
|
||||
kv_scale, tp_rank, blocksparse_local_blocks, \
|
||||
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
||||
blocksparse_vert_stride, blocksparse_block_size, \
|
||||
blocksparse_head_sliding_step);
|
||||
|
||||
@ -694,8 +694,8 @@ void paged_attention_v1_launcher(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
|
||||
const int tp_rank, const int blocksparse_local_blocks,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
int num_seqs = query.size(0);
|
||||
@ -770,7 +770,7 @@ void paged_attention_v1_launcher(
|
||||
paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
|
||||
IS_BLOCK_SPARSE>( \
|
||||
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
||||
seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
|
||||
seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
|
||||
blocksparse_local_blocks, blocksparse_vert_stride, \
|
||||
blocksparse_block_size, blocksparse_head_sliding_step);
|
||||
|
||||
@ -815,8 +815,8 @@ void paged_attention_v1(
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||
const int64_t blocksparse_local_blocks,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step) {
|
||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||
@ -833,7 +833,7 @@ void paged_attention_v1(
|
||||
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
|
||||
value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||
kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
|
||||
kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
|
||||
blocksparse_local_blocks, blocksparse_vert_stride, \
|
||||
blocksparse_block_size, blocksparse_head_sliding_step); \
|
||||
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
|
||||
@ -850,8 +850,8 @@ void paged_attention_v2_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
|
||||
const int tp_rank, const int blocksparse_local_blocks,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
int num_seqs = query.size(0);
|
||||
@ -932,8 +932,9 @@ void paged_attention_v2_launcher(
|
||||
IS_BLOCK_SPARSE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
|
||||
kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
|
||||
blocksparse_block_size, blocksparse_head_sliding_step);
|
||||
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
||||
blocksparse_vert_stride, blocksparse_block_size, \
|
||||
blocksparse_head_sliding_step);
|
||||
|
||||
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
|
||||
switch (is_block_sparse) { \
|
||||
@ -980,8 +981,8 @@ void paged_attention_v2(
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||
const int64_t blocksparse_local_blocks,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step) {
|
||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||
|
@@ -18,8 +18,8 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype,
const double kv_scale);
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale);

void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache,
@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel(
|
||||
// block_size]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int key_stride, const int value_stride, const int num_heads,
|
||||
const int head_size, const int block_size, const int x,
|
||||
const float kv_scale) {
|
||||
const int head_size, const int block_size, const int x, const float k_scale,
|
||||
const float v_scale) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
if (slot_idx < 0) {
|
||||
@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel(
|
||||
value_cache[tgt_value_idx] = tgt_value;
|
||||
} else {
|
||||
key_cache[tgt_key_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, kv_scale);
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
|
||||
value_cache[tgt_value_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, kv_scale);
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -248,7 +248,7 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), key_stride, value_stride, \
|
||||
num_heads, head_size, block_size, x, kv_scale);
|
||||
num_heads, head_size, block_size, x, k_scale, v_scale);
|
||||
|
||||
void reshape_and_cache(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
@ -258,7 +258,8 @@ void reshape_and_cache(
|
||||
torch::Tensor&
|
||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
const std::string& kv_cache_dtype, const double kv_scale) {
|
||||
const std::string& kv_cache_dtype, const double k_scale,
|
||||
const double v_scale) {
|
||||
int num_tokens = key.size(0);
|
||||
int num_heads = key.size(1);
|
||||
int head_size = key.size(2);
|
||||
@ -318,13 +319,13 @@ namespace vllm {
|
||||
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
|
||||
Tout* __restrict__ dst_cache,
|
||||
const float kv_scale,
|
||||
const float scale,
|
||||
const int64_t block_stride) {
|
||||
const int64_t block_idx = blockIdx.x;
|
||||
for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
|
||||
int64_t idx = block_idx * block_stride + i;
|
||||
dst_cache[idx] =
|
||||
fp8::scaled_convert<Tout, Tin, kv_dt>(src_cache[idx], kv_scale);
|
||||
fp8::scaled_convert<Tout, Tin, kv_dt>(src_cache[idx], scale);
|
||||
}
|
||||
}
|
||||
|
||||
@ -333,11 +334,11 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
|
||||
#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \
|
||||
vllm::convert_fp8_kernel<Tout, Tin, KV_DTYPE><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<Tin*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<Tout*>(dst_cache.data_ptr()), kv_scale, block_stride);
|
||||
reinterpret_cast<Tout*>(dst_cache.data_ptr()), scale, block_stride);
|
||||
|
||||
// Only for testing.
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double kv_scale, const std::string& kv_cache_dtype) {
|
||||
const double scale, const std::string& kv_cache_dtype) {
|
||||
torch::Device src_device = src_cache.device();
|
||||
torch::Device dst_device = dst_cache.device();
|
||||
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
|
||||
|
@@ -423,11 +423,11 @@ void paged_attention_v1(
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
TORCH_CHECK(kv_scale == 1.0f);
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
TORCH_CHECK(blocksparse_vert_stride <= 1,
"CPU backend does not support blocksparse attention yet.");
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
@@ -742,11 +742,11 @@ void paged_attention_v2(
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step) {
TORCH_CHECK(kv_scale == 1.0f);
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);
TORCH_CHECK(blocksparse_vert_stride <= 1,
"CPU backend does not support blocksparse attention yet.");
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
@@ -107,8 +107,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype, double kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);
const std::string& kv_cache_dtype, double k_scale,
double v_scale) {
TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f);

int num_tokens = key.size(0);
int num_heads = key.size(1);
@@ -16,8 +16,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
@@ -30,8 +30,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
@@ -103,7 +103,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor! key_cache, Tensor! value_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" float kv_scale) -> ()");
" float k_scale, float v_scale) -> ()");
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
}

csrc/ops.h (35 changed lines)
@@ -8,8 +8,8 @@ void paged_attention_v1(
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step);

@@ -19,8 +19,8 @@ void paged_attention_v2(
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
const int64_t blocksparse_local_blocks,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step);

@@ -52,6 +52,11 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

void gelu_quick(torch::Tensor& out, torch::Tensor& input);

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,
@@ -84,15 +89,19 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
int64_t size_k);

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& g_idx,
torch::Tensor& perm, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full);
torch::Tensor& b_scales, torch::Tensor& b_zeros,
torch::Tensor& g_idx, torch::Tensor& perm,
torch::Tensor& workspace, int64_t num_bits,
int64_t size_m, int64_t size_n, int64_t size_k,
bool is_k_full, bool has_zp);

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n,
int64_t num_bits);

torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
int64_t size_n, int64_t num_bits);

torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
@@ -123,12 +132,16 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& scale);
void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor const& scale);

void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scale);

void dynamic_per_token_scaled_fp8_quant(
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
c10::optional<torch::Tensor> const& scale_ub);

void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
csrc/prepare_inputs/advance_step.cu (new file, 131 lines)
@@ -0,0 +1,131 @@
/*
 * The goal of this GPU kernel is to advance input tensors on the GPU directly
 * PR: https://github.com/vllm-project/vllm/pull/6338
 * Current restrictions:
 *     1. Specialized for DraftModelRunner
 *     2. Supports flash_attn only
 */

#include "advance_step.cuh"

namespace prepare_inputs {

//
template <int const num_threads>
__global__ void advance_step_kernel(int num_seqs, int num_queries,
int block_size, long* input_tokens_ptr,
long const* sampled_token_ids_ptr,
long* input_positions_ptr,
int* seq_lens_ptr, long* slot_mapping_ptr,
int const* block_tables_ptr,
int64_t const block_tables_stride) {
int num_query_blocks = div_ceil(num_queries, num_threads);

if (blockIdx.x >= num_query_blocks) {
return;
}

int cur_query_id = blockIdx.x * num_threads + threadIdx.x;

if (cur_query_id >= num_queries) {
return;
}

// Update input_tokens
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];

int seq_len = seq_lens_ptr[cur_query_id];
int next_seq_len = seq_len + 1;
int next_input_pos = next_seq_len - 1;

// Update seq_lens
seq_lens_ptr[cur_query_id] = next_seq_len;
// Update input_positions
input_positions_ptr[cur_query_id] = next_input_pos;

int const* seq_block_tables_ptr =
block_tables_ptr + block_tables_stride * cur_query_id;

int block_index = next_input_pos / block_size;
int block_offset = next_input_pos % block_size;

int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset;
// Update slot_mapping
slot_mapping_ptr[cur_query_id] = slot_num;
}

inline void verify_tensor(std::string const& name, torch::Tensor& t,
int64_t const size_0, int64_t const size_1,
c10::ScalarType const type) {
bool size_0_cond = true;
if (size_0 != -1) {
size_0_cond = t.size(0) == size_0;
}

bool size_1_cond = true;
if (size_1 != -1) {
size_1_cond = t.size(1) == size_1;
}

bool is_contiguous = t.is_contiguous();
bool same_type = t.dtype() == type;

bool pass = size_0_cond && size_1_cond && is_contiguous && same_type;
if (!pass) {
TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(),
" is_cont = ", t.is_contiguous(), ", type = ", t.dtype(),
" is not as expected: shape = [", size_0, ", ", size_1,
"], type = ", type);
}
}

void advance_step(int num_seqs, int num_queries, int block_size,
torch::Tensor& input_tokens,       // type: long
torch::Tensor& sampled_token_ids,  // type: long
torch::Tensor& input_positions,    // type: long
torch::Tensor& seq_lens,           // type: int
torch::Tensor& slot_mapping,       // type: long
torch::Tensor& block_tables) {     // type: int

if (logging) {
printf("advance_step:\n");
printf("  num_seqs = %d\n", num_seqs);
printf("  num_queries = %d\n", num_queries);
printf("  block_size = %d\n", block_size);
}
// Verify all tensors
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
at::kLong);
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);

int dev = sampled_token_ids.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);

int blocks;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);

advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
num_seqs, num_queries, block_size,
reinterpret_cast<long*>(input_tokens.data_ptr()),
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
reinterpret_cast<long*>(input_positions.data_ptr()),
reinterpret_cast<int*>(seq_lens.data_ptr()),
reinterpret_cast<long*>(slot_mapping.data_ptr()),
reinterpret_cast<int const*>(block_tables.data_ptr()),
block_tables.stride(0));
}

}  // namespace prepare_inputs

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
sampled_token_ids, input_positions, seq_lens,
slot_mapping, block_tables);
}
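For orientation, a hedged host-side restatement in PyTorch of what advance_step_kernel above does per query: append the sampled token, bump the sequence length and position, and recompute the slot in the paged KV cache from the block table. The argument names mirror the kernel's tensors, but this is an illustration, not the DraftModelRunner code path.

```python
import torch

def advance_step_reference(num_queries: int, block_size: int,
                           input_tokens: torch.Tensor,
                           sampled_token_ids: torch.Tensor,  # [num_queries, 1]
                           input_positions: torch.Tensor,
                           seq_lens: torch.Tensor,
                           slot_mapping: torch.Tensor,
                           block_tables: torch.Tensor) -> None:
    for q in range(num_queries):
        # The last sampled token becomes the next step's input token.
        input_tokens[q] = sampled_token_ids[q, 0]
        next_seq_len = int(seq_lens[q]) + 1
        next_input_pos = next_seq_len - 1
        seq_lens[q] = next_seq_len
        input_positions[q] = next_input_pos
        # Map the new position to a physical slot via the block table.
        block_index = next_input_pos // block_size
        block_offset = next_input_pos % block_size
        slot_mapping[q] = (int(block_tables[q, block_index]) * block_size
                           + block_offset)
```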
csrc/prepare_inputs/advance_step.cuh (new file, 19 lines)
@@ -0,0 +1,19 @@
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace prepare_inputs {

static constexpr int max_threads = 256;
static constexpr bool logging = false;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

}  // namespace prepare_inputs
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
if (cuda_device_capability >= 90) {
return CUDA_VERSION >= 12000;
} else if (cuda_device_capability >= 89) {
return CUDA_VERSION >= 12040;
// CUTLASS Kernels have not been tuned for Ada Lovelace systems
// and are slower than torch.mm. Return false unconditionally in this case.
return false;

// Once the CUTLASS kernels have been optimized for Lovelace systems,
// use the following check:
// return CUDA_VERSION >= 12040;
}
#endif

@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
TORCH_CHECK(version_num >= 75);
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
}
}
}
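The change above disables the CUTLASS FP8 path on Ada Lovelace (SM 8.9) because the untuned kernels currently lose to torch.mm there. A small Python sketch of the same capability gate; the function name is an assumption, and the real check is the C++ function shown above:

```python
import torch

def cutlass_fp8_supported_sketch(device: int = 0) -> bool:
    # Hopper (SM 9.0+) keeps the CUTLASS FP8 path (assuming a CUDA >= 12.0
    # build); Ada Lovelace (SM 8.9) now falls back to torch.mm, and anything
    # older never qualified in the first place.
    major, minor = torch.cuda.get_device_capability(device)
    capability = major * 10 + minor
    return capability >= 90
```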
@ -7,6 +7,8 @@
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "../../reduction_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
|
||||
@ -21,10 +23,16 @@ __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
|
||||
|
||||
#define FP8_E4M3_MAX std::numeric_limits<c10::Float8_e4m3fn>::max()
|
||||
|
||||
template <typename scalar_t>
|
||||
template <bool is_scale_inverted>
|
||||
__device__ __forceinline__ c10::Float8_e4m3fn scaled_fp8_conversion(
|
||||
const scalar_t val, const float inverted_scale) {
|
||||
float x = static_cast<float>(val) * inverted_scale;
|
||||
float const val, float const scale) {
|
||||
float x = 0.0f;
|
||||
if constexpr (is_scale_inverted) {
|
||||
x = val * scale;
|
||||
} else {
|
||||
x = val / scale;
|
||||
}
|
||||
|
||||
float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX));
|
||||
return static_cast<c10::Float8_e4m3fn>(r);
|
||||
}
|
||||
@ -87,6 +95,70 @@ typedef struct __align__(4) {
|
||||
}
|
||||
float8x4_t;
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ float thread_max_vec(scalar_t const* __restrict__ input,
|
||||
int64_t const num_elems, int const tid,
|
||||
int const step) {
|
||||
// Vectorized input/output to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vectorized_in =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(input);
|
||||
|
||||
int64_t const num_vec_elems = num_elems >> 2;
|
||||
float absmax_val = 0.0f;
|
||||
|
||||
#pragma unroll 4
|
||||
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
||||
vec4_t<scalar_t> in_vec = vectorized_in[i];
|
||||
absmax_val = max(absmax_val, fabs(in_vec.x));
|
||||
absmax_val = max(absmax_val, fabs(in_vec.y));
|
||||
absmax_val = max(absmax_val, fabs(in_vec.z));
|
||||
absmax_val = max(absmax_val, fabs(in_vec.w));
|
||||
}
|
||||
|
||||
// Handle the remaining elements if num_elems is not divisible by 4
|
||||
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
|
||||
absmax_val = max(absmax_val, fabs(input[i]));
|
||||
}
|
||||
|
||||
return absmax_val;
|
||||
}
|
||||
|
||||
template <typename scalar_t, bool is_scale_inverted>
|
||||
__device__ void scaled_fp8_conversion_vec(c10::Float8_e4m3fn* __restrict__ out,
|
||||
scalar_t const* __restrict__ input,
|
||||
float const scale,
|
||||
int64_t const num_elems,
|
||||
int const tid, int const step) {
|
||||
// Vectorized input/output to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vectorized_in =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(input);
|
||||
float8x4_t* vectorized_out = reinterpret_cast<float8x4_t*>(out);
|
||||
|
||||
int64_t const num_vec_elems = num_elems >> 2;
|
||||
|
||||
#pragma unroll 4
|
||||
for (int64_t i = tid; i < num_vec_elems; i += step) {
|
||||
vec4_t<scalar_t> in_vec = vectorized_in[i];
|
||||
float8x4_t out_vec;
|
||||
|
||||
out_vec.x = scaled_fp8_conversion<is_scale_inverted>(
|
||||
static_cast<float>(in_vec.x), scale);
|
||||
out_vec.y = scaled_fp8_conversion<is_scale_inverted>(
|
||||
static_cast<float>(in_vec.y), scale);
|
||||
out_vec.z = scaled_fp8_conversion<is_scale_inverted>(
|
||||
static_cast<float>(in_vec.z), scale);
|
||||
out_vec.w = scaled_fp8_conversion<is_scale_inverted>(
|
||||
static_cast<float>(in_vec.w), scale);
|
||||
vectorized_out[i] = out_vec;
|
||||
}
|
||||
|
||||
// Handle the remaining elements if num_elems is not divisible by 4
|
||||
for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
|
||||
out[i] = scaled_fp8_conversion<is_scale_inverted>(
|
||||
static_cast<float>(input[i]), scale);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out,
|
||||
const scalar_t* __restrict__ input,
|
||||
@ -97,38 +169,68 @@ __global__ void scaled_fp8_quant_kernel(c10::Float8_e4m3fn* __restrict__ out,
|
||||
// Invert the scale so that we can use multiplications to avoid expensive
|
||||
// division.
|
||||
const float inverted_scale = 1.0f / (*scale);
|
||||
scaled_fp8_conversion_vec<scalar_t, true>(
|
||||
out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x);
|
||||
}
|
||||
|
||||
// Vectorized input/output to better utilize memory bandwidth.
|
||||
const vec4_t<scalar_t>* vectorized_in =
|
||||
reinterpret_cast<const vec4_t<scalar_t>*>(input);
|
||||
float8x4_t* vectorized_out = reinterpret_cast<float8x4_t*>(out);
|
||||
template <typename scalar_t>
|
||||
__global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
||||
c10::Float8_e4m3fn* __restrict__ out, float* __restrict__ scale,
|
||||
scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
|
||||
const int hidden_size) {
|
||||
float const min_scaling_factor = 1.0f / (FP8_E4M3_MAX * 512.f);
|
||||
|
||||
int num_vec_elems = num_elems >> 2;
|
||||
int const tid = threadIdx.x;
|
||||
int const token_idx = blockIdx.x;
|
||||
|
||||
#pragma unroll 4
|
||||
for (int i = tid; i < num_vec_elems; i += blockDim.x * gridDim.x) {
|
||||
vec4_t<scalar_t> in_vec = vectorized_in[i];
|
||||
float8x4_t out_vec;
|
||||
scalar_t const* __restrict__ token_input = &input[token_idx * hidden_size];
|
||||
c10::Float8_e4m3fn* __restrict__ token_output = &out[token_idx * hidden_size];
|
||||
|
||||
out_vec.x = scaled_fp8_conversion(in_vec.x, inverted_scale);
|
||||
out_vec.y = scaled_fp8_conversion(in_vec.y, inverted_scale);
|
||||
out_vec.z = scaled_fp8_conversion(in_vec.z, inverted_scale);
|
||||
out_vec.w = scaled_fp8_conversion(in_vec.w, inverted_scale);
|
||||
vectorized_out[i] = out_vec;
|
||||
// For vectorization, token_input and token_output pointers need to be
|
||||
// aligned at 8-byte and 4-byte addresses respectively.
|
||||
bool const can_vectorize = hidden_size % 4 == 0;
|
||||
|
||||
float absmax_val = 0.0f;
|
||||
if (can_vectorize) {
|
||||
absmax_val = thread_max_vec(token_input, hidden_size, tid, blockDim.x);
|
||||
} else {
|
||||
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||
float const x = static_cast<float>(token_input[i]);
|
||||
absmax_val = max(absmax_val, fabs(x));
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the remaining elements if num_elems is not divisible by 4
|
||||
for (int i = num_vec_elems * 4 + tid; i < num_elems;
|
||||
i += blockDim.x * gridDim.x) {
|
||||
out[i] = scaled_fp8_conversion(input[i], inverted_scale);
|
||||
float const block_absmax_val_maybe = blockReduceMax(absmax_val);
|
||||
__shared__ float token_scale;
|
||||
if (tid == 0) {
|
||||
if (scale_ub) {
|
||||
token_scale = min(block_absmax_val_maybe, *scale_ub);
|
||||
} else {
|
||||
token_scale = block_absmax_val_maybe;
|
||||
}
|
||||
// token scale computation
|
||||
token_scale = max(token_scale / FP8_E4M3_MAX, min_scaling_factor);
|
||||
scale[token_idx] = token_scale;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Note that we don't use inverted scales so we can match FBGemm impl.
|
||||
if (can_vectorize) {
|
||||
scaled_fp8_conversion_vec<scalar_t, false>(
|
||||
token_output, token_input, token_scale, hidden_size, tid, blockDim.x);
|
||||
} else {
|
||||
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||
token_output[i] = scaled_fp8_conversion<false>(
|
||||
static_cast<float>(token_input[i]), token_scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input, // [..., d]
|
||||
torch::Tensor& scale) // [1]
|
||||
void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor const& input, // [..., d]
|
||||
torch::Tensor const& scale) // [1]
|
||||
{
|
||||
int64_t num_tokens = input.numel() / input.size(-1);
|
||||
int64_t num_elems = input.numel();
|
||||
@ -144,9 +246,9 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
});
|
||||
}
|
||||
|
||||
void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input, // [..., d]
|
||||
torch::Tensor& scale) // [1]
|
||||
void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor const& input, // [..., d]
|
||||
torch::Tensor& scale) // [1]
|
||||
{
|
||||
int64_t num_tokens = input.numel() / input.size(-1);
|
||||
int64_t num_elems = input.numel();
|
||||
@ -163,3 +265,28 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
|
||||
scale.data_ptr<float>(), num_elems);
|
||||
});
|
||||
}
|
||||
|
||||
void dynamic_per_token_scaled_fp8_quant(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor const& input, // [..., d]
|
||||
torch::Tensor& scales, std::optional<at::Tensor> const& scale_ub) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
|
||||
int const hidden_size = input.size(-1);
|
||||
int const num_tokens = input.numel() / hidden_size;
|
||||
dim3 const grid(num_tokens);
|
||||
dim3 const block(std::min(hidden_size, 1024));
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input.scalar_type(), "dynamic_per_token_scaled_fp8_quant_kernel", [&] {
|
||||
vllm::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<c10::Float8_e4m3fn>(), scales.data_ptr<float>(),
|
||||
input.data_ptr<scalar_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
|
||||
hidden_size);
|
||||
});
|
||||
}
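A hedged PyTorch restatement of the dynamic per-token FP8 quantization path above: per-token absmax, an optional scale_ub clamp, and the same minimum scaling factor, with values divided by the token scale rather than multiplied by an inverted scale. It illustrates the math only and is not a drop-in replacement for the CUDA kernel.

```python
import torch

FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def dynamic_per_token_fp8_quant_ref(x: torch.Tensor, scale_ub: float = None):
    # x: [num_tokens, hidden_size]
    min_scaling_factor = 1.0 / (FP8_E4M3_MAX * 512.0)
    absmax = x.abs().amax(dim=-1, keepdim=True).float()
    if scale_ub is not None:
        absmax = absmax.clamp(max=scale_ub)
    # Per-token scale, floored to avoid degenerate divisions.
    scales = (absmax / FP8_E4M3_MAX).clamp(min=min_scaling_factor)
    q = (x / scales).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)
    return q, scales
```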
@@ -19,10 +19,10 @@
 * Adapted from https://github.com/IST-DASLab/marlin
 */

#include "../gptq_marlin/gptq_marlin.cuh"
#include "../gptq_marlin/gptq_marlin_dtypes.cuh"
#include "../gptq_marlin/marlin.cuh"
#include "../gptq_marlin/marlin_dtypes.cuh"

using namespace gptq_marlin;
using namespace marlin;

#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)          \
  static_assert(std::is_same<scalar_t, half>::value ||     \
@@ -1224,16 +1224,15 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
              ", size_k = ", size_k);

  // Verify B
  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
  TORCH_CHECK(size_k % marlin::tile_size == 0, "size_k = ", size_k,
              " is not divisible by tile_size = ", marlin::tile_size);
  TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
              ", size_k = ", size_k, ", tile_size = ", marlin::tile_size);
  TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0,
              "b_q_weight.size(1) = ", b_q_weight.size(1),
              " is not divisible by tile_size = ", gptq_marlin::tile_size);
  int actual_size_n =
      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
              " is not divisible by tile_size = ", marlin::tile_size);
  int actual_size_n = (b_q_weight.size(1) / marlin::tile_size) * pack_factor;
  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
              ", actual_size_n = ", actual_size_n);

@@ -1274,11 +1273,9 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  num_groups = b_scales.size(0);

  // Verify workspace size
  TORCH_CHECK(
      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
  int min_workspace_size =
      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
  TORCH_CHECK(size_n % marlin::min_thread_n == 0, "size_n = ", size_n,
              ", is not divisible by min_thread_n = ", marlin::min_thread_n);
  int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par;
  TORCH_CHECK(workspace.numel() >= min_workspace_size,
              "workspace.numel = ", workspace.numel(),
              " is below min_workspace_size = ", min_workspace_size);
@@ -1290,14 +1287,14 @@ torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
        b_scales.data_ptr<at::Half>(), size_m, size_n, size_k,
        workspace.data_ptr(), num_bits, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        gptq_marlin::max_par);
        marlin::max_par);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    fp8_marlin::marlin_mm_f16i4<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(), size_m,
        size_n, size_k, workspace.data_ptr(), num_bits, num_groups, group_size,
        dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        gptq_marlin::max_par);
        marlin::max_par);
  } else {
    TORCH_CHECK(false, "fp8_marlin_gemm only supports bfloat16 and float16");
  }
csrc/quantization/gptq_marlin/awq_marlin_repack.cu  (new file, 269 additions)
@@ -0,0 +1,269 @@
|
||||
#include "marlin.cuh"
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
|
||||
namespace marlin {
|
||||
|
||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
||||
__global__ void awq_marlin_repack_kernel(
|
||||
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
|
||||
int size_k, int size_n) {}
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
int64_t size_k, int64_t size_n,
|
||||
int64_t num_bits) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
|
||||
return torch::empty({1, 1});
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
namespace marlin {
|
||||
|
||||
template <int const num_threads, int const num_bits>
|
||||
__global__ void awq_marlin_repack_kernel(
|
||||
uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr,
|
||||
int size_k, int size_n) {
|
||||
constexpr int pack_factor = 32 / num_bits;
|
||||
|
||||
int k_tiles = size_k / tile_k_size;
|
||||
int n_tiles = size_n / tile_n_size;
|
||||
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
|
||||
|
||||
int start_k_tile = blockIdx.x * block_k_tiles;
|
||||
if (start_k_tile >= k_tiles) {
|
||||
return;
|
||||
}
|
||||
|
||||
int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
|
||||
|
||||
// Wait until the next thread tile has been loaded to shared memory.
|
||||
auto wait_for_stage = [&]() {
|
||||
// We only have `stages - 2` active fetches since we are double buffering
|
||||
// and can only issue the next fetch when it is guaranteed that the previous
|
||||
// shared memory load is fully complete (as it may otherwise be
|
||||
// overwritten).
|
||||
cp_async_wait<repack_stages - 2>();
|
||||
__syncthreads();
|
||||
};
|
||||
|
||||
extern __shared__ int4 sh[];
|
||||
|
||||
constexpr int tile_n_ints = tile_n_size / pack_factor;
|
||||
|
||||
constexpr int stage_n_threads = tile_n_ints / 4;
|
||||
constexpr int stage_k_threads = tile_k_size;
|
||||
constexpr int stage_size = stage_k_threads * stage_n_threads;
|
||||
|
||||
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
|
||||
if (n_tile_id >= n_tiles) {
|
||||
cp_async_fence();
|
||||
return;
|
||||
}
|
||||
|
||||
int first_n = n_tile_id * tile_n_size;
|
||||
int first_n_packed = first_n / pack_factor;
|
||||
|
||||
int4* sh_ptr = sh + stage_size * pipe;
|
||||
|
||||
if (threadIdx.x < stage_size) {
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
int first_k = k_tile_id * tile_k_size;
|
||||
|
||||
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
|
||||
reinterpret_cast<int4 const*>(
|
||||
&(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) +
|
||||
first_n_packed + (n_id * 4)])));
|
||||
}
|
||||
|
||||
cp_async_fence();
|
||||
};
|
||||
|
||||
auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
|
||||
if (n_tile_id >= n_tiles) {
|
||||
return;
|
||||
}
|
||||
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int th_id = threadIdx.x % 32;
|
||||
|
||||
if (warp_id >= 4) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tc_col = th_id / 4;
|
||||
int tc_row = (th_id % 4) * 2;
|
||||
|
||||
constexpr int tc_offsets[4] = {0, 1, 8, 9};
|
||||
|
||||
int cur_n = warp_id * 16 + tc_col;
|
||||
int cur_n_packed = cur_n / pack_factor;
|
||||
int cur_n_pos = cur_n % pack_factor;
|
||||
|
||||
constexpr int sh_stride = tile_n_ints;
|
||||
constexpr uint32_t mask = (1 << num_bits) - 1;
|
||||
|
||||
int4* sh_stage_ptr = sh + stage_size * pipe;
|
||||
uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
|
||||
|
||||
// Undo interleaving
|
||||
int cur_n_pos_unpacked;
|
||||
if constexpr (num_bits == 4) {
|
||||
constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
cur_n_pos_unpacked = undo_pack[cur_n_pos];
|
||||
} else {
|
||||
constexpr int undo_pack[4] = {0, 2, 1, 3};
|
||||
cur_n_pos_unpacked = undo_pack[cur_n_pos];
|
||||
}
|
||||
|
||||
uint32_t vals[8];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int cur_elem = tc_row + tc_offsets[i];
|
||||
|
||||
int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
|
||||
int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) +
|
||||
sh_stride * cur_elem];
|
||||
|
||||
vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
|
||||
vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
|
||||
}
|
||||
|
||||
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
|
||||
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
|
||||
|
||||
// Result of:
|
||||
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
|
||||
if constexpr (num_bits == 4) {
|
||||
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
|
||||
|
||||
uint32_t res = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
res |= vals[pack_idx[i]] << (i * 4);
|
||||
}
|
||||
|
||||
out_ptr[out_offset + th_id * 4 + warp_id] = res;
|
||||
|
||||
} else {
|
||||
constexpr int pack_idx[4] = {0, 2, 1, 3};
|
||||
|
||||
uint32_t res1 = 0;
|
||||
uint32_t res2 = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
res1 |= vals[pack_idx[i]] << (i * 8);
|
||||
res2 |= vals[4 + pack_idx[i]] << (i * 8);
|
||||
}
|
||||
|
||||
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
|
||||
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
|
||||
}
|
||||
};
|
||||
|
||||
auto start_pipes = [&](int k_tile_id, int n_tile_id) {
|
||||
#pragma unroll
|
||||
for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
|
||||
fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
|
||||
}
|
||||
|
||||
wait_for_stage();
|
||||
};
|
||||
#pragma unroll
|
||||
for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
|
||||
int n_tile_id = 0;
|
||||
|
||||
start_pipes(k_tile_id, n_tile_id);
|
||||
|
||||
while (n_tile_id < n_tiles) {
|
||||
#pragma unroll
|
||||
for (int pipe = 0; pipe < repack_stages; pipe++) {
|
||||
fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
|
||||
n_tile_id + pipe + repack_stages - 1);
|
||||
repack_tile(pipe, k_tile_id, n_tile_id + pipe);
|
||||
wait_for_stage();
|
||||
}
|
||||
n_tile_id += repack_stages;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
#define CALL_IF(NUM_BITS) \
|
||||
else if (num_bits == NUM_BITS) { \
|
||||
cudaFuncSetAttribute( \
|
||||
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>, \
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
|
||||
marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS> \
|
||||
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
|
||||
b_q_weight_ptr, out_ptr, size_k, size_n); \
|
||||
}
|
||||
|
||||
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
|
||||
int64_t size_n, int64_t num_bits) {
|
||||
// Verify compatibility with marlin tile of 16x64
|
||||
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_k_size = ", marlin::tile_k_size);
|
||||
TORCH_CHECK(size_n % marlin::tile_n_size == 0, "size_n = ", size_n,
|
||||
" is not divisible by tile_n_size = ", marlin::tile_n_size);
|
||||
|
||||
TORCH_CHECK(num_bits == 4 || num_bits == 8,
|
||||
"num_bits must be 4 or 8. Got = ", num_bits);
|
||||
int const pack_factor = 32 / num_bits;
|
||||
|
||||
// Verify B
|
||||
TORCH_CHECK(b_q_weight.size(0) == size_k,
|
||||
"b_q_weight.size(0) = ", b_q_weight.size(0),
|
||||
" is not size_k = ", size_k);
|
||||
TORCH_CHECK((size_n / pack_factor) == b_q_weight.size(1),
|
||||
"Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1),
|
||||
", size_n = ", size_n, ", pack_factor = ", pack_factor);
|
||||
|
||||
// Verify device and strides
|
||||
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
|
||||
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
|
||||
TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
|
||||
|
||||
// Alloc buffers
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
|
||||
auto options = torch::TensorOptions()
|
||||
.dtype(b_q_weight.dtype())
|
||||
.device(b_q_weight.device());
|
||||
torch::Tensor out = torch::empty(
|
||||
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||
options);
|
||||
|
||||
// Get ptrs
|
||||
uint32_t const* b_q_weight_ptr =
|
||||
reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
|
||||
uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
|
||||
|
||||
// Get dev info
|
||||
int dev = b_q_weight.get_device();
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
int blocks;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
|
||||
int max_shared_mem = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_mem,
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
|
||||
TORCH_CHECK(max_shared_mem > 0);
|
||||
|
||||
  if (false) {
  }
  CALL_IF(4)
  CALL_IF(8)
  else {
    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits);
  }

  return out;
}

#endif
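As a quick sanity check on the `torch::empty` call above (illustrative only, not from the commit): with the 16x64 marlin tile, `tile_size` is 16, so for 4-bit weights (`pack_factor` = 8) a `size_k` x `size_n` matrix is repacked into `[size_k / 16, size_n * 16 / 8]` int32 words, preserving the total element count.

#include <cassert>
#include <cstdint>

// Illustrative geometry only; size_k/size_n are made-up example values.
void awq_repack_output_shape_example() {
  const int64_t size_k = 4096, size_n = 4096, num_bits = 4;
  const int64_t pack_factor = 32 / num_bits;  // 8 nibbles per uint32
  const int64_t tile_size = 16;               // marlin tile is 16x64
  const int64_t out_rows = size_k / tile_size;                // 256
  const int64_t out_cols = size_n * tile_size / pack_factor;  // 8192
  // Same number of packed elements before and after repacking.
  assert(out_rows * out_cols * pack_factor == size_k * size_n);
}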
|
@ -19,8 +19,8 @@
|
||||
* Adapted from https://github.com/IST-DASLab/marlin
|
||||
*/
|
||||
|
||||
#include "gptq_marlin.cuh"
|
||||
#include "gptq_marlin_dtypes.cuh"
|
||||
#include "marlin.cuh"
|
||||
#include "marlin_dtypes.cuh"
|
||||
|
||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||
static_assert(std::is_same<scalar_t, half>::value || \
|
||||
@ -32,7 +32,7 @@ inline std::string str(T x) {
|
||||
return std::to_string(x);
|
||||
}
|
||||
|
||||
namespace gptq_marlin {
|
||||
namespace marlin {
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
|
||||
@ -72,10 +72,11 @@ __global__ void Marlin(
|
||||
} // namespace gptq_marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
torch::Tensor& b_scales, torch::Tensor& g_idx,
|
||||
torch::Tensor& perm, torch::Tensor& workspace,
|
||||
int64_t num_bits, int64_t size_m, int64_t size_n,
|
||||
int64_t size_k, bool is_k_full) {
|
||||
torch::Tensor& b_scales, torch::Tensor& b_zeros,
|
||||
torch::Tensor& g_idx, torch::Tensor& perm,
|
||||
torch::Tensor& workspace, int64_t num_bits,
|
||||
int64_t size_m, int64_t size_n, int64_t size_k,
|
||||
bool is_k_full) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"marlin_gemm(..) requires CUDA_ARCH >= 8.0");
|
||||
return torch::empty({1, 1});
|
||||
@ -264,6 +265,114 @@ dequant_8bit<nv_bfloat16>(int q) {
|
||||
return frag_b;
|
||||
}
|
||||
|
||||
// Zero-point dequantizers
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit_zp(int q) {
|
||||
STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline typename ScalarType<half>::FragB dequant_4bit_zp<half>(
|
||||
int q) {
|
||||
const int LO = 0x000f000f;
|
||||
const int HI = 0x00f000f0;
|
||||
const int EX = 0x64006400;
|
||||
// Guarantee that the `(a & b) | c` operations are LOP3s.
|
||||
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
|
||||
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
|
||||
|
||||
const int SUB = 0x64006400;
|
||||
const int MUL = 0x2c002c00;
|
||||
const int ADD = 0xd400d400;
|
||||
typename ScalarType<half>::FragB frag_b;
|
||||
frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
|
||||
*reinterpret_cast<const half2*>(&SUB));
|
||||
frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
|
||||
*reinterpret_cast<const half2*>(&MUL),
|
||||
*reinterpret_cast<const half2*>(&ADD));
|
||||
return frag_b;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline typename ScalarType<nv_bfloat16>::FragB
|
||||
dequant_4bit_zp<nv_bfloat16>(int q) {
|
||||
static constexpr uint32_t MASK = 0x000f000f;
|
||||
static constexpr uint32_t EX = 0x43004300;
|
||||
|
||||
// Guarantee that the `(a & b) | c` operations are LOP3s.
|
||||
|
||||
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
|
||||
q >>= 4;
|
||||
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
|
||||
|
||||
typename ScalarType<nv_bfloat16>::FragB frag_b;
|
||||
static constexpr uint32_t MUL = 0x3F803F80;
|
||||
static constexpr uint32_t ADD = 0xC300C300;
|
||||
|
||||
frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
|
||||
*reinterpret_cast<const nv_bfloat162*>(&MUL),
|
||||
*reinterpret_cast<const nv_bfloat162*>(&ADD));
|
||||
frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
|
||||
*reinterpret_cast<const nv_bfloat162*>(&MUL),
|
||||
*reinterpret_cast<const nv_bfloat162*>(&ADD));
|
||||
return frag_b;
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit_zp(int q) {
|
||||
STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline typename ScalarType<half>::FragB dequant_8bit_zp<half>(
|
||||
int q) {
|
||||
static constexpr uint32_t mask_for_elt_01 = 0x5250;
|
||||
static constexpr uint32_t mask_for_elt_23 = 0x5351;
|
||||
static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
|
||||
|
||||
uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
|
||||
uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
|
||||
|
||||
static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
|
||||
|
||||
typename ScalarType<half>::FragB frag_b;
|
||||
frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
|
||||
*reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
|
||||
frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
|
||||
*reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
|
||||
return frag_b;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline typename ScalarType<nv_bfloat16>::FragB
|
||||
dequant_8bit_zp<nv_bfloat16>(int q) {
|
||||
typename ScalarType<nv_bfloat16>::FragB frag_b;
|
||||
|
||||
float fp32_intermediates[4];
|
||||
uint32_t* fp32_intermediates_casted =
|
||||
reinterpret_cast<uint32_t*>(fp32_intermediates);
|
||||
|
||||
static constexpr uint32_t fp32_base = 0x4B000000;
|
||||
fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
|
||||
fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
|
||||
fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
|
||||
fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
|
||||
|
||||
fp32_intermediates[0] -= 8388608.f;
|
||||
fp32_intermediates[1] -= 8388608.f;
|
||||
fp32_intermediates[2] -= 8388608.f;
|
||||
fp32_intermediates[3] -= 8388608.f;
|
||||
|
||||
uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
|
||||
bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
|
||||
fp32_intermediates_casted[1], 0x7632);
|
||||
bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
|
||||
fp32_intermediates_casted[3], 0x7632);
|
||||
|
||||
return frag_b;
|
||||
}
|
||||
|
||||
// Multiply dequantized values by the corresponding quantization scale; used
// only for grouped quantization.
template <typename scalar_t>
@@ -277,6 +386,17 @@ __device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
  frag_b[1] = __hmul2(frag_b[1], s);
}

template <typename scalar_t>
__device__ inline void sub_zp(typename ScalarType<scalar_t>::FragB& frag_b,
                              typename ScalarType<scalar_t>::scalar_t2& frag_zp,
                              int i) {
  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
  scalar_t2 zp =
      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_zp)[i]);
  frag_b[0] = __hsub2(frag_b[0], zp);
  frag_b[1] = __hsub2(frag_b[1], zp);
}
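In other words, on the zero-point (AWQ-style) path each fragment is reconstructed with the usual asymmetric dequantization, `sub_zp` providing the subtraction and `scale`/`scale4` the multiplication:

$$w \;\approx\; s \cdot (q - z)$$

with $q$ the packed integer weight, $z$ the group zero-point, and $s$ the group scale.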
|
||||
|
||||
// Same as above, but for act_order (each K is multiplied individually)
|
||||
template <typename scalar_t>
|
||||
__device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
|
||||
@ -404,6 +524,7 @@ template <typename scalar_t, // compute dtype, half or nv_float16
|
||||
const int stages, // number of stages for the async global->shared
|
||||
// fetch pipeline
|
||||
const bool has_act_order, // whether act_order is enabled
|
||||
const bool has_zp, // whether zero-points are enabled
|
||||
const int group_blocks = -1 // number of consecutive 16x16 blocks
|
||||
// with a separate quantization scale
|
||||
>
|
||||
@ -413,6 +534,8 @@ __global__ void Marlin(
|
||||
int4* __restrict__ C, // fp16 output buffer of shape mxn
|
||||
const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
|
||||
// (k/groupsize)xn
|
||||
const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape
|
||||
// (k/groupsize)x(n/pack_factor)
|
||||
const int* __restrict__ g_idx, // int32 group indices of shape k
|
||||
int num_groups, // number of scale groups per output channel
|
||||
int prob_m, // batch dimension m
|
||||
@ -437,6 +560,7 @@ __global__ void Marlin(
|
||||
using FragB = typename ScalarType<scalar_t>::FragB;
|
||||
using FragC = typename ScalarType<scalar_t>::FragC;
|
||||
using FragS = typename ScalarType<scalar_t>::FragS;
|
||||
using FragZP = typename ScalarType<scalar_t>::FragZP;
|
||||
|
||||
constexpr int pack_factor = 32 / num_bits;
|
||||
|
||||
@ -566,6 +690,13 @@ __global__ void Marlin(
|
||||
int tb_n_warps = thread_n_blocks / 4;
|
||||
int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
|
||||
|
||||
// Zero-points sizes/strides
|
||||
int zp_gl_stride = (prob_n / pack_factor) / 4;
|
||||
constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4;
|
||||
constexpr int zp_tb_groups = s_tb_groups;
|
||||
constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
|
||||
int zp_gl_rd_delta = zp_gl_stride;
|
||||
|
||||
// Global A read index of current thread.
|
||||
int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
|
||||
(threadIdx.x % a_gl_rd_delta_o);
|
||||
@ -605,6 +736,19 @@ __global__ void Marlin(
|
||||
int s_sh_wr = threadIdx.x;
|
||||
bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
|
||||
|
||||
// Zero-points
|
||||
int zp_gl_rd;
|
||||
if constexpr (has_zp) {
|
||||
if constexpr (group_blocks == -1) {
|
||||
zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
|
||||
} else {
|
||||
zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
|
||||
zp_sh_stride * slice_col + threadIdx.x;
|
||||
}
|
||||
}
|
||||
int zp_sh_wr = threadIdx.x;
|
||||
bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
|
||||
|
||||
// We use a different scale layout for grouped and column-wise quantization as
|
||||
// we scale a `half2` tile in column-major layout in the former and in
|
||||
// row-major in the latter case.
|
||||
@ -616,6 +760,18 @@ __global__ void Marlin(
|
||||
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
|
||||
(threadIdx.x % 32) % 4;
|
||||
|
||||
// Zero-points have the same read layout as the scales
|
||||
// (without column-wise case)
|
||||
constexpr int num_col_threads = 8;
|
||||
constexpr int num_row_threads = 4;
|
||||
constexpr int num_ints_per_thread = 8 / pack_factor;
|
||||
int zp_sh_rd;
|
||||
if constexpr (has_zp) {
|
||||
zp_sh_rd = num_ints_per_thread * num_col_threads *
|
||||
((threadIdx.x / 32) % (thread_n_blocks / 4)) +
|
||||
num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
|
||||
}
|
||||
|
||||
// Precompute which thread should not read memory in which iterations; this is
|
||||
// needed if there are more threads than required for a certain tilesize or
|
||||
// when the batchsize is not a multiple of 16.
|
||||
@ -664,14 +820,17 @@ __global__ void Marlin(
|
||||
int4* sh_a = sh;
|
||||
int4* sh_b = sh_a + (stages * a_sh_stage);
|
||||
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
|
||||
int4* sh_s = sh_g_idx + (stages * g_idx_stage);
|
||||
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
|
||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||
|
||||
// Register storage for double buffer of shared memory reads.
|
||||
FragA frag_a[2][thread_m_blocks];
|
||||
I4 frag_b_quant[2][b_thread_vecs];
|
||||
FragC frag_c[thread_m_blocks][4][2];
|
||||
FragS frag_s[2][4]; // No act-order
|
||||
FragS act_frag_s[2][4][4]; // For act-order
|
||||
FragS frag_s[2][4]; // No act-order
|
||||
FragS act_frag_s[2][4][4]; // For act-order
|
||||
int frag_qzp[2][num_ints_per_thread]; // Zero-points
|
||||
FragZP frag_zp; // Zero-points in fp16
|
||||
|
||||
// Zero accumulators.
|
||||
auto zero_accums = [&]() {
|
||||
@ -777,6 +936,28 @@ __global__ void Marlin(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (has_zp && group_blocks != -1) {
|
||||
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
|
||||
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
// Only fetch zero-points if this tile starts a new group
|
||||
if (pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||
if (zp_sh_wr_pred) {
|
||||
cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
|
||||
}
|
||||
zp_gl_rd += zp_gl_rd_delta;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < zp_tb_groups; i++) {
|
||||
if (zp_sh_wr_pred) {
|
||||
cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr],
|
||||
&zp_ptr[zp_gl_rd]);
|
||||
}
|
||||
zp_gl_rd += zp_gl_rd_delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Insert a fence even when we are winding down the pipeline to ensure that
|
||||
@ -784,6 +965,12 @@ __global__ void Marlin(
|
||||
cp_async_fence();
|
||||
};
|
||||
|
||||
auto fetch_zp_to_shared = [&]() {
|
||||
if (zp_sh_wr_pred) {
|
||||
cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]);
|
||||
}
|
||||
};
|
||||
|
||||
// Wait until the next thread tile has been loaded to shared memory.
|
||||
auto wait_for_stage = [&]() {
|
||||
// We only have `stages - 2` active fetches since we are double buffering
|
||||
@ -932,8 +1119,73 @@ __global__ void Marlin(
|
||||
}
|
||||
};
|
||||
|
||||
auto fetch_zp_to_registers = [&](int k, int full_pipe) {
|
||||
if constexpr (!has_zp) {
|
||||
return;
|
||||
}
|
||||
|
||||
int pipe = full_pipe % stages;
|
||||
|
||||
if constexpr (group_blocks == -1) {
|
||||
for (int i = 0; i < num_ints_per_thread; i++) {
|
||||
frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp))[zp_sh_rd + i];
|
||||
}
|
||||
|
||||
} else if constexpr (group_blocks >= thread_k_blocks) {
|
||||
int4* sh_zp_stage =
|
||||
sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) *
|
||||
(pipe / (group_blocks / thread_k_blocks)));
|
||||
for (int i = 0; i < num_ints_per_thread; i++) {
|
||||
frag_qzp[k % 2][i] =
|
||||
(reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
|
||||
}
|
||||
} else {
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int n_warps = thread_n_blocks / 4;
|
||||
|
||||
int warp_row = warp_id / n_warps;
|
||||
|
||||
int cur_k = warp_row * 16;
|
||||
cur_k += k_iter_size * (k % b_sh_wr_iters);
|
||||
|
||||
int k_blocks = cur_k / 16;
|
||||
int cur_group_id = k_blocks / group_blocks;
|
||||
|
||||
int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
|
||||
|
||||
sh_zp_stage += cur_group_id * zp_sh_stride;
|
||||
|
||||
for (int i = 0; i < num_ints_per_thread; i++) {
|
||||
frag_qzp[k % 2][i] =
|
||||
(reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Execute the actual tensor core matmul of a sub-tile.
|
||||
auto matmul = [&](int k) {
|
||||
if constexpr (has_zp) {
|
||||
FragB frag_zp_0;
|
||||
FragB frag_zp_1;
|
||||
if constexpr (num_bits == 4) {
|
||||
int zp_quant = frag_qzp[k % 2][0];
|
||||
int zp_quant_shift = zp_quant >> 8;
|
||||
frag_zp_0 = dequant_4bit_zp<scalar_t>(zp_quant);
|
||||
frag_zp_1 = dequant_4bit_zp<scalar_t>(zp_quant_shift);
|
||||
|
||||
} else {
|
||||
int zp_quant_0 = frag_qzp[k % 2][0];
|
||||
int zp_quant_1 = frag_qzp[k % 2][1];
|
||||
frag_zp_0 = dequant_8bit_zp<scalar_t>(zp_quant_0);
|
||||
frag_zp_1 = dequant_8bit_zp<scalar_t>(zp_quant_1);
|
||||
}
|
||||
|
||||
frag_zp[0] = frag_zp_0[0];
|
||||
frag_zp[1] = frag_zp_0[1];
|
||||
frag_zp[2] = frag_zp_1[0];
|
||||
frag_zp[3] = frag_zp_1[1];
|
||||
}
|
||||
|
||||
// We have the m dimension as the inner loop in order to encourage overlapping
|
||||
// dequantization and matmul operations.
|
||||
#pragma unroll
|
||||
@ -944,16 +1196,32 @@ __global__ void Marlin(
|
||||
int b_quant = frag_b_quant[k % 2][0][j];
|
||||
int b_quant_shift = b_quant >> 8;
|
||||
|
||||
frag_b0 = dequant_4bit<scalar_t>(b_quant);
|
||||
frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);
|
||||
if constexpr (has_zp) {
|
||||
frag_b0 = dequant_4bit_zp<scalar_t>(b_quant);
|
||||
frag_b1 = dequant_4bit_zp<scalar_t>(b_quant_shift);
|
||||
|
||||
} else {
|
||||
frag_b0 = dequant_4bit<scalar_t>(b_quant);
|
||||
frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);
|
||||
}
|
||||
|
||||
} else {
|
||||
int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
|
||||
int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
|
||||
int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
|
||||
|
||||
frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
|
||||
frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
|
||||
if constexpr (has_zp) {
|
||||
frag_b0 = dequant_8bit_zp<scalar_t>(b_quant_0);
|
||||
frag_b1 = dequant_8bit_zp<scalar_t>(b_quant_1);
|
||||
} else {
|
||||
frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
|
||||
frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply zero-point to frag_b0
|
||||
if constexpr (has_zp) {
|
||||
sub_zp<scalar_t>(frag_b0, frag_zp[j], 0);
|
||||
}
|
||||
|
||||
// Apply scale to frag_b0
|
||||
@ -967,6 +1235,11 @@ __global__ void Marlin(
|
||||
}
|
||||
}
|
||||
|
||||
// Apply zero-point to frag_b1
|
||||
if constexpr (has_zp) {
|
||||
sub_zp<scalar_t>(frag_b1, frag_zp[j], 1);
|
||||
}
|
||||
|
||||
// Apply scale to frag_b1
|
||||
if constexpr (has_act_order) {
|
||||
scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
|
||||
@ -1189,6 +1462,12 @@ __global__ void Marlin(
|
||||
}
|
||||
fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
|
||||
}
|
||||
|
||||
if constexpr (has_zp && group_blocks == -1) {
|
||||
if (i == 0) {
|
||||
fetch_zp_to_shared();
|
||||
}
|
||||
}
|
||||
fetch_to_shared(i, i, i < slice_iters);
|
||||
}
|
||||
|
||||
@ -1197,6 +1476,7 @@ __global__ void Marlin(
|
||||
init_same_group(0);
|
||||
fetch_to_registers(0, 0);
|
||||
fetch_scales_to_registers(0, 0);
|
||||
fetch_zp_to_registers(0, 0);
|
||||
a_gl_rd += a_gl_rd_delta_o * (stages - 1);
|
||||
slice_k_start_shared_fetch += tb_k * (stages - 1);
|
||||
};
|
||||
@ -1217,6 +1497,7 @@ __global__ void Marlin(
|
||||
for (int k = 0; k < b_sh_wr_iters; k++) {
|
||||
fetch_to_registers(k + 1, pipe % stages);
|
||||
fetch_scales_to_registers(k + 1, pipe);
|
||||
fetch_zp_to_registers(k + 1, pipe);
|
||||
if (k == b_sh_wr_iters - 2) {
|
||||
fetch_to_shared((pipe + stages - 1) % stages, pipe,
|
||||
slice_iters >= stages);
|
||||
@ -1354,6 +1635,7 @@ __global__ void Marlin(
|
||||
|
||||
} else {
|
||||
s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
|
||||
zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
|
||||
}
|
||||
|
||||
start_pipes();
|
||||
@ -1363,22 +1645,24 @@ __global__ void Marlin(
|
||||
}
|
||||
|
||||
#define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
|
||||
THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \
|
||||
THREAD_K_BLOCKS, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \
|
||||
NUM_THREADS) \
|
||||
else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \
|
||||
thread_n_blocks == THREAD_N_BLOCKS && \
|
||||
thread_k_blocks == THREAD_K_BLOCKS && \
|
||||
has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
|
||||
num_threads == NUM_THREADS) { \
|
||||
has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \
|
||||
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \
|
||||
cudaFuncSetAttribute( \
|
||||
Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS, \
|
||||
THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
|
||||
GROUP_BLOCKS>, \
|
||||
HAS_ZP, GROUP_BLOCKS>, \
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
|
||||
Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS, \
|
||||
THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
|
||||
GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
|
||||
A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n, \
|
||||
prob_k, locks); \
|
||||
HAS_ZP, GROUP_BLOCKS> \
|
||||
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
|
||||
A_ptr, B_ptr, C_ptr, s_ptr, zp_ptr, g_idx_ptr, num_groups, \
|
||||
prob_m, prob_n, prob_k, locks); \
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
@ -1548,39 +1832,61 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
|
||||
return exec_config_t{0, {-1, -1, -1}};
|
||||
}
|
||||
|
||||
#define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
|
||||
#define GPTQ_CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS)
|
||||
|
||||
#define AWQ_CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \
|
||||
\
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \
|
||||
__CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS)
|
||||
|
||||
template <typename scalar_t>
|
||||
void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
|
||||
void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp,
|
||||
void* g_idx, void* perm, void* a_tmp, int prob_m,
|
||||
int prob_n, int prob_k, void* workspace, int num_bits,
|
||||
bool has_act_order, bool is_k_full, int num_groups,
|
||||
int group_size, int dev, cudaStream_t stream, int thread_k,
|
||||
int thread_n, int sms, int max_par) {
|
||||
bool has_act_order, bool is_k_full, bool has_zp,
|
||||
int num_groups, int group_size, int dev,
|
||||
cudaStream_t stream, int thread_k, int thread_n, int sms,
|
||||
int max_par) {
|
||||
TORCH_CHECK(num_bits == 4 || num_bits == 8,
|
||||
"num_bits must be 4 or 8. Got = ", num_bits);
|
||||
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
|
||||
@ -1665,6 +1971,7 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
|
||||
const int4* B_ptr = (const int4*)B;
|
||||
int4* C_ptr = (int4*)C;
|
||||
const int4* s_ptr = (const int4*)s;
|
||||
const int4* zp_ptr = (const int4*)zp;
|
||||
const int* g_idx_ptr = (const int*)g_idx;
|
||||
const int* perm_ptr = (const int*)perm;
|
||||
int4* a_tmp_ptr = (int4*)a_tmp;
|
||||
@ -1701,28 +2008,33 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
|
||||
thread_m_blocks = exec_cfg.max_m_blocks;
|
||||
}
|
||||
|
||||
// Define kernel configurations
|
||||
if (false) {
|
||||
}
|
||||
CALL_IF(4, 32, 2, 256)
|
||||
CALL_IF(4, 16, 4, 256)
|
||||
CALL_IF(4, 8, 8, 256)
|
||||
CALL_IF(4, 8, 4, 128)
|
||||
CALL_IF(4, 4, 8, 128)
|
||||
CALL_IF(8, 32, 2, 256)
|
||||
CALL_IF(8, 16, 4, 256)
|
||||
CALL_IF(8, 8, 8, 256)
|
||||
CALL_IF(8, 8, 4, 128)
|
||||
CALL_IF(8, 4, 8, 128)
|
||||
GPTQ_CALL_IF(4, 16, 4, 256)
|
||||
GPTQ_CALL_IF(4, 8, 8, 256)
|
||||
GPTQ_CALL_IF(4, 8, 4, 128)
|
||||
GPTQ_CALL_IF(4, 4, 8, 128)
|
||||
GPTQ_CALL_IF(8, 16, 4, 256)
|
||||
GPTQ_CALL_IF(8, 8, 8, 256)
|
||||
GPTQ_CALL_IF(8, 8, 4, 128)
|
||||
GPTQ_CALL_IF(8, 4, 8, 128)
|
||||
|
||||
AWQ_CALL_IF(4, 16, 4, 256)
|
||||
AWQ_CALL_IF(4, 8, 8, 256)
|
||||
AWQ_CALL_IF(4, 8, 4, 128)
|
||||
AWQ_CALL_IF(4, 4, 8, 128)
|
||||
AWQ_CALL_IF(8, 16, 4, 256)
|
||||
AWQ_CALL_IF(8, 8, 8, 256)
|
||||
AWQ_CALL_IF(8, 8, 4, 128)
|
||||
AWQ_CALL_IF(8, 4, 8, 128)
|
||||
else {
|
||||
TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
|
||||
str(prob_n) + ", " + str(prob_k) + "]" +
|
||||
", has_act_order = " + str(has_act_order) +
|
||||
", num_groups = " + str(num_groups) +
|
||||
", group_size = " + str(group_size) +
|
||||
", thread_m_blocks = " + str(thread_m_blocks) +
|
||||
", thread_n_blocks = " + str(thread_n_blocks) +
|
||||
", thread_k_blocks = " + str(thread_k_blocks));
|
||||
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
|
||||
", ", prob_k, "]", ", has_act_order = ", has_act_order,
|
||||
", num_groups = ", num_groups, ", group_size = ", group_size,
|
||||
", thread_m_blocks = ", thread_m_blocks,
|
||||
", thread_n_blocks = ", thread_n_blocks,
|
||||
", thread_k_blocks = ", thread_k_blocks,
|
||||
", num_bits = ", num_bits);
|
||||
}
|
||||
|
||||
A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
|
||||
@ -1733,10 +2045,11 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
|
||||
} // namespace gptq_marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
torch::Tensor& b_scales, torch::Tensor& g_idx,
|
||||
torch::Tensor& perm, torch::Tensor& workspace,
|
||||
int64_t num_bits, int64_t size_m, int64_t size_n,
|
||||
int64_t size_k, bool is_k_full) {
|
||||
torch::Tensor& b_scales, torch::Tensor& b_zeros,
|
||||
torch::Tensor& g_idx, torch::Tensor& perm,
|
||||
torch::Tensor& workspace, int64_t num_bits,
|
||||
int64_t size_m, int64_t size_n, int64_t size_k,
|
||||
bool is_k_full, bool has_zp) {
|
||||
// Verify num_bits
|
||||
TORCH_CHECK(num_bits == 4 || num_bits == 8,
|
||||
"num_bits must be 4 or 8. Got = ", num_bits);
|
||||
@ -1749,16 +2062,15 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
", size_k = ", size_k);
|
||||
|
||||
// Verify B
|
||||
TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_size = ", gptq_marlin::tile_size);
|
||||
TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
|
||||
TORCH_CHECK(size_k % marlin::tile_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_size = ", marlin::tile_size);
|
||||
TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0),
|
||||
"Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
|
||||
", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
|
||||
TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
|
||||
", size_k = ", size_k, ", tile_size = ", marlin::tile_size);
|
||||
TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0,
|
||||
"b_q_weight.size(1) = ", b_q_weight.size(1),
|
||||
" is not divisible by tile_size = ", gptq_marlin::tile_size);
|
||||
int actual_size_n =
|
||||
(b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
|
||||
" is not divisible by tile_size = ", marlin::tile_size);
|
||||
int actual_size_n = (b_q_weight.size(1) / marlin::tile_size) * pack_factor;
|
||||
TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
|
||||
", actual_size_n = ", actual_size_n);
|
||||
|
||||
@ -1772,6 +2084,9 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
|
||||
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
|
||||
|
||||
TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
|
||||
TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
|
||||
|
||||
TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
|
||||
TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
|
||||
|
||||
@ -1805,8 +2120,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
int group_size = -1;
|
||||
bool has_act_order = g_idx.size(0) != 0;
|
||||
|
||||
int b_rank = b_scales.sizes().size();
|
||||
TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
|
||||
int rank = b_scales.sizes().size();
|
||||
TORCH_CHECK(rank == 2, "b_scales rank = ", rank, " is not 2");
|
||||
TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
|
||||
" is not size_n = ", size_n);
|
||||
num_groups = b_scales.size(0);
|
||||
@ -1832,34 +2147,44 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||
}
|
||||
}
|
||||
|
||||
  // Verify b_zeros
  if (has_zp) {
    int rank = b_zeros.sizes().size();
    TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2");
    TORCH_CHECK(b_zeros.size(0) == num_groups,
                "b_zeros dim 0 = ", b_zeros.size(0),
                " is not num_groups = ", num_groups);
    TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor,
                "b_zeros dim 1 = ", b_scales.size(1),
                " is not size_n / pack_factor = ", size_n / pack_factor);
  }
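To make the two shape checks above concrete, a small worked example with hypothetical sizes (not taken from the diff):

#include <cassert>
#include <cstdint>

// Mirrors the b_zeros TORCH_CHECKs above with made-up sizes.
void b_zeros_shape_example() {
  const int64_t num_bits = 4, pack_factor = 32 / num_bits;  // 8
  const int64_t size_k = 4096, size_n = 4096, group_size = 128;
  const int64_t num_groups = size_k / group_size;   // 32
  const int64_t zeros_cols = size_n / pack_factor;  // 512
  // b_zeros is expected to have shape [num_groups, zeros_cols] == [32, 512].
  assert(num_groups == 32 && zeros_cols == 512);
}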
|
||||
|
||||
// Verify workspace size
|
||||
TORCH_CHECK(
|
||||
size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
|
||||
", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
|
||||
int min_workspace_size =
|
||||
(size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
|
||||
TORCH_CHECK(size_n % marlin::min_thread_n == 0, "size_n = ", size_n,
|
||||
", is not divisible by min_thread_n = ", marlin::min_thread_n);
|
||||
int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par;
|
||||
TORCH_CHECK(workspace.numel() >= min_workspace_size,
|
||||
"workspace.numel = ", workspace.numel(),
|
||||
" is below min_workspace_size = ", min_workspace_size);
|
||||
|
||||
int dev = a.get_device();
|
||||
if (a.scalar_type() == at::ScalarType::Half) {
|
||||
gptq_marlin::marlin_mm_f16i4<half>(
|
||||
marlin::marlin_mm_f16i4<half>(
|
||||
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
|
||||
b_scales.data_ptr<at::Half>(), g_idx.data_ptr(), perm.data_ptr(),
|
||||
a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
|
||||
workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups,
|
||||
group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
|
||||
thread_n, sms, gptq_marlin::max_par);
|
||||
b_scales.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
|
||||
perm.data_ptr(), a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
|
||||
workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par);
|
||||
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
|
||||
gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
|
||||
marlin::marlin_mm_f16i4<nv_bfloat16>(
|
||||
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
|
||||
c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
|
||||
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
|
||||
size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order,
|
||||
is_k_full, num_groups, group_size, dev,
|
||||
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
|
||||
gptq_marlin::max_par);
|
||||
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
|
||||
a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
|
||||
workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp,
|
||||
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
|
||||
thread_k, thread_n, sms, marlin::max_par);
|
||||
} else {
|
||||
TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
|
||||
}
|
||||
|
@ -1,23 +1,16 @@
|
||||
#include "gptq_marlin.cuh"
|
||||
|
||||
namespace gptq_marlin {
|
||||
|
||||
static constexpr int repack_stages = 8;
|
||||
|
||||
static constexpr int repack_threads = 256;
|
||||
|
||||
static constexpr int tile_k_size = tile_size;
|
||||
static constexpr int tile_n_size = tile_k_size * 4;
|
||||
#include "marlin.cuh"
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
|
||||
namespace marlin {
|
||||
|
||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
||||
__global__ void marlin_repack_kernel(
|
||||
__global__ void gptq_marlin_repack_kernel(
|
||||
uint32_t const* __restrict__ b_q_weight_ptr,
|
||||
uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
|
||||
int size_k, int size_n) {}
|
||||
|
||||
} // namespace gptq_marlin
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
int64_t size_k, int64_t size_n,
|
||||
@ -29,8 +22,10 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
|
||||
#else
|
||||
|
||||
namespace marlin {
|
||||
|
||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
||||
__global__ void marlin_repack_kernel(
|
||||
__global__ void gptq_marlin_repack_kernel(
|
||||
uint32_t const* __restrict__ b_q_weight_ptr,
|
||||
uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
|
||||
int size_k, int size_n) {
|
||||
@ -259,28 +254,28 @@ __global__ void marlin_repack_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace gptq_marlin
|
||||
} // namespace marlin
|
||||
|
||||
#define CALL_IF(NUM_BITS, HAS_PERM) \
|
||||
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
|
||||
cudaFuncSetAttribute( \
|
||||
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, \
|
||||
NUM_BITS, HAS_PERM>, \
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
|
||||
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
|
||||
HAS_PERM> \
|
||||
<<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>( \
|
||||
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
|
||||
#define CALL_IF(NUM_BITS, HAS_PERM) \
|
||||
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
|
||||
cudaFuncSetAttribute( \
|
||||
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
|
||||
HAS_PERM>, \
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
|
||||
marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, \
|
||||
HAS_PERM> \
|
||||
<<<blocks, marlin::repack_threads, max_shared_mem, stream>>>( \
|
||||
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
|
||||
}
|
||||
|
||||
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
int64_t size_k, int64_t size_n,
|
||||
int64_t num_bits) {
|
||||
// Verify compatibility with marlin tile of 16x64
|
||||
TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
|
||||
TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
|
||||
" is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
|
||||
TORCH_CHECK(size_k % marlin::tile_k_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_k_size = ", marlin::tile_k_size);
|
||||
TORCH_CHECK(size_n % marlin::tile_n_size == 0, "size_n = ", size_n,
|
||||
" is not divisible by tile_n_size = ", marlin::tile_n_size);
|
||||
|
||||
TORCH_CHECK(num_bits == 4 || num_bits == 8,
|
||||
"num_bits must be 4 or 8. Got = ", num_bits);
|
||||
@ -308,10 +303,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
||||
auto options = torch::TensorOptions()
|
||||
.dtype(b_q_weight.dtype())
|
||||
.device(b_q_weight.device());
|
||||
torch::Tensor out =
|
||||
torch::empty({size_k / gptq_marlin::tile_size,
|
||||
size_n * gptq_marlin::tile_size / pack_factor},
|
||||
options);
|
||||
torch::Tensor out = torch::empty(
|
||||
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
|
||||
options);
|
||||
|
||||
// Detect if there is act_order
|
||||
bool has_perm = perm.size(0) != 0;
|
||||
|
@ -9,7 +9,9 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
namespace gptq_marlin {
|
||||
namespace marlin {
|
||||
|
||||
// Marlin params
|
||||
|
||||
// 8 warps are a good choice since every SM has 4 schedulers and having more
|
||||
// than 1 warp per schedule allows some more latency hiding. At the same time,
|
||||
@ -25,6 +27,15 @@ static constexpr int min_thread_k = 64;
|
||||
static constexpr int tile_size = 16;
|
||||
static constexpr int max_par = 16;
|
||||
|
||||
// Repack params
|
||||
static constexpr int repack_stages = 8;
|
||||
|
||||
static constexpr int repack_threads = 256;
|
||||
|
||||
static constexpr int tile_k_size = tile_size;
|
||||
static constexpr int tile_n_size = tile_k_size * 4;
|
||||
|
||||
// Helpers
|
||||
template <typename T, int n>
|
||||
struct Vec {
|
||||
T elems[n];
|
||||
@ -73,4 +84,4 @@ __device__ inline void cp_async_wait() {
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace gptq_marlin
|
||||
} // namespace marlin
|
@ -1,11 +1,11 @@
|
||||
|
||||
#ifndef _data_types_cuh
|
||||
#define _data_types_cuh
|
||||
#include "gptq_marlin.cuh"
|
||||
#include "marlin.cuh"
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
|
||||
namespace gptq_marlin {
|
||||
namespace marlin {
|
||||
|
||||
template <typename scalar_t>
|
||||
class ScalarType {};
|
||||
@ -23,6 +23,7 @@ class ScalarType<half> {
|
||||
using FragB = Vec<half2, 2>;
|
||||
using FragC = Vec<float, 4>;
|
||||
using FragS = Vec<half2, 1>;
|
||||
using FragZP = Vec<half2, 4>;
|
||||
|
||||
static __device__ float inline num2float(const half x) {
|
||||
return __half2float(x);
|
||||
@ -51,6 +52,7 @@ class ScalarType<nv_bfloat16> {
|
||||
using FragB = Vec<nv_bfloat162, 2>;
|
||||
using FragC = Vec<float, 4>;
|
||||
using FragS = Vec<nv_bfloat162, 1>;
|
||||
using FragZP = Vec<nv_bfloat162, 4>;
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
static __device__ float inline num2float(const nv_bfloat16 x) {
|
||||
@ -72,6 +74,6 @@ class ScalarType<nv_bfloat16> {
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace gptq_marlin
|
||||
} // namespace marlin
|
||||
|
||||
#endif
|
@ -30,7 +30,7 @@ inline std::string str(T x) {
|
||||
return std::to_string(x);
|
||||
}
|
||||
|
||||
namespace marlin {
namespace marlin_dense {

constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }

@@ -1040,7 +1040,7 @@ void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m,
}
}

} // namespace marlin
} // namespace marlin_dense

torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& workspace,
@@ -1054,24 +1054,25 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
TORCH_CHECK(size_k == a.size(1),
"Shape mismatch: a.size(1) = " + str(a.size(1)) +
", size_k = " + str(size_k));
TORCH_CHECK(size_k % marlin::tile_size == 0,
"size_k = " + str(size_k) +
" is not divisible by tile_size = " + str(marlin::tile_size));
TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0),
TORCH_CHECK(size_k % marlin_dense::tile_size == 0,
"size_k = " + str(size_k) + " is not divisible by tile_size = " +
str(marlin_dense::tile_size));
TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0),
"Shape mismatch: b_q_weight.size(0) = " +
str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
", tile_size = " + str(marlin::tile_size));
", tile_size = " + str(marlin_dense::tile_size));

// Verify N
TORCH_CHECK(b_scales.size(1) == size_n,
"b_scales.size(1) = " + str(b_scales.size(1)) +
", size_n = " + str(size_n));
TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0,
"b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
" is not divisible by tile_size = " + str(marlin::tile_size));
TORCH_CHECK(
b_q_weight.size(1) % marlin_dense::tile_size == 0,
"b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
" is not divisible by tile_size = " + str(marlin_dense::tile_size));

int actual_size_n =
(b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit;
int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) *
marlin_dense::pack_factor_4bit;
TORCH_CHECK(
size_n == actual_size_n,
"size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n));
@@ -1116,21 +1117,22 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
"Unexpected groupsize = " + str(groupsize));

// Verify workspace size
TORCH_CHECK(
size_n % marlin::min_thread_n == 0,
"size_n = " + str(size_n) +
", is not divisible by min_thread_n = " + str(marlin::min_thread_n));
int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par;
TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0,
"size_n = " + str(size_n) +
", is not divisible by min_thread_n = " +
str(marlin_dense::min_thread_n));
int min_workspace_size =
(size_n / marlin_dense::min_thread_n) * marlin_dense::max_par;
TORCH_CHECK(workspace.numel() >= min_workspace_size,
"workspace.numel = " + str(workspace.numel()) +
" is below min_workspace_size = " + str(min_workspace_size));

int dev = a.get_device();
marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(),
b_scales.data_ptr(), size_m, size_n, size_k,
workspace.data_ptr(), groupsize, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n,
sms, marlin::max_par);
marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(),
b_scales.data_ptr(), size_m, size_n, size_k,
workspace.data_ptr(), groupsize, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k,
thread_n, sms, marlin_dense::max_par);

return c;
}
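Read as plain arithmetic, the checks above pin down the relationship between the packed 4-bit weight shape and the logical GEMM shape. A rough Python sketch of the same constraints (illustrative only; the tile_size, pack_factor_4bit, min_thread_n and max_par values below are assumptions for illustration, not necessarily the kernel's actual constants):

.. code-block:: python

   TILE_SIZE = 16
   PACK_FACTOR_4BIT = 8   # eight 4-bit values packed per 32-bit word
   MIN_THREAD_N = 64
   MAX_PAR = 16

   def check_marlin_shapes(size_n: int, size_k: int,
                           b_q_weight_shape: tuple,
                           workspace_numel: int) -> None:
       # Mirrors the TORCH_CHECK conditions in marlin_gemm.
       assert size_k % TILE_SIZE == 0
       assert b_q_weight_shape[0] == size_k // TILE_SIZE
       assert b_q_weight_shape[1] % TILE_SIZE == 0
       assert size_n == (b_q_weight_shape[1] // TILE_SIZE) * PACK_FACTOR_4BIT
       assert size_n % MIN_THREAD_N == 0
       assert workspace_numel >= (size_n // MIN_THREAD_N) * MAX_PAR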
@@ -27,8 +27,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
@@ -41,8 +41,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
@@ -72,6 +72,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);

// prepare_inputs advance_step
ops.def("advance_step", &advance_step);
ops.impl("advance_step", torch::kCUDA, &advance_step);

// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
@@ -137,6 +141,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gptq_marlin_repack", &gptq_marlin_repack);
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);

// awq_marlin repack from AWQ.
ops.def("awq_marlin_repack", &awq_marlin_repack);
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);

// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
@@ -175,12 +183,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);

// Compute FP8 quantized tensor and scaling factor.
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
ops.def(
"dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
"()");
ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);

// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
ops.def(
"dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
"scale, Tensor? scale_ub) -> "
"()");
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
&dynamic_per_token_scaled_fp8_quant);
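The per-token variant produces one scale per row of the input instead of a single scale for the whole tensor. A minimal PyTorch sketch of the intended math (illustrative only, not the CUDA kernel; the 448.0 maximum assumes the E4M3 FP8 format):

.. code-block:: python

   from typing import Optional

   import torch

   FP8_E4M3_MAX = 448.0  # assumed finite max of torch.float8_e4m3fn

   def dynamic_per_token_fp8_quant(x: torch.Tensor,
                                   scale_ub: Optional[float] = None):
       # One scale per token (row), derived from that row's absolute maximum.
       amax = x.abs().amax(dim=-1, keepdim=True).float()
       if scale_ub is not None:
           amax = amax.clamp(max=scale_ub)
       scale = (amax / FP8_E4M3_MAX).clamp(min=1e-12)
       q = (x.float() / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
       return q.to(torch.float8_e4m3fn), scale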
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
ops.def(
@@ -223,7 +239,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor! key_cache, Tensor! value_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" float kv_scale) -> ()");
" float k_scale, float v_scale) -> ()");
cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);

// Reshape the key and value tensors and cache them.
@@ -2,7 +2,7 @@ sphinx==6.2.1
sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2
myst-parser==2.0.0
sphinx-argparse
sphinx-argparse==0.4.0

# packages to install to build the documentation
pydantic
|
@ -5,6 +5,7 @@
justify-content: center;
align-items: center;
font-size: 16px;
padding: 0 6px 0 6px;
}
.notification-bar p {
margin: 0;
|
@ -13,6 +13,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
|
@ -5,10 +5,10 @@ Input Processing
|
||||
.. currentmodule:: vllm.inputs

vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
in :class:`~vllm.LLMEngine` before they are passed to model executors.
Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.

Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input
Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input
data in addition to input prompt, but it can be extended to text-only language models when needed.
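As a rough sketch of how a model hooks into this registry (the decorator mirrors the ``INPUT_REGISTRY`` API referenced above, but treat the exact name and signature as assumptions):

.. code-block:: python

   import torch.nn as nn

   from vllm.inputs import INPUT_REGISTRY

   def input_processor_for_my_model(ctx, llm_inputs):
       # Rewrite the prompt or token ids before they reach the model executor,
       # e.g. expand multi-modal placeholder tokens.
       return llm_inputs

   @INPUT_REGISTRY.register_input_processor(input_processor_for_my_model)
   class MyModelForCausalLM(nn.Module):
       ...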
Guides
docs/source/dev/multimodal/adding_multimodal_plugin.rst (new file, 17 lines)
@@ -0,0 +1,17 @@
.. _adding_multimodal_plugin:

Adding a Multimodal Plugin
==========================

This document teaches you how to add a new modality to vLLM.

Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.

The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.

.. note::
This article is a work in progress.

..
TODO: Add more instructions on how to add new plugins once embeddings is in.
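As a hedged sketch of what registering a custom modality could look like (the plugin class, module path, and method names below are assumptions for illustration; the exact ``MultiModalPlugin`` interface may differ):

.. code-block:: python

   from vllm.multimodal import MULTIMODAL_REGISTRY
   from vllm.multimodal.base import MultiModalPlugin

   class AudioPlugin(MultiModalPlugin):
       """Hypothetical plugin handling an ``audio`` key in multi_modal_data."""

       def get_data_key(self) -> str:
           return "audio"

       def _default_input_mapper(self, ctx, data):
           # Convert raw audio into the tensors the model's forward() expects.
           raise NotImplementedError

   MULTIMODAL_REGISTRY.register_plugin(AudioPlugin())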
@ -7,17 +7,13 @@ Multi-Modality
|
||||
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
which allows you to pass in multi-modal input alongside text and token prompts.
Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`.

.. note::
``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
:class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
by following :ref:`this guide <adding_multimodal_plugin>`.

By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.

# TODO: Add more instructions on how to do that once embeddings is in.
Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.

Guides
++++++
@@ -25,7 +21,7 @@ Guides
.. toctree::
:maxdepth: 1

adding_multimodal_model
adding_multimodal_plugin

Module Contents
+++++++++++++++
@@ -44,10 +40,14 @@ Registry
Base Classes
------------

.. autoclass:: vllm.multimodal.MultiModalDataDict
.. autodata:: vllm.multimodal.BatchedTensors

.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:

.. autodata:: vllm.multimodal.MultiModalDataDict

.. autoclass:: vllm.multimodal.MultiModalInputs
:members:
:show-inheritance:
|
@ -1,7 +1,7 @@
LLM Inputs
==========

.. autodata:: vllm.inputs.PromptStrictInputs
.. autodata:: vllm.inputs.PromptInputs

.. autoclass:: vllm.inputs.TextPrompt
:show-inheritance:
|
@ -3,7 +3,7 @@
Installation with ROCm
======================

vLLM supports AMD GPUs with ROCm 5.7 and 6.0.
vLLM supports AMD GPUs with ROCm 6.1.

Requirements
------------
@@ -11,7 +11,7 @@ Requirements
* OS: Linux
* Python: 3.8 -- 3.11
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.0 and ROCm 5.7
* ROCm 6.1

Installation options:

@@ -27,10 +27,10 @@ You can build and install vLLM from source.

First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.

`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
It provides flexibility to customize the build of docker image using the following arguments:

* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
@@ -39,24 +39,17 @@ It provides flexibility to customize the build of docker image using the followi
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.

To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:

.. code-block:: console

$ docker build -f Dockerfile.rocm -t vllm-rocm .
$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .

To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

.. code-block:: console

$ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:

.. code-block:: console

$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-f Dockerfile.rocm -t vllm-rocm .
$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

To run the above docker image ``vllm-rocm``, use the below command:

@@ -85,39 +78,24 @@ Option 2: Build from source

0. Install prerequisites (skip if you are already in an environment/docker with the following installed):

- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
- `PyTorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.

Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in the PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_

For rocm6.0:

.. code-block:: console

$ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0

For rocm5.7:

.. code-block:: console

$ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in the PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_
1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_

Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_

2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm>`_
2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_

Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_
Alternatively, wheels intended for vLLM use can be accessed under the releases.

.. note::
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
- If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)

3. Build vLLM.
@@ -131,7 +109,7 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl

.. tip::

- You may need to turn on the ``--enforce-eager`` flag if you experience a process hang when running the `benchmark_throughput.py` script to test your installation.
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
- To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
- The ROCm version of pytorch, ideally, should match the ROCm driver version.
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
|
@ -20,7 +20,7 @@ Requirements
|
||||
* OS: Linux
* Compiler: gcc/g++>=12.3.0 (optional, recommended)
* Instruction set architecture (ISA) requirement: AVX512 is required.
* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)

.. _cpu_backend_quick_start_dockerfile:
|
||||
|
@ -19,9 +19,6 @@ If you have already taken care of the above issues, but the vLLM instance still
- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.

.. warning::
vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.

With more logging, hopefully you can find the root cause of the issue.

If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
@@ -50,6 +47,8 @@ Here are some common issues that can cause hangs:
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"

print("sanity check is successful!")

.. tip::

Save the script as ``test.py``.
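For reference, a self-contained script in the spirit of the excerpt above might look like the following sketch (assuming ``torch.distributed`` with NCCL for the GPU check and gloo for the CPU check; the script shipped with the docs may differ):

.. code-block:: python

   import torch
   import torch.distributed as dist

   # Launch with: torchrun --nproc-per-node=<number of GPUs> test.py
   dist.init_process_group(backend="nccl")
   local_rank = dist.get_rank() % torch.cuda.device_count()
   torch.cuda.set_device(local_rank)
   world_size = dist.get_world_size()

   # GPU all-reduce over NCCL.
   gpu_data = torch.ones(16, device="cuda")
   dist.all_reduce(gpu_data)
   torch.cuda.synchronize()
   assert gpu_data.mean().item() == world_size

   # CPU all-reduce over gloo, matching the excerpt above.
   gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
   cpu_data = torch.ones(16)
   dist.all_reduce(cpu_data, group=gloo_group)
   value = cpu_data.mean().item()
   assert value == world_size, f"Expected {world_size}, got {value}"

   print("sanity check is successful!")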
@@ -62,4 +61,10 @@ Here are some common issues that can cause hangs:
- is reachable from all nodes
- is set before running the script.

If the script runs successfully, you should see the message ``sanity check is successful!``.

If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.

.. warning::

After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on.
|
@ -42,6 +42,19 @@ You can install vLLM using pip:
|
||||
Therefore, it is recommended to install vLLM with a **fresh** conda environment. If you have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.

.. note::

vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:

.. code-block:: console

$ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
$ # You can also access a specific commit
$ # export VLLM_COMMIT=...
$ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl

.. _build_from_source:

Build from source
|
@ -73,16 +73,13 @@ Start the server:
|
||||
.. code-block:: console

$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m
$ vllm serve facebook/opt-125m

By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:

.. code-block:: console

$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m \
$ --chat-template ./examples/template_chatml.jinja
$ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja

This server can be queried in the same format as OpenAI API. For example, list the models:
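The actual query is outside this hunk; with the ``openai`` Python client it would look roughly like this (assuming the server is listening on the default ``http://localhost:8000``):

.. code-block:: python

   from openai import OpenAI

   client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
   for model in client.models.list():
       print(model.id)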
|
||||
|
@ -56,7 +56,7 @@ First, install the dependencies:
$ pip uninstall torch torch-xla -y

$ # Install PyTorch and PyTorch XLA.
$ export DATE="+20240601"
$ export DATE="+20240713"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl
|
||||
@ -85,7 +85,7 @@ Next, build vLLM from source. This will only take a few seconds:
ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory

You can install OpenBLAS with the following command:
Please install OpenBLAS with the following command:

.. code-block:: console
|
||||
|
@ -40,12 +40,13 @@ Quick start using Dockerfile
Build from source
-----------------

- First, install the required driver and Intel oneAPI 2024.1.
- First, install the required driver and Intel oneAPI 2024.1 or later.

- Second, install Python packages for vLLM XPU backend building:

.. code-block:: console

$ source /opt/intel/oneapi/setvars.sh
$ pip install --upgrade pip
$ pip install -v -r requirements-xpu.txt
|
||||
|
@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
|
||||
* Seamless integration with popular HuggingFace models
* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
* Tensor parallelism support for distributed inference
* Tensor parallelism and pipeline parallelism support for distributed inference
* Streaming outputs
* OpenAI-compatible API server
* Support NVIDIA GPUs and AMD GPUs
@@ -92,6 +92,7 @@ Documentation

models/supported_models
models/adding_model
models/enabling_multimodal_inputs
models/engine_args
models/lora
models/vlm
@@ -116,6 +117,7 @@ Documentation
automatic_prefix_caching/details

.. toctree::
:maxdepth: 2
:caption: Developer Documentation

dev/sampling_params
|
@ -10,6 +10,10 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

.. note::
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
please follow :ref:`this guide <enabling_multimodal_inputs>` after implementing the model here.

.. tip::
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
We will be happy to help you out!
@@ -44,23 +48,23 @@ Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your mo

.. code-block:: diff

def forward(
self,
input_ids: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- labels: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
-) -> Union[Tuple, CausalLMOutputWithPast]:
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+) -> Optional[SamplerOutput]:
def forward(
self,
input_ids: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- labels: Optional[torch.LongTensor] = None,
- use_cache: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, CausalLMOutputWithPast]:
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ ) -> Optional[SamplerOutput]:

1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
@@ -110,7 +114,7 @@ Just add the following lines in your code:
from your_code import YourModelForCausalLM
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

If you are running the API server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code:
If you are running the API server with :code:`vllm serve <args>`, you can wrap the entrypoint with the following code:

.. code-block:: python

@@ -120,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server
import runpy
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')

Save the above code in a file and run it with `python your_file.py args`.
Save the above code in a file and run it with :code:`python your_file.py <args>`.
||||
|
@ -1,26 +1,21 @@
.. _adding_a_new_multimodal_model:
.. _enabling_multimodal_inputs:

Adding a New Multimodal Model
=============================
Enabling Multimodal Inputs
==========================

This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal <multi_modality>` inputs.

.. note::
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

.. tip::
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
We will be happy to help you out!
.. seealso::
:ref:`adding_a_new_model`

1. Set up the base vLLM model
1. Update the base vLLM model
-----------------------------

As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
It is assumed that you have already implemented the model in vLLM according to :ref:`these steps <adding_a_new_model>`.
Further update the model as follows:

- You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.

.. code-block:: diff

@@ -33,19 +28,19 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model
The model class does not have to be named :code:`*ForCausalLM`.
Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.

- While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward`
for each input tensor that corresponds to a multi-modal input, as shown in the following example:

.. code-block:: diff

def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
+ pixel_values: torch.Tensor,
) -> SamplerOutput:
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
+ pixel_values: torch.Tensor,
) -> SamplerOutput:

2. Register input mappers
@@ -68,8 +63,8 @@ A default mapper is available for each modality in the core vLLM library. This i
:ref:`input_processing_pipeline`

3. Register maximum number of multimodal tokens
----------------------------------------------------------
3. Register maximum number of multi-modal tokens
------------------------------------------------

For each modality type that the model accepts as input, calculate the maximum possible number of tokens
and register it via :meth:`INPUT_REGISTRY.register_max_multimodal_tokens <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
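Taken together, steps 2 and 3 are typically applied as decorators on the model class. A hedged sketch (the decorator names follow the registries referenced above, but their exact signatures and the 576-token budget are assumptions for illustration):

.. code-block:: python

   import torch.nn as nn

   from vllm.inputs import INPUT_REGISTRY
   from vllm.model_executor.models.interfaces import SupportsVision
   from vllm.multimodal import MULTIMODAL_REGISTRY

   @MULTIMODAL_REGISTRY.register_image_input_mapper()   # use the default image mapper
   @INPUT_REGISTRY.register_max_multimodal_tokens(576)  # assumed per-image token budget
   class MyVLMForCausalLM(nn.Module, SupportsVision):
       ...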
@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM:
.. argparse::
:module: vllm.engine.arg_utils
:func: _engine_args_parser
:prog: -m vllm.entrypoints.openai.api_server
:prog: vllm serve
:nodefaultconst:

Async Engine Arguments
@@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine:
.. argparse::
:module: vllm.engine.arg_utils
:func: _async_engine_args_parser
:prog: -m vllm.entrypoints.openai.api_server
:prog: vllm serve
:nodefaultconst:
@ -61,10 +61,12 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server.
|
||||
.. code-block:: bash

python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-hf \
vllm serve meta-llama/Llama-2-7b-hf \
--enable-lora \
--lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/
--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/

.. note::
The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.

The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``,
etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along
|
@ -73,5 +73,5 @@ Resources for vLLM contributors
|
||||
-------------------------------
|
* `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
* `What is Lookahead Scheduling in vLLM? <https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a>`_
* `Information on batch expansion. <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
* `Information on batch expansion <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
* `Dynamic speculative decoding <https://github.com/vllm-project/vllm/issues/4565>`_
@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo
The following is the list of model architectures that are currently supported by vLLM.
Alongside each architecture, we include some popular models that use it.

Decoder-only Language Models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
:widths: 25 25 50 5
:header-rows: 1
@@ -95,14 +97,6 @@ Alongside each architecture, we include some popular models that use it.
- LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
- :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
-
* - :code:`LlavaNextForConditionalGeneration`
- LLaVA-NeXT
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
* - :code:`MiniCPMForCausalLM`
- MiniCPM
- :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
@@ -143,10 +137,10 @@ Alongside each architecture, we include some popular models that use it.
- Phi-3-Small
- :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
-
* - :code:`Phi3VForCausalLM`
- Phi-3-Vision
- :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
-
* - :code:`PersimmonForCausalLM`
- Persimmon
- :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
-
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
@@ -172,14 +166,52 @@ Alongside each architecture, we include some popular models that use it.
- :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
-

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.

.. note::
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.

.. _supported_vlms:

Vision Language Models
^^^^^^^^^^^^^^^^^^^^^^^

.. list-table::
:widths: 25 25 50 5
:header-rows: 1

* - Architecture
- Models
- Example HuggingFace Models
- :ref:`LoRA <lora>`
* - :code:`ChameleonForConditionalGeneration`
- Chameleon
- :code:`facebook/chameleon-7b` etc.
-
* - :code:`FuyuForCausalLM`
- Fuyu
- :code:`adept/fuyu-8b` etc.
-
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
-
* - :code:`LlavaNextForConditionalGeneration`
- LLaVA-NeXT
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
* - :code:`PaliGemmaForConditionalGeneration`
- PaliGemma
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
-
* - :code:`Phi3VForCausalLM`
- Phi-3-Vision
- :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
-

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.

.. tip::
The easiest way to check if your model is supported is to run the program below:
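The program itself sits outside this hunk; a minimal check along these lines works (the model name is just a placeholder):

.. code-block:: python

   from vllm import LLM

   # Replace with the model you want to check.
   llm = LLM(model="facebook/opt-125m")

   output = llm.generate("Hello, my name is")
   print(output)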
|
||||
@ -210,8 +242,9 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
output = llm.generate("Hello, my name is")
print(output)

Model Support Policy
---------------------
=====================

At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
|
||||
|
@ -3,7 +3,8 @@
Using VLMs
==========

vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.
vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here <supported_vlms>`.
This document shows you how to run and serve these models using vLLM.

.. important::
We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation.
@@ -29,7 +30,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
internally for each model.

To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:

* ``prompt``: The prompt should follow the format that is documented on HuggingFace.
* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
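For instance, a single-image request might look like the following sketch (the model name, image path, and LLaVA prompt format are illustrative; check the model card for the exact placeholder tokens):

.. code-block:: python

   from PIL import Image
   from vllm import LLM

   llm = LLM(model="llava-hf/llava-1.5-7b-hf")
   image = Image.open("example.jpg")

   outputs = llm.generate({
       "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
       "multi_modal_data": {"image": image},
   })
   print(outputs[0].outputs[0].text)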
@ -93,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
|
||||
.. code-block:: bash

python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--chat-template template_llava.jinja
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

.. important::
We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
|
@ -28,6 +28,9 @@ Next, to install the required packages, add the following to your cerebrium.toml
|
||||
.. code-block:: toml

[cerebrium.deployment]
docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"

[cerebrium.dependencies.pip]
vllm = "latest"
Some files were not shown because too many files have changed in this diff.