Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)
Compare commits
103 Commits
SHA1
---
c7f2cf2b7f
8d8357c8ed
4302987069
021b1a2ab7
2a052011ca
36fb68f947
bc8ad68455
344bf7cd2d
ab50275111
43c413ec57
f8e7adda21
7e65477e5e
3521ba4f25
2d7bce9cd5
ce3f1eedf8
808632d3b4
344a5d0c33
0f8a91401c
9b5c9f9484
32881f3f31
5b8a7c1cb0
1ff0c73a79
5ad60b0cbd
fb087af52e
7038e8b803
2a85f93007
cf8cac8c70
5e401bce17
0d62fe58db
b8afa8b95a
826b82a260
c9d852d601
6ef09b08f8
3a922c1e7e
c47ba4aaa9
24bb4fe432
a657bfc48a
24750f4cad
b38e42fbca
8b798eec75
69909126a7
e491c7e053
4dc8026d86
a88bb9b032
6f1df80436
d6f4bd7cdd
c3845d82dc
a822eb3413
f458112e8a
2e240c69a9
ee37328da0
6ad58f42c5
dd1a50a8bc
715c2d854d
a494140433
111815d482
b31a1fb63c
4bb53e2dde
26f2fb5113
fa32207842
d627a3d837
f4f921b7f1
ac5ccf0156
73c8d677e5
df29793dc7
03dd7d52bf
bf480c5302
9c7306ac11
4ea1f9678d
ba4be44c32
d6e520e170
81661da7b2
dfea173148
7134303cbb
3da24c2df7
eefeb16464
18d23f642a
87f545ba6f
8947bc3c15
12628d3c78
258a2c58d0
aba47be3fe
a62aaf1df5
603ad84815
a88081bf76
2f30e7c72f
a74dee9b62
cf29b7eda4
efffb63f58
15e7c675b0
b6dcb4d442
b5b4a398a7
f4bc4de1b1
bd7a8eef25
7ee82bef1e
fbf152d976
479d69fad0
96e90fdeb3
a395a638c2
2768884ac4
aae08249ac
7923dcad12
3cd9b5bb2d
.buildkite/check-wheel-size.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import os
import zipfile

MAX_SIZE_MB = 100


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
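For reference, a minimal sketch of how this check is expected to be run against a directory of built wheels; the `dist` directory name matches the Dockerfile change further below, and running it locally like this is an assumption rather than part of the diff.

```bash
# Build the wheel, then fail fast if any wheel exceeds the 100 MB limit.
python3 setup.py bdist_wheel --dist-dir=dist
python3 .buildkite/check-wheel-size.py dist || echo "wheel too large"
```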
@@ -1,38 +1,44 @@
# This script build the ROCm docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
# This script build the ROCm docker image and runs test inside it.
set -ex

# Print ROCm version
echo "--- ROCm info"
rocminfo

# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .
echo "--- Resetting GPUs"

# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container
echo "reset" > /opt/amdgpu/etc/gpu_state

# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
container_name=rocm_${sha}
docker build \
-t ${container_name} \
-f Dockerfile.rocm \
--progress plain \
.

remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
}
trap remove_docker_container EXIT

echo "--- Running container"

docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--rm \
-e HF_TOKEN \
--name ${container_name} \
${container_name} \
/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
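A hedged illustration of the quote-stripping in the final `docker run` line above: the pipeline template wraps the test command in an extra pair of single quotes, which the script removes before handing the command to bash. The example command string here is an assumption.

```bash
cmd="'cd /vllm-workspace/tests && pytest -v -s core'"
# Strip the leading and trailing single quote, as the script does with "$1".
echo "$cmd" | sed "s/^'//" | sed "s/'$//"
# prints: cd /vllm-workspace/tests && pytest -v -s core
```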
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -4,6 +4,20 @@ set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
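A small sketch of how the once-a-day prune gate above behaves; the timestamp file path comes from the script, while running this check by hand is only illustrative.

```bash
# 86400 seconds = 24 hours; the script prunes only when at least that much time has passed.
last_build=$(cat /tmp/neuron-docker-build-timestamp 2>/dev/null || echo 0)
echo "seconds since last prune marker: $(( $(date +%s) - last_build ))"
```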
@@ -17,27 +17,38 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core

- label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
num_gpus: 2

- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"

num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd]

commands:
- pytest -v -s test_pynccl.py
- pytest -v -s test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py

- label: Distributed Tests (Multiple Groups)
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 4
commands:
- pytest -v -s test_pynccl.py

- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
@@ -48,6 +59,7 @@ steps:

- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- pip install awscli
@@ -61,16 +73,19 @@ steps:
parallelism: 4

- label: Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py

- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py

- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching

@@ -78,12 +93,15 @@ steps:
command: pytest -v -s samplers

- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py

- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker

- label: Speculative decoding tests
mirror_hardwares: [amd]
command: pytest -v -s spec_decode

- label: LoRA Test %N
@@ -101,6 +119,7 @@ steps:

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
@@ -16,17 +16,29 @@ steps:
limit: 5
- wait

- label: "AMD Test"
- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}

- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true

- label: "CPU Test"
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
@@ -44,6 +56,9 @@ steps:
plugins:
- kubernetes:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
- name: dshm
emptyDir:
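A hedged sketch of what the AMD group renders to for one mirrored step, e.g. the Core Test entry from the pipeline file above; the working directory and exact rendered YAML are assumptions based on the template.

```bash
# Illustrative only: print the step the template would roughly emit.
cat <<'EOF'
- label: "AMD: Core Test"
  agents:
    queue: amd
  command: bash .buildkite/run-amd-test.sh "'cd /vllm-workspace/tests && pytest -v -s core'"
  env:
    DOCKER_BUILDKIT: "1"
EOF
```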
.github/ISSUE_TEMPLATE/750-RFC.yml (new file, 49 lines)
@@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]

body:
- type: markdown
  attributes:
    value: >
      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
  attributes:
    label: Motivation.
    description: >
      The motivation of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Proposed Change.
    description: >
      The proposed change of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Feedback Period.
    description: >
      The feedback period of the RFC. Usually at least one week.
  validations:
    required: false
- type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC.
  validations:
    required: false
- type: textarea
  attributes:
    label: Any Other Things.
    description: >
      Any other things you would like to mention.
  validations:
    required: false
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
.github/workflows/mypy.yaml (11 lines changed)
@@ -33,8 +33,7 @@ jobs:
- name: Mypy
run: |
mypy vllm/attention --config-file pyproject.toml
# TODO(sang): Fix nested dir
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
@@ -44,8 +43,8 @@ jobs:
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
# TODO(sang): Fix nested dir
mypy vllm/model_executor/*.py --config-file pyproject.toml
# TODO(sang): Fix nested dir
# mypy vllm/lora/*.py --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
.github/workflows/publish.yml (4 lines changed)
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

steps:
@@ -79,6 +79,8 @@ jobs:

- name: Build wheel
shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
.github/workflows/scripts/create_release.js (2 lines changed)
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
prerelease: false,
prerelease: true,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});
@@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

@@ -177,6 +177,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu")
endif()
Dockerfile (18 lines changed)
@@ -1,9 +1,13 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip git
@@ -12,7 +16,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/

WORKDIR /workspace

@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist

# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
@@ -85,7 +93,7 @@ FROM dev as flash-attn-builder
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}

WORKDIR /usr/src/flash-attention-v2
@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace

RUN apt-get update -y \
@@ -108,7 +116,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
RUN ldconfig /usr/local/cuda-12.4/compat/

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
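A minimal sketch of rebuilding the image after the CUDA 12.4 bump; the `vllm-openai` target name and the image tag are assumptions about the rest of the Dockerfile, which is not shown in this hunk.

```bash
# Hypothetical local rebuild against the updated base images.
DOCKER_BUILDKIT=1 docker build . --target vllm-openai -t vllm/vllm-openai:cuda12.4
```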
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
&& cd ../..; \
fi

COPY ./ /app/vllm
WORKDIR /vllm-workspace
COPY . .

RUN python3 -m pip install --upgrade pip numba

RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..

RUN python3 -m pip install --upgrade pip
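A hedged example of building and entering the ROCm image with the new /vllm-workspace layout; the device flags mirror the AMD CI script above, and the image tag is arbitrary.

```bash
docker build -f Dockerfile.rocm -t vllm-rocm .
docker run -it --device /dev/kfd --device /dev/dri --network host vllm-rocm /bin/bash
```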
@@ -1,6 +1,9 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt

recursive-include cmake *
@@ -74,10 +74,11 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):


def main(args):
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)

num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=100)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

print("------warm up------")
test_prefix(
llm=llm,
prompts=prompts[:1],
prompts=prompts,
sampling_params=sampling_params,
)

@@ -45,8 +47,16 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
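A hedged invocation using the flags added above; the model and sizes are simply the argparse defaults made explicit.

```bash
python3 benchmark_prefix_caching.py \
    --model baichuan-inc/Baichuan2-13B-Chat \
    --tensor-parallel-size 1 \
    --output-len 10 \
    --enable-prefix-caching \
    --use-v2-block-manager
```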
@@ -27,7 +27,7 @@ import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Tuple
from typing import AsyncGenerator, List, Optional, Tuple

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -58,7 +58,11 @@ def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -68,38 +72,32 @@
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]

# some of these will be filtered out, so sample more than we need
sampled_indices = random.sample(range(len(dataset)),
int(num_requests * 1.2))
dataset = [dataset[i] for i in sampled_indices]
# Shuffle the dataset.
random.shuffle(dataset)

# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break

# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))

# Filter out too long sequences.
filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset:
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
# This is because TGI causes errors when the input or output length
# is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))

# Sample the requests.
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
return filtered_dataset


def sample_sonnet_requests(
@@ -361,6 +359,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)

elif args.dataset_name == "sharegpt":
@@ -368,6 +367,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)

elif args.dataset_name == "sonnet":
@@ -524,6 +524,12 @@
default=1000,
help="Number of prompts to process.",
)
parser.add_argument(
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,
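A hedged example of fixing the per-request output length when sampling from ShareGPT; the dataset file name, backend, and model flags are assumptions about the rest of this script's CLI, which is not shown in the hunk.

```bash
python3 benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-2-7b-hf \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 1000 \
    --sharegpt-output-len 128
```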
@@ -103,25 +103,22 @@ def run_vllm(
)

# Add the requests to the engine.
prompts = []
sampling_params = []
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
prompts.append(prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
# FIXME(woosuk): Do not use internal method.
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
))

start = time.perf_counter()
# FIXME(woosuk): Do not use internal method.
llm._run_engine(use_tqdm=True)
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
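Since requests are now submitted through a single batched `llm.generate(prompts, sampling_params)` call rather than the internal `_add_request`/`_run_engine` path, the benchmark is driven the same way as before from the command line; a hedged invocation (flag names are assumptions about this script's full CLI):

```bash
python3 benchmark_throughput.py \
    --backend vllm \
    --model facebook/opt-125m \
    --num-prompts 1000 \
    --input-len 128 \
    --output-len 128
```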
@@ -6,7 +6,7 @@ from typing import Optional
import torch
import torch.nn.functional as F

from vllm._C import ops
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm)
@@ -1,3 +1,4 @@
import argparse
import json
import os
import sys
@@ -5,6 +6,7 @@ import sys
import torch
import torch.nn.functional as F
import triton
from tqdm import tqdm

from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
@@ -12,16 +14,16 @@ from vllm.model_executor.layers.fused_moe import (fused_moe,
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def main():
def main(dtype: str):
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs, method=method)
run_grid(bs, method=method, dtype=dtype)


def run_grid(bs, method):
def run_grid(bs, method, dtype: str):
d_model = 4096
num_total_experts = 8
top_k = 2
@@ -34,39 +36,29 @@ def run_grid(bs, method):
num_trials = 1

configs = []
if bs <= 16:
BLOCK_SIZES_M = [16]
elif bs <= 32:
BLOCK_SIZES_M = [16, 32]
elif bs <= 64:
BLOCK_SIZES_M = [16, 32, 64]
elif bs <= 128:
BLOCK_SIZES_M = [16, 32, 64, 128]
else:
BLOCK_SIZES_M = [16, 32, 64, 128, 256]

for block_size_n in [32, 64, 128, 256]:
for block_size_m in BLOCK_SIZES_M:
for block_size_m in [16, 32, 64, 128, 256]:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
for num_stages in [2, 3, 4, 5]:
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": 4,
"num_stages": num_stages,
})

best_config = None
best_time_us = 1e20

for config in configs:
print(f'{tp_size=} {bs=}')
print(f'{config}')

for config in tqdm(configs):
# warmup
print('warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
@@ -79,12 +71,12 @@ def run_grid(bs, method):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)
except triton.runtime.autotuner.OutOfResources:
continue

# trial
print('benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
@@ -96,6 +88,7 @@ def run_grid(bs, method):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
)

kernel_dur_us = 1000 * kernel_dur_ms
@@ -105,7 +98,8 @@ def run_grid(bs, method):
best_config = config
best_time_us = kernel_dur_us

print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
tqdm.write(
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
f'{d_model=} {model_intermediate_size=} {num_layers=}')

@@ -114,7 +108,8 @@ def run_grid(bs, method):

# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
model_intermediate_size // tp_size)
model_intermediate_size // tp_size,
"float8" if dtype == "float8" else None)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
@@ -128,27 +123,48 @@ def run_grid(bs, method):

def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
config) -> float:
config, dtype: str) -> float:
shard_intermediate_size = model_intermediate_size // tp_size

hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=torch.bfloat16,
dtype=torch.float16,
)

ws = torch.rand(
w1 = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)

w2s = torch.rand(
w2 = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)

w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None

if dtype == "float8":
w1 = w1.to(torch.float8_e4m3fn)
w2 = w2.to(torch.float8_e4m3fn)
w1_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
w2_scale = torch.ones(num_total_experts,
device=hidden_states.device,
dtype=torch.float32)
a1_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)
a2_scale = torch.ones(1,
device=hidden_states.device,
dtype=torch.float32)

gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
@@ -163,13 +179,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=ws,
w2=w2s,
w1=w1,
w2=w2,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
use_fp8=dtype == "float8",
)
end_event.record()
end_event.synchronize()
@@ -179,4 +200,16 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,


if __name__ == "__main__":
sys.exit(main())
parser = argparse.ArgumentParser(
prog='benchmark_mixtral_moe',
description='Benchmark and tune the fused_moe kernel',
)
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['float8', 'float16'],
help='Data type used for fused_moe kernel computations',
)
args = parser.parse_args()
sys.exit(main(args.dtype))
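A hedged tuning run using the new dtype switch; note that the argparse above lists only `float8` and `float16` as choices, so the flag is passed explicitly here.

```bash
python3 benchmark_mixtral_moe.py --dtype float8
```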
@@ -16,7 +16,7 @@ PARTITION_SIZE = 512
def main(
version: str,
num_seqs: int,
context_len: int,
seq_len: int,
num_query_heads: int,
num_kv_heads: int,
head_size: int,
@@ -48,12 +48,12 @@ def main(
dtype=torch.float,
device=device)

context_lens = [context_len for _ in range(num_seqs)]
max_context_len = max(context_lens)
context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
seq_lens = [seq_len for _ in range(num_seqs)]
max_seq_len = max(seq_lens)
seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)

# Create the block tables.
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
@@ -77,8 +77,7 @@ def main(
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
PARTITION_SIZE)
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
dtype=output.dtype,
@@ -110,9 +109,9 @@ def main(
num_kv_heads,
scale,
block_tables,
context_lens,
seq_lens,
block_size,
max_context_len,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@@ -129,9 +128,9 @@ def main(
num_kv_heads,
scale,
block_tables,
context_lens,
seq_lens,
block_size,
max_context_len,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@@ -166,7 +165,7 @@ if __name__ == '__main__':
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--context-len", type=int, default=4096)
parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
@@ -199,7 +198,7 @@ if __name__ == '__main__':
main(
version=args.version,
num_seqs=args.batch_size,
context_len=args.context_len,
seq_len=args.seq_len,
num_query_heads=args.num_query_heads,
num_kv_heads=args.num_kv_heads,
head_size=args.head_size,
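A hedged run using the renamed argument (`--seq_len` replaces `--context-len`); the other flags shown are the defaults from the argparse above.

```bash
python3 benchmark_paged_attention.py --version v2 --batch-size 8 --seq_len 4096
```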
@@ -104,7 +104,7 @@ __device__ void paged_attention_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@@ -115,23 +115,23 @@ __device__ void paged_attention_kernel(
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
const int context_len = context_lens[seq_idx];
if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
const int seq_len = seq_lens[seq_idx];
if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
// No work to do. Terminate the thread block.
return;
}

const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;

// [start_block_idx, end_block_idx) is the range of blocks to process.
const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int num_blocks = end_block_idx - start_block_idx;

// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int num_tokens = end_token_idx - start_token_idx;

constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
@@ -245,12 +245,12 @@ __device__ void paged_attention_kernel(
// This includes a reduction across the threads in the same thread group.
float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;

if (thread_group_offset == 0) {
// Store the partial reductions to shared memory.
// NOTE(woosuk): It is required to zero out the masked logits.
const bool mask = token_idx >= context_len;
const bool mask = token_idx >= seq_len;
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
// Update the max value.
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@@ -364,14 +364,14 @@ __device__ void paged_attention_kernel(
} else {
v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
}
if (block_idx == num_context_blocks - 1) {
if (block_idx == num_seq_blocks - 1) {
// NOTE(woosuk): When v_vec contains the tokens that are out of the context,
// we should explicitly zero out the values since they may contain NaNs.
// See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
}
}
accs[i] += dot(logits_vec, v_vec);
@@ -457,7 +457,7 @@ __global__ void paged_attention_v1_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@@ -466,7 +466,7 @@ __global__ void paged_attention_v1_kernel(
const float kv_scale) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>(
/* exp_sums */ nullptr, /* max_logits */ nullptr,
out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
}

@@ -489,7 +489,7 @@ __global__ void paged_attention_v2_kernel(
const int num_kv_heads, // [num_heads]
const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float* __restrict__ alibi_slopes, // [num_heads]
const int q_stride,
@@ -498,7 +498,7 @@ __global__ void paged_attention_v2_kernel(
const float kv_scale) {
paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE>(
exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
q_stride, kv_block_stride, kv_head_stride, kv_scale);
}

@@ -513,13 +513,13 @@ __global__ void paged_attention_v2_reduce_kernel(
const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
const int seq_len = seq_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
@@ -616,7 +616,7 @@ __global__ void paged_attention_v2_reduce_kernel(
num_kv_heads, \
scale, \
block_tables_ptr, \
context_lens_ptr, \
seq_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
q_stride, \
@@ -639,8 +639,8 @@ void paged_attention_v1_launcher(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int max_context_len,
torch::Tensor& seq_lens,
int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
float kv_scale) {
int num_seqs = query.size(0);
@@ -664,11 +664,11 @@ void paged_attention_v1_launcher(
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();

constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_context_len * sizeof(float);
int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_seq_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
// Keep that in sync with the logic here!
@@ -715,8 +715,8 @@ void paged_attention_v1_launcher(
num_kv_heads, \
scale, \
block_tables, \
context_lens, \
max_context_len, \
seq_lens, \
max_seq_len, \
alibi_slopes, \
kv_scale);

@@ -746,9 +746,9 @@ void paged_attention_v1(
int num_kv_heads, // [num_heads]
float scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs]
torch::Tensor& seq_lens, // [num_seqs]
int block_size,
int max_context_len,
int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale) {
@@ -790,7 +790,7 @@ void paged_attention_v1(
num_kv_heads, \
scale, \
block_tables_ptr, \
context_lens_ptr, \
seq_lens_ptr, \
max_num_blocks_per_seq, \
alibi_slopes_ptr, \
q_stride, \
@@ -803,7 +803,7 @@ void paged_attention_v1(
exp_sums_ptr, \
max_logits_ptr, \
tmp_out_ptr, \
context_lens_ptr, \
seq_lens_ptr, \
max_num_partitions);

template<
@@ -824,8 +824,8 @@ void paged_attention_v2_launcher(
int num_kv_heads,
float scale,
torch::Tensor& block_tables,
torch::Tensor& context_lens,
int max_context_len,
torch::Tensor& seq_lens,
int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
float kv_scale) {
int num_seqs = query.size(0);
@@ -852,10 +852,10 @@ void paged_attention_v2_launcher(
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr<int>();
int* context_lens_ptr = context_lens.data_ptr<int>();
int* seq_lens_ptr = seq_lens.data_ptr<int>();

constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
int logits_size = PARTITION_SIZE * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);

@@ -909,8 +909,8 @@ void paged_attention_v2_launcher(
num_kv_heads, \
scale, \
block_tables, \
context_lens, \
max_context_len, \
seq_lens, \
max_seq_len, \
alibi_slopes, \
kv_scale);

@@ -943,9 +943,9 @@ void paged_attention_v2(
int num_kv_heads, // [num_heads]
float scale,
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs]
torch::Tensor& seq_lens, // [num_seqs]
int block_size,
int max_context_len,
int max_seq_len,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype,
float kv_scale) {
@@ -24,6 +24,14 @@ void reshape_and_cache(
const std::string& kv_cache_dtype,
const float kv_scale);

void reshape_and_cache_flash(
torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype);

// Just for unittest
void convert_fp8(
torch::Tensor& src_cache,
@@ -215,6 +215,41 @@ __global__ void reshape_and_cache_kernel(
}
}

template<typename scalar_t>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, head_size]
scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride,
const int key_stride,
const int value_stride,
const int num_heads,
const int head_size,
const int block_size) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_value_idx = block_idx * block_stride
+ block_offset * num_heads * head_size
+ head_idx * head_size
+ head_offset;
k_cache[tgt_value_idx] = key[src_key_idx];
v_cache[tgt_value_idx] = value[src_value_idx];
}
}
} // namespace vllm

#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \
@@ -275,6 +310,51 @@ void reshape_and_cache(
}
}

void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size]
torch::Tensor& value, // [num_tokens, num_heads, head_size]
torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype)
{
// FIXME: only support auto datatype, does not support fp8
if (kv_cache_dtype != "auto") {
TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
}
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = k_cache.size(1);

int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = k_cache.stride(0);
TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));

dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(),
"reshape_and_cache_flash",
[&] {
vllm::reshape_and_cache_flash_kernel<scalar_t><<<grid, block, 0, stream>>>(
key.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>(),
k_cache.data_ptr<scalar_t>(),
v_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(),
block_stride,
key_stride,
value_stride,
num_heads,
head_size,
block_size);
});
}

namespace vllm {

template<typename Tout, typename Tin>
@@ -70,11 +70,11 @@ template <typename T>
FORCE_INLINE std::pair<T, T>
reduceSoftmaxAlibi(T *data, const int size, const int capacity,
const float alibi_slope, const int start_index,
const int context_len) {
data[0] += alibi_slope * (start_index - context_len + 1);
const int seq_len) {
data[0] += alibi_slope * (start_index - seq_len + 1);
T max = data[0];
for (int i = 1; i < size; ++i) {
T qk = data[i] + alibi_slope * (start_index + i - context_len + 1);
T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
data[i] = qk;
max = max >= qk ? max : qk;
}
@@ -225,7 +225,7 @@ struct paged_attention_v1_impl {
const int num_kv_heads, const float scale,
const int
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int *__restrict__ context_lens, // [num_seqs]
const int *__restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float *__restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
@@ -235,32 +235,32 @@ struct paged_attention_v1_impl {

static_assert(BLOCK_SIZE == 16);

int max_context_len = max_num_blocks_per_seq * BLOCK_SIZE;
int max_context_len_padded = (max_context_len + 15) & 0xFFFFFFF0;
TORCH_CHECK((max_context_len_padded * sizeof(float)) % 64 == 0);
int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);

const int parallel_work_item_num = omp_get_max_threads();

size_t logits_bytes =
parallel_work_item_num * max_context_len_padded * sizeof(float);
parallel_work_item_num * max_seq_len_padded * sizeof(float);
float *logits = (float *)std::aligned_alloc(
64, logits_bytes); // Cacheline alignment for each context token.
// [parallel_work_item_num, max_context_len_padded]
// [parallel_work_item_num, max_seq_len_padded]

#pragma omp parallel for collapse(2) schedule(dynamic, 1)
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
int context_len = context_lens[seq_idx];
int seq_len = seq_lens[seq_idx];
const int *seq_block_table =
block_tables + max_num_blocks_per_seq * seq_idx;
const int block_num = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
const scalar_t *__restrict__ q_vec_ptr =
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
const int last_block_token_num =
context_len - (block_num - 1) * BLOCK_SIZE;
seq_len - (block_num - 1) * BLOCK_SIZE;
float *__restrict__ thread_block_logits =
logits + omp_get_thread_num() * max_context_len_padded;
logits + omp_get_thread_num() * max_seq_len_padded;

// Compute logits
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
@@ -278,11 +278,11 @@ struct paged_attention_v1_impl {

// Compute softmax
if (alibi_slopes) {
reduceSoftmaxAlibi(thread_block_logits, context_len,
reduceSoftmaxAlibi(thread_block_logits, seq_len,
block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
context_len);
seq_len);
} else {
reduceSoftmax(thread_block_logits, context_len,
reduceSoftmax(thread_block_logits, seq_len,
block_num * BLOCK_SIZE);
}

@@ -340,7 +340,7 @@ struct paged_attention_v1_impl {
#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
num_heads);

@@ -348,8 +348,8 @@ template <typename T, int BLOCK_SIZE>
void paged_attention_v1_impl_launcher(
torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
torch::Tensor &value_cache, int num_kv_heads, float scale,
torch::Tensor &block_tables, torch::Tensor &context_lens,
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
torch::Tensor &block_tables, torch::Tensor &seq_lens,
int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -369,7 +369,7 @@ void paged_attention_v1_impl_launcher(
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
int *block_tables_ptr = block_tables.data_ptr<int>();
int *context_lens_ptr = context_lens.data_ptr<int>();
int *seq_lens_ptr = seq_lens.data_ptr<int>();

switch (head_size) {
case 64:
@@ -399,7 +399,7 @@ void paged_attention_v1_impl_launcher(
#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
context_lens, max_context_len, alibi_slopes);
seq_lens, max_seq_len, alibi_slopes);

#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
switch (block_size) { \
@@ -416,8 +416,8 @@ void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
torch::Tensor &key_cache, torch::Tensor &value_cache,
int num_kv_heads, float scale,
torch::Tensor &block_tables,
torch::Tensor &context_lens, int block_size,
int max_context_len,
torch::Tensor &seq_lens, int block_size,
int max_seq_len,
const c10::optional<torch::Tensor> &alibi_slopes,
const std::string &kv_cache_dtype, float kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);
@@ -448,7 +448,7 @@ struct paged_attention_v2_impl {
const int num_kv_heads, const float scale,
const int
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int *__restrict__ context_lens, // [num_seqs]
const int *__restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq,
const float *__restrict__ alibi_slopes, // [num_heads]
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
@ -465,22 +465,22 @@ struct paged_attention_v2_impl {
|
||||
for (int partition_idx = 0; partition_idx < max_num_partitions;
|
||||
++partition_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int start_token_idx = partition_idx * PARTITION_SIZE;
|
||||
|
||||
if (start_token_idx >= context_len)
|
||||
if (start_token_idx >= seq_len)
|
||||
continue;
|
||||
|
||||
const int partition_num =
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
const bool no_reduce = (partition_num == 1);
|
||||
const int context_token_num =
|
||||
(std::min(context_len, start_token_idx + PARTITION_SIZE) -
|
||||
const int token_num =
|
||||
(std::min(seq_len, start_token_idx + PARTITION_SIZE) -
|
||||
start_token_idx);
|
||||
const int block_num =
|
||||
(context_token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
(token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int last_block_token_num =
|
||||
context_token_num - (block_num - 1) * BLOCK_SIZE;
|
||||
token_num - (block_num - 1) * BLOCK_SIZE;
|
||||
const int *seq_block_table = block_tables +
|
||||
max_num_blocks_per_seq * seq_idx +
|
||||
start_token_idx / BLOCK_SIZE;
|
||||
@ -507,10 +507,10 @@ struct paged_attention_v2_impl {
|
||||
std::pair<float, float> max_and_sum;
|
||||
if (alibi_slopes) {
|
||||
max_and_sum = reduceSoftmaxAlibi(
|
||||
logits, context_token_num, block_num * BLOCK_SIZE,
|
||||
alibi_slopes[head_idx], start_token_idx, context_len);
|
||||
logits, token_num, block_num * BLOCK_SIZE,
|
||||
alibi_slopes[head_idx], start_token_idx, seq_len);
|
||||
} else {
|
||||
max_and_sum = reduceSoftmax(logits, context_token_num,
|
||||
max_and_sum = reduceSoftmax(logits, token_num,
|
||||
block_num * BLOCK_SIZE);
|
||||
}
|
||||
|
||||
@ -583,9 +583,9 @@ struct paged_attention_v2_impl {
|
||||
#pragma omp parallel for collapse(2) schedule(static, 1)
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int partition_num =
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
|
||||
if (partition_num == 1)
|
||||
continue;
|
||||
@ -612,9 +612,9 @@ struct paged_attention_v2_impl {
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int partition_num =
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
|
||||
if (partition_num == 1)
|
||||
continue;
|
||||
@ -649,7 +649,7 @@ struct paged_attention_v2_impl {
|
||||
paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
|
||||
key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||
context_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||
kv_block_stride, kv_head_stride, num_seqs, num_heads, \
|
||||
max_num_partitions);
|
||||
|
||||
@ -658,8 +658,8 @@ void paged_attention_v2_impl_launcher(
|
||||
torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
|
||||
torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
|
||||
torch::Tensor &value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size,
|
||||
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
|
||||
torch::Tensor &block_tables, torch::Tensor &seq_lens, int block_size,
|
||||
int max_seq_len, const c10::optional<torch::Tensor> &alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -683,7 +683,7 @@ void paged_attention_v2_impl_launcher(
|
||||
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
|
||||
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
|
||||
int *block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int *context_lens_ptr = context_lens.data_ptr<int>();
|
||||
int *seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||
|
||||
switch (head_size) {
|
||||
case 64:
|
||||
@ -713,8 +713,8 @@ void paged_attention_v2_impl_launcher(
|
||||
#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||
paged_attention_v2_impl_launcher<T, BLOCK_SIZE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, context_lens, block_size, \
|
||||
max_context_len, alibi_slopes);
|
||||
num_kv_heads, scale, block_tables, seq_lens, block_size, \
|
||||
max_seq_len, alibi_slopes);
|
||||
|
||||
#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||
switch (block_size) { \
|
||||
@ -732,8 +732,8 @@ void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
|
||||
torch::Tensor &query, torch::Tensor &key_cache,
|
||||
torch::Tensor &value_cache, int num_kv_heads,
|
||||
float scale, torch::Tensor &block_tables,
|
||||
torch::Tensor &context_lens, int block_size,
|
||||
int max_context_len,
|
||||
torch::Tensor &seq_lens, int block_size,
|
||||
int max_seq_len,
|
||||
const c10::optional<torch::Tensor> &alibi_slopes,
|
||||
const std::string &kv_cache_dtype, float kv_scale) {
|
||||
TORCH_CHECK(kv_scale == 1.0f);
|
||||
|
csrc/ops.h (35 lines changed)
@@ -10,9 +10,9 @@ void paged_attention_v1(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  torch::Tensor& seq_lens,
  int block_size,
  int max_context_len,
  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
  const std::string& kv_cache_dtype,
  float kv_scale);
@@ -28,9 +28,9 @@ void paged_attention_v2(
  int num_kv_heads,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  torch::Tensor& seq_lens,
  int block_size,
  int max_context_len,
  int max_seq_len,
  const c10::optional<torch::Tensor>& alibi_slopes,
  const std::string& kv_cache_dtype,
  float kv_scale);
@@ -124,6 +124,26 @@ torch::Tensor marlin_gemm(
  int64_t size_m,
  int64_t size_n,
  int64_t size_k);

torch::Tensor gptq_marlin_gemm(
  torch::Tensor &a,
  torch::Tensor &b_q_weight,
  torch::Tensor &b_scales,
  torch::Tensor &g_idx,
  torch::Tensor &perm,
  torch::Tensor &workspace,
  int64_t num_bits,
  int64_t size_m,
  int64_t size_n,
  int64_t size_k,
  bool is_k_full);

torch::Tensor gptq_marlin_repack(
  torch::Tensor &b_q_weight,
  torch::Tensor &perm,
  int64_t size_k,
  int64_t size_n,
  int64_t num_bits);
#endif

void squeezellm_gemm(
@@ -146,7 +166,12 @@ void gptq_shuffle(
  torch::Tensor q_perm,
  int bit);

void scaled_fp8_quant(
void static_scaled_fp8_quant(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& scale);

void dynamic_scaled_fp8_quant(
  torch::Tensor& out,
  torch::Tensor& input,
  torch::Tensor& scale);
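For orientation, a minimal Python sketch of how the new repack op might be driven is shown below. It is an assumption-laden illustration, not part of the diff: the module path `vllm._C.ops` is assumed (the op name `gptq_marlin_repack` comes from the pybind11 registrations later in this diff), and the shape arithmetic follows the checks in `gptq_marlin_repack.cu` (tile size 16, `pack_factor = 32 / num_bits`, empty `perm` meaning no act_order).

```python
import torch
from vllm import _C  # assumption: the compiled extension is exposed as vllm._C

num_bits = 4
pack_factor = 32 // num_bits          # 8 int4 values packed per int32
size_k, size_n = 4096, 4096           # must be divisible by the 16x64 marlin tile

# GPTQ stores the quantized weight as int32 with size_k packed along dim 0.
b_q_weight = torch.zeros(size_k // pack_factor, size_n,
                         dtype=torch.int32, device="cuda")
# An empty perm tensor means no act_order (see `has_perm = perm.size(0) != 0`).
perm = torch.empty(0, dtype=torch.int32, device="cuda")

marlin_qweight = _C.ops.gptq_marlin_repack(b_q_weight, perm,
                                           size_k, size_n, num_bits)
# Output layout per the kernel: [size_k / 16, size_n * 16 / pack_factor]
assert marlin_qweight.shape == (size_k // 16, size_n * 16 // pack_factor)
```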
@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)

@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16)
@ -74,6 +74,74 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
||||
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
|
||||
// and vllm/tests/lora/test_punica.py
|
||||
|
||||
// Used for defining kernels going from the variety of
|
||||
// dim in to the narrow dim out
|
||||
// Using it for the fully sharded column
|
||||
// parallel LoRA A which splits the rank dim
|
||||
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
|
||||
f(in_T, out_T, W_T, 128, narrow) \
|
||||
f(in_T, out_T, W_T, 256, narrow) \
|
||||
f(in_T, out_T, W_T, 512, narrow) \
|
||||
f(in_T, out_T, W_T, 640, narrow) \
|
||||
f(in_T, out_T, W_T, 768, narrow) \
|
||||
f(in_T, out_T, W_T, 1024, narrow) \
|
||||
f(in_T, out_T, W_T, 1152, narrow) \
|
||||
f(in_T, out_T, W_T, 1280, narrow) \
|
||||
f(in_T, out_T, W_T, 1536, narrow) \
|
||||
f(in_T, out_T, W_T, 1728, narrow) \
|
||||
f(in_T, out_T, W_T, 1792, narrow) \
|
||||
f(in_T, out_T, W_T, 2048, narrow) \
|
||||
f(in_T, out_T, W_T, 2304, narrow) \
|
||||
f(in_T, out_T, W_T, 2560, narrow) \
|
||||
f(in_T, out_T, W_T, 2752, narrow) \
|
||||
f(in_T, out_T, W_T, 2816, narrow) \
|
||||
f(in_T, out_T, W_T, 3072, narrow) \
|
||||
f(in_T, out_T, W_T, 3456, narrow) \
|
||||
f(in_T, out_T, W_T, 3584, narrow) \
|
||||
f(in_T, out_T, W_T, 4096, narrow) \
|
||||
f(in_T, out_T, W_T, 4608, narrow) \
|
||||
f(in_T, out_T, W_T, 5120, narrow) \
|
||||
f(in_T, out_T, W_T, 5504, narrow) \
|
||||
f(in_T, out_T, W_T, 5632, narrow) \
|
||||
f(in_T, out_T, W_T, 6144, narrow) \
|
||||
f(in_T, out_T, W_T, 6848, narrow) \
|
||||
f(in_T, out_T, W_T, 6912, narrow) \
|
||||
f(in_T, out_T, W_T, 7168, narrow) \
|
||||
f(in_T, out_T, W_T, 8192, narrow) \
|
||||
f(in_T, out_T, W_T, 9216, narrow) \
|
||||
f(in_T, out_T, W_T, 10240, narrow) \
|
||||
f(in_T, out_T, W_T, 11008, narrow) \
|
||||
f(in_T, out_T, W_T, 12288, narrow) \
|
||||
f(in_T, out_T, W_T, 13696, narrow) \
|
||||
f(in_T, out_T, W_T, 13824, narrow) \
|
||||
f(in_T, out_T, W_T, 14336, narrow) \
|
||||
f(in_T, out_T, W_T, 15360, narrow) \
|
||||
f(in_T, out_T, W_T, 16384, narrow) \
|
||||
f(in_T, out_T, W_T, 20480, narrow) \
|
||||
f(in_T, out_T, W_T, 22016, narrow) \
|
||||
f(in_T, out_T, W_T, 24576, narrow) \
|
||||
f(in_T, out_T, W_T, 27392, narrow) \
|
||||
f(in_T, out_T, W_T, 28672, narrow) \
|
||||
f(in_T, out_T, W_T, 32000, narrow) \
|
||||
f(in_T, out_T, W_T, 32256, narrow) \
|
||||
f(in_T, out_T, W_T, 32512, narrow) \
|
||||
f(in_T, out_T, W_T, 32768, narrow) \
|
||||
f(in_T, out_T, W_T, 33024, narrow) \
|
||||
f(in_T, out_T, W_T, 36864, narrow) \
|
||||
f(in_T, out_T, W_T, 43264, narrow) \
|
||||
f(in_T, out_T, W_T, 49152, narrow) \
|
||||
f(in_T, out_T, W_T, 64000, narrow) \
|
||||
f(in_T, out_T, W_T, 64256, narrow) \
|
||||
f(in_T, out_T, W_T, 64512, narrow) \
|
||||
f(in_T, out_T, W_T, 102400, narrow) \
|
||||
f(in_T, out_T, W_T, 102656, narrow) \
|
||||
f(in_T, out_T, W_T, 102912, narrow) \
|
||||
f(in_T, out_T, W_T, 128000, narrow) \
|
||||
f(in_T, out_T, W_T, 128256, narrow) \
|
||||
f(in_T, out_T, W_T, 128512, narrow) \
|
||||
// Keep above in sync with vllm/lora/layers::SamplerWithLoRA
|
||||
|
||||
|
||||
// Keep this in sync with vllm/config::LoRAConfig
|
||||
#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
|
||||
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \
|
||||
@ -81,4 +149,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
||||
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \
|
||||
FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64)
|
||||
|
||||
|
||||
#define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
|
||||
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \
|
||||
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \
|
||||
FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \
|
||||
f(in_T, out_T, W_T, 8, 64) \
|
||||
f(in_T, out_T, W_T, 16, 64) \
|
||||
f(in_T, out_T, W_T, 32, 64) \
|
||||
f(in_T, out_T, W_T, 64, 64)
|
||||
|
||||
// clang-format on
|
||||
|
@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half)

@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half)

@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16)

@@ -2,3 +2,4 @@
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half)
@@ -199,7 +199,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
  constexpr int tz = 4;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if constexpr (feat_in < feat_out) {
  if constexpr (feat_in <= feat_out) {
    static_assert(feat_in % vec_size == 0);
    constexpr int tx = feat_in / vec_size;

@@ -289,6 +289,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
      int64_t y_offset, int64_t full_y_size, int64_t batch_size, \
      int64_t num_layers, int64_t layer_idx, float scale);

#define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out) \
  INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)

#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide) \
  INST_BGMV(narrow, wide, in_T, out_T, W_T) \
  INST_BGMV(wide, narrow, in_T, out_T, W_T)
@@ -10,6 +10,7 @@ TEMPLATE = """
#include "bgmv_impl.cuh"

FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype})
""".lstrip()  # noqa: E501

for input_dtype in DTYPES:
@@ -79,12 +79,12 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
    CASE_ONESIDE(in_T, out_T, W_T, wide, narrow)

    FOR_BGMV_WIDE_NARROW(CASE, _, _, _)
    FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _)
#undef CASE
#undef CASE_ONESIDE
  default:
    return false;
  }

  return true;
}
@@ -67,13 +67,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm, "gptq_marlin Optimized Quantized GEMM for GPTQ");
  ops.def("gptq_marlin_repack", &gptq_marlin_repack, "gptq_marlin repack from GPTQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
#endif

  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
  ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
  ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
  ops.def(
    "moe_align_block_size",
    &moe_align_block_size,
@@ -93,6 +96,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "reshape_and_cache_flash",
    &reshape_and_cache_flash,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "convert_fp8",
    &convert_fp8,
@@ -74,7 +74,30 @@ __global__ void scaled_fp8_quant_kernel(

} // namespace vllm

void scaled_fp8_quant(
void static_scaled_fp8_quant(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input,    // [..., d]
  torch::Tensor& scale)    // [1]
{
  int64_t num_tokens = input.numel() / input.size(-1);
  int64_t num_elems = input.numel();
  dim3 grid(num_tokens);
  dim3 block(1024);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
    "scaled_fp8_quant_kernel",
    [&] {
      vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
        out.data_ptr<c10::Float8_e4m3fn>(),
        input.data_ptr<scalar_t>(),
        scale.data_ptr<float>(),
        num_elems);
    });
}

void dynamic_scaled_fp8_quant(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input,    // [..., d]
  torch::Tensor& scale)    // [1]
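As a rough illustration of the split between the two entry points, a Python-level sketch might look like the following. The module path `vllm._C.ops` and the calling convention are assumptions; the argument order (`out`, `input`, `scale`) mirrors the C++ signatures above, and the behavioral difference is taken from the op docstrings in the pybind section.

```python
import torch
from vllm import _C  # assumption: the extension built from this code

x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")
out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
scale = torch.zeros(1, dtype=torch.float32, device="cuda")

# Dynamic: the kernel computes the scaling factor from the input, then quantizes.
_C.ops.dynamic_scaled_fp8_quant(out, x, scale)

# Static: the caller supplies a precomputed scale (e.g. from calibration);
# only the quantization kernel runs.
precomputed_scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")
_C.ops.static_scaled_fp8_quant(out, x, precomputed_scale)
```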
csrc/quantization/gptq_marlin/gptq_marlin.cu (new file, 1722 lines)
File diff suppressed because it is too large.

csrc/quantization/gptq_marlin/gptq_marlin.cuh (new file, 70 lines)
@@ -0,0 +1,70 @@
#pragma once

#include <torch/extension.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per
// schedule allows some more latency hiding. At the same time, we want relatively few warps to have
// many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("{\n"
               "   .reg .pred p;\n"
               "   setp.ne.b32 p, %0, 0;\n"
               "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
               "}\n" ::"r"((int)pred),
               "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("{\n"
               "   cp.async.cg.shared.global [%0], [%1], %2;\n"
               "}\n" ::"r"(smem),
               "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); }

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin
csrc/quantization/gptq_marlin/gptq_marlin_repack.cu (new file, 352 lines)
@@ -0,0 +1,352 @@
|
||||
#include "gptq_marlin.cuh"
|
||||
|
||||
namespace gptq_marlin {
|
||||
|
||||
static constexpr int repack_stages = 8;
|
||||
|
||||
static constexpr int repack_threads = 256;
|
||||
|
||||
static constexpr int tile_k_size = tile_size;
|
||||
static constexpr int tile_n_size = tile_k_size * 4;
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
|
||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
||||
__global__ void
|
||||
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
|
||||
uint32_t const *__restrict__ perm_ptr,
|
||||
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {}
|
||||
|
||||
} // namespace gptq_marlin
|
||||
|
||||
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
|
||||
int64_t size_k, int64_t size_n,
|
||||
int64_t num_bits) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
|
||||
return torch::empty({1, 1});
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template <int const num_threads, int const num_bits, bool const has_perm>
|
||||
__global__ void
|
||||
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
|
||||
uint32_t const *__restrict__ perm_ptr,
|
||||
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {
|
||||
constexpr int pack_factor = 32 / num_bits;
|
||||
|
||||
int k_tiles = size_k / tile_k_size;
|
||||
int n_tiles = size_n / tile_n_size;
|
||||
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
|
||||
|
||||
int start_k_tile = blockIdx.x * block_k_tiles;
|
||||
if (start_k_tile >= k_tiles) {
|
||||
return;
|
||||
}
|
||||
|
||||
int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
|
||||
|
||||
// Wait until the next thread tile has been loaded to shared memory.
|
||||
auto wait_for_stage = [&]() {
|
||||
// We only have `stages - 2` active fetches since we are double buffering
|
||||
// and can only issue the next fetch when it is guaranteed that the previous
|
||||
// shared memory load is fully complete (as it may otherwise be
|
||||
// overwritten).
|
||||
cp_async_wait<repack_stages - 2>();
|
||||
__syncthreads();
|
||||
};
|
||||
|
||||
extern __shared__ int4 sh[];
|
||||
|
||||
constexpr int perm_size = tile_k_size / 4;
|
||||
|
||||
int4 *sh_perm_ptr = sh;
|
||||
int4 *sh_pipe_ptr = sh_perm_ptr;
|
||||
if constexpr (has_perm) {
|
||||
sh_pipe_ptr += perm_size;
|
||||
}
|
||||
|
||||
constexpr int tile_ints = tile_k_size / pack_factor;
|
||||
|
||||
constexpr int stage_n_threads = tile_n_size / 4;
|
||||
constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
|
||||
constexpr int stage_size = stage_k_threads * stage_n_threads;
|
||||
|
||||
auto load_perm_to_shared = [&](int k_tile_id) {
|
||||
int first_k_int4 = (k_tile_id * tile_k_size) / 4;
|
||||
|
||||
int4 const *perm_int4_ptr = reinterpret_cast<int4 const *>(perm_ptr);
|
||||
|
||||
if (threadIdx.x < perm_size) {
|
||||
sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
|
||||
}
|
||||
__syncthreads();
|
||||
};
|
||||
|
||||
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
|
||||
if (n_tile_id >= n_tiles) {
|
||||
cp_async_fence();
|
||||
return;
|
||||
}
|
||||
|
||||
int first_n = n_tile_id * tile_n_size;
|
||||
|
||||
int4 *sh_ptr = sh_pipe_ptr + stage_size * pipe;
|
||||
|
||||
if constexpr (has_perm) {
|
||||
if (threadIdx.x < stage_size) {
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
uint32_t const *sh_perm_int_ptr =
|
||||
reinterpret_cast<uint32_t const *>(sh_perm_ptr);
|
||||
|
||||
int src_k = sh_perm_int_ptr[k_id];
|
||||
int src_k_packed = src_k / pack_factor;
|
||||
|
||||
cp_async4(
|
||||
&sh_ptr[k_id * stage_n_threads + n_id],
|
||||
reinterpret_cast<int4 const *>(&(
|
||||
b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
|
||||
}
|
||||
|
||||
} else {
|
||||
if (threadIdx.x < stage_size) {
|
||||
int k_id = threadIdx.x / stage_n_threads;
|
||||
int n_id = threadIdx.x % stage_n_threads;
|
||||
|
||||
int first_k = k_tile_id * tile_k_size;
|
||||
int first_k_packed = first_k / pack_factor;
|
||||
|
||||
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
|
||||
reinterpret_cast<int4 const *>(
|
||||
&(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
|
||||
first_n + (n_id * 4)])));
|
||||
}
|
||||
}
|
||||
|
||||
cp_async_fence();
|
||||
};
|
||||
|
||||
auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
|
||||
if (n_tile_id >= n_tiles) {
|
||||
return;
|
||||
}
|
||||
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int th_id = threadIdx.x % 32;
|
||||
|
||||
if (warp_id >= 4) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tc_col = th_id / 4;
|
||||
int tc_row = (th_id % 4) * 2;
|
||||
|
||||
constexpr int tc_offsets[4] = {0, 1, 8, 9};
|
||||
|
||||
int cur_n = warp_id * 16 + tc_col;
|
||||
|
||||
constexpr int sh_stride = 64;
|
||||
constexpr uint32_t mask = (1 << num_bits) - 1;
|
||||
|
||||
int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
|
||||
uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
|
||||
|
||||
uint32_t *sh_perm_int_ptr = reinterpret_cast<uint32_t *>(sh_perm_ptr);
|
||||
|
||||
uint32_t vals[8];
|
||||
|
||||
if constexpr (has_perm) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int k_idx = tc_row + tc_offsets[i];
|
||||
|
||||
uint32_t src_k = sh_perm_int_ptr[k_idx];
|
||||
uint32_t src_k_pos = src_k % pack_factor;
|
||||
|
||||
uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
|
||||
uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
|
||||
|
||||
uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
|
||||
uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
|
||||
|
||||
vals[i] = b1_cur_val;
|
||||
vals[4 + i] = b2_cur_val;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
uint32_t b1_vals[tile_ints];
|
||||
uint32_t b2_vals[tile_ints];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < tile_ints; i++) {
|
||||
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
|
||||
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int cur_elem = tc_row + tc_offsets[i];
|
||||
int cur_int = cur_elem / pack_factor;
|
||||
int cur_pos = cur_elem % pack_factor;
|
||||
|
||||
vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
|
||||
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
|
||||
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
|
||||
|
||||
// Result of:
|
||||
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
|
||||
if constexpr (num_bits == 4) {
|
||||
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
|
||||
|
||||
uint32_t res = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
res |= vals[pack_idx[i]] << (i * 4);
|
||||
}
|
||||
|
||||
out_ptr[out_offset + th_id * 4 + warp_id] = res;
|
||||
|
||||
} else {
|
||||
constexpr int pack_idx[4] = {0, 2, 1, 3};
|
||||
|
||||
uint32_t res1 = 0;
|
||||
uint32_t res2 = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
res1 |= vals[pack_idx[i]] << (i * 8);
|
||||
res2 |= vals[4 + pack_idx[i]] << (i * 8);
|
||||
}
|
||||
|
||||
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
|
||||
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
|
||||
}
|
||||
};
|
||||
|
||||
auto start_pipes = [&](int k_tile_id, int n_tile_id) {
|
||||
#pragma unroll
|
||||
for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
|
||||
fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
|
||||
}
|
||||
|
||||
wait_for_stage();
|
||||
};
|
||||
#pragma unroll
|
||||
for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
|
||||
int n_tile_id = 0;
|
||||
|
||||
if constexpr (has_perm) {
|
||||
load_perm_to_shared(k_tile_id);
|
||||
}
|
||||
|
||||
start_pipes(k_tile_id, n_tile_id);
|
||||
|
||||
while (n_tile_id < n_tiles) {
|
||||
#pragma unroll
|
||||
for (int pipe = 0; pipe < repack_stages; pipe++) {
|
||||
fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
|
||||
n_tile_id + pipe + repack_stages - 1);
|
||||
repack_tile(pipe, k_tile_id, n_tile_id + pipe);
|
||||
wait_for_stage();
|
||||
}
|
||||
n_tile_id += repack_stages;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace gptq_marlin
|
||||
|
||||
#define CALL_IF(NUM_BITS, HAS_PERM) \
|
||||
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
|
||||
cudaFuncSetAttribute( \
|
||||
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, \
|
||||
NUM_BITS, HAS_PERM>, \
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
|
||||
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
|
||||
HAS_PERM> \
|
||||
<<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>( \
|
||||
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
|
||||
}
|
||||
|
||||
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
|
||||
int64_t size_k, int64_t size_n,
|
||||
int64_t num_bits) {
|
||||
// Verify compatibility with marlin tile of 16x64
|
||||
TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
|
||||
" is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
|
||||
TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
|
||||
" is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
|
||||
|
||||
TORCH_CHECK(num_bits == 4 || num_bits == 8,
|
||||
"num_bits must be 4 or 8. Got = ", num_bits);
|
||||
int const pack_factor = 32 / num_bits;
|
||||
|
||||
// Verify B
|
||||
TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
|
||||
"Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
|
||||
", size_k = ", size_k, ", pack_factor = ", pack_factor);
|
||||
TORCH_CHECK(b_q_weight.size(1) == size_n,
|
||||
"b_q_weight.size(1) = ", b_q_weight.size(1),
|
||||
" is not size_n = ", size_n);
|
||||
|
||||
// Verify device and strides
|
||||
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
|
||||
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
|
||||
TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
|
||||
|
||||
TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
|
||||
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
|
||||
TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
|
||||
|
||||
// Alloc buffers
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
|
||||
auto options = torch::TensorOptions()
|
||||
.dtype(b_q_weight.dtype())
|
||||
.device(b_q_weight.device());
|
||||
torch::Tensor out =
|
||||
torch::empty({size_k / gptq_marlin::tile_size,
|
||||
size_n * gptq_marlin::tile_size / pack_factor},
|
||||
options);
|
||||
|
||||
// Detect if there is act_order
|
||||
bool has_perm = perm.size(0) != 0;
|
||||
|
||||
// Get ptrs
|
||||
uint32_t const *b_q_weight_ptr =
|
||||
reinterpret_cast<uint32_t const *>(b_q_weight.data_ptr());
|
||||
uint32_t const *perm_ptr =
|
||||
reinterpret_cast<uint32_t const *>(perm.data_ptr());
|
||||
uint32_t *out_ptr = reinterpret_cast<uint32_t *>(out.data_ptr());
|
||||
|
||||
// Get dev info
|
||||
int dev = b_q_weight.get_device();
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
int blocks;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
|
||||
int max_shared_mem = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_mem,
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
|
||||
TORCH_CHECK(max_shared_mem > 0);
|
||||
|
||||
if (false) {
|
||||
}
|
||||
CALL_IF(4, false)
|
||||
CALL_IF(4, true)
|
||||
CALL_IF(8, false)
|
||||
CALL_IF(8, true)
|
||||
else {
|
||||
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
|
||||
", has_perm = ", has_perm);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
#endif
|
@ -67,20 +67,13 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
|
||||
"r"(smem), "l"(glob_ptr), "n"(BYTES));
|
||||
}
|
||||
|
||||
// Asynchronous global->shared copy with a cache hint indicating that the values
|
||||
// may be evicted immediately; used for quantized weights B, which are only
|
||||
// accessed precisely once and should thus not pollute the L2 cache which we
|
||||
// need for inputs A and outputs C.
|
||||
__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) {
|
||||
// Asynchronous global->shared copy
|
||||
__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
|
||||
const int BYTES = 16;
|
||||
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
|
||||
asm volatile(
|
||||
"{\n"
|
||||
" .reg .b64 p;\n"
|
||||
" createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
|
||||
" cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
|
||||
"}\n" ::"r"(smem),
|
||||
"l"(glob_ptr), "n"(BYTES));
|
||||
asm volatile("{\n"
|
||||
" cp.async.cg.shared.global [%0], [%1], %2;\n"
|
||||
"}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
|
||||
}
|
||||
|
||||
// Async copy fence.
|
||||
@ -448,14 +441,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
|
||||
int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < b_sh_wr_iters; i++) {
|
||||
cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
|
||||
cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
|
||||
B_ptr[i] += b_gl_rd_delta_o;
|
||||
}
|
||||
// Only fetch scales if this tile starts a new group
|
||||
if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||
int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
if (s_sh_wr_pred)
|
||||
cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
|
||||
s_gl_rd += s_gl_rd_delta;
|
||||
}
|
||||
}
|
||||
@ -750,7 +743,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
|
||||
// write-out
|
||||
if (group_blocks == -1 && last) {
|
||||
if (s_sh_wr_pred)
|
||||
cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]);
|
||||
cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
|
||||
cp_async_fence();
|
||||
}
|
||||
thread_block_reduce();
|
||||
|
docs/source/assets/dev/dockerfile-stages-dependency.png (new binary file, 115 KiB; not shown)
@@ -98,9 +98,10 @@ autodoc_mock_imports = [
for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        logger.info(
            f"Potentially problematic mock target ({mock_target}) found; "
            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
            "been loaded into sys.modules when the sphinx build starts.")
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)


class MockedClassDocumenter(autodoc.ClassDocumenter):
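The change above swaps an f-string for printf-style logging arguments. A small standalone illustration of the difference (not part of the diff):

```python
import logging

logger = logging.getLogger(__name__)
mock_target = "torch"

# f-string: the message is formatted even when INFO logging is disabled.
logger.info(f"Potentially problematic mock target ({mock_target}) found")

# %-style arguments (as in the patched conf.py): formatting is deferred
# until the log record is actually emitted.
logger.info("Potentially problematic mock target (%s) found", mock_target)
```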
docs/source/dev/dockerfile/dockerfile.rst (new file, 50 lines)
@@ -0,0 +1,50 @@
Dockerfile
====================

See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct
the image for running an OpenAI compatible server with vLLM.

- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

  - All build stages
  - The default build target (highlighted in grey)
  - External images (with dashed borders)

  The edges of the build graph represent:

  - FROM ... dependencies (with a solid line and a full arrow head)
  - COPY --from=... dependencies (with a dashed line and an empty arrow head)
  - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)

  .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
    :alt: query
    :width: 100%
    :align: center

  Made using: https://github.com/patrickhoefler/dockerfilegraph

  Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the Dockerfile is present):

  .. code:: bash

    dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile

  or in case you want to run it directly with the docker image:

  .. code:: bash

    docker run \
       --rm \
       --user "$(id -u):$(id -g)" \
       --workdir /workspace \
       --volume "$(pwd)":/workspace \
       ghcr.io/patrickhoefler/dockerfilegraph:alpine \
       --output png \
       --dpi 200 \
       --max-label-length 50 \
       --filename Dockerfile \
       --legend

  (To run it for a different file, you can pass in a different argument to the flag `--filename`.)
@@ -3,9 +3,7 @@
Installation with ROCm
======================

vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm.
At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
Data types currently supported in ROCm are FP16 and BF16.
vLLM supports AMD GPUs with ROCm 5.7 and 6.0.

Requirements
------------
@ -13,114 +11,57 @@ Requirements
|
||||
* OS: Linux
|
||||
* Python: 3.8 -- 3.11
|
||||
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
|
||||
* Pytorch 2.0.1/2.1.1/2.2
|
||||
* ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9)
|
||||
* ROCm 6.0 and ROCm 5.7
|
||||
|
||||
Installation options:
|
||||
|
||||
#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image <quick_start_docker_rocm>`
|
||||
#. :ref:`Build from source <build_from_source_rocm>`
|
||||
#. :ref:`Build from source with docker <build_from_source_docker_rocm>`
|
||||
|
||||
.. _quick_start_docker_rocm:
|
||||
|
||||
(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This option is for ROCm 5.7 only:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
|
||||
$ docker run -it \
|
||||
--network=host \
|
||||
--group-add=video \
|
||||
--ipc=host \
|
||||
--cap-add=SYS_PTRACE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--device /dev/kfd \
|
||||
--device /dev/dri \
|
||||
-v <path/to/model>:/app/model \
|
||||
embeddedllminfo/vllm-rocm \
|
||||
bash
|
||||
|
||||
|
||||
.. _build_from_source_rocm:
|
||||
|
||||
Option 2: Build from source
|
||||
---------------------------
|
||||
|
||||
You can build and install vLLM from source:
|
||||
|
||||
Below instruction is for ROCm 5.7 only.
|
||||
At the time of this documentation update, PyTorch on ROCm 6.0 wheel is not yet available on the PyTorch website.
|
||||
|
||||
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
|
||||
|
||||
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
|
||||
- `Pytorch <https://pytorch.org/>`_
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version
|
||||
|
||||
|
||||
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
|
||||
|
||||
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
|
||||
|
||||
.. note::
|
||||
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
|
||||
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
|
||||
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
|
||||
- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
|
||||
|
||||
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ pip install xformers==0.0.23 --no-deps
|
||||
$ bash patch_xformers.rocm.sh
|
||||
|
||||
3. Build vLLM.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ cd vllm
|
||||
$ pip install -U -r requirements-rocm.txt
|
||||
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
|
||||
|
||||
#. :ref:`Build from source <build_from_source_rocm>`
|
||||
|
||||
.. _build_from_source_docker_rocm:
|
||||
|
||||
Option 3: Build from source with docker
|
||||
Option 1: Build from source with docker (recommended)
|
||||
-----------------------------------------------------
|
||||
|
||||
You can build and install vLLM from source:
|
||||
You can build and install vLLM from source.
|
||||
|
||||
Build a docker image from `Dockerfile.rocm`, and launch a docker container.
|
||||
First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
|
||||
|
||||
The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
|
||||
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
|
||||
It provides flexibility to customize the build of docker image using the following arguments:
|
||||
|
||||
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
|
||||
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
|
||||
* `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `3d2b6f5`
|
||||
* `BUILD_FA`: specifies whether to build flash-attention. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
|
||||
* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
|
||||
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
|
||||
* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
|
||||
* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
|
||||
|
||||
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
|
||||
|
||||
For example, to build docker image for vllm on ROCm 5.7, you can run:
|
||||
|
||||
To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker build -f Dockerfile.rocm -t vllm-rocm .
|
||||
|
||||
To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
|
||||
|
||||
To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
|
||||
-f Dockerfile.rocm -t vllm-rocm .
|
||||
|
||||
To build vllm on ROCm 6.0, you can use the default:
|
||||
To run the above docker image ``vllm-rocm``, use the below command:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker build -f Dockerfile.rocm -t vllm-rocm .
|
||||
$ docker run -it \
|
||||
--network=host \
|
||||
--group-add=video \
|
||||
@ -133,7 +74,13 @@ To build vllm on ROCm 6.0, you can use the default:
|
||||
vllm-rocm \
|
||||
bash
|
||||
|
||||
Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below:
|
||||
Where `<path/to/model>` is the location where the model is stored, for example, the weights for the Llama 2 or Llama 3 models.
|
||||
|
||||
|
||||
.. _build_from_source_rocm:
|
||||
|
||||
Option 2: Build from source
|
||||
---------------------------
|
||||
|
||||
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
|
||||
|
||||
@ -141,22 +88,37 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
|
||||
- `Pytorch <https://pytorch.org/>`_
|
||||
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
|
||||
|
||||
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
|
||||
For installing PyTorch, you can start from a fresh docker image, e.g. `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
|
||||
|
||||
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
|
||||
Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide on the PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_ page.
|
||||
|
||||
.. note::
|
||||
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
|
||||
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
|
||||
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
|
||||
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
|
||||
|
||||
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
|
||||
For rocm6.0:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ pip install xformers==0.0.23 --no-deps
|
||||
$ bash patch_xformers.rocm.sh
|
||||
$ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
|
||||
|
||||
|
||||
For rocm5.7:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
|
||||
|
||||
|
||||
1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_
|
||||
|
||||
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_
|
||||
|
||||
2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm>`_
|
||||
|
||||
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
|
||||
|
||||
.. note::
|
||||
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
|
||||
- If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
|
||||
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
|
||||
- You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
|
||||
|
||||
3. Build vLLM.
|
||||
|
||||
@ -164,9 +126,12 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
|
||||
|
||||
$ cd vllm
|
||||
$ pip install -U -r requirements-rocm.txt
|
||||
$ python setup.py install # This may take 5-10 minutes.
|
||||
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
|
||||
|
||||
.. note::
|
||||
|
||||
.. tip::
|
||||
|
||||
- You may need to turn on the ``--enforce-eager`` flag if you experience a process hang when running the `benchmark_throughput.py` script to test your installation.
|
||||
|
||||
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
|
||||
- To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention.
|
||||
- The ROCm version of pytorch, ideally, should match the ROCm driver version.
|
||||
|
@@ -53,6 +53,7 @@ You can also build and install vLLM from source:

   $ git clone https://github.com/vllm-project/vllm.git
   $ cd vllm
   $ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability
   $ pip install -e .  # This may take 5-10 minutes.

.. tip::
@@ -75,6 +75,7 @@ Documentation
   serving/deploying_with_docker
   serving/distributed_serving
   serving/metrics
   serving/env_vars
   serving/usage_stats
   serving/integrations

@@ -86,6 +87,7 @@ Documentation
   models/adding_model
   models/engine_args
   models/lora
   models/performance

.. toctree::
   :maxdepth: 1
@@ -102,6 +104,7 @@ Documentation
   dev/sampling_params
   dev/engine/engine_index
   dev/kernel/paged_attention
   dev/dockerfile/dockerfile

Indices and tables
==================
38
docs/source/models/performance.rst
Normal file
38
docs/source/models/performance.rst
Normal file
@ -0,0 +1,38 @@
|
||||
.. _performance:
|
||||
|
||||
Performance and Tuning
|
||||
======================
|
||||
|
||||
Chunked Prefill
|
||||
---------------
|
||||
vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
|
||||
|
||||
You can enable the feature by specifying
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
|
||||
# Set max_num_batched_tokens to tune performance.
|
||||
# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
|
||||
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
|
||||
|
||||
By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to thefirst token), but incurs slower ITL (inter token latency) and inefficient GPU utilization.
|
||||
|
||||
Once chunked prefill is enabled, the policy is changed to
|
||||
|
||||
- prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill.
|
||||
- When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it.
|
||||
|
||||
This policy has two benefits.
|
||||
|
||||
- It improves ITL (inter token latency) and generation decode because decode requests are prioritized.
|
||||
- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
|
||||
|
||||
You can tune the performance by changing `max_num_batched_tokens`.
|
||||
By default, it is set to 512, which achieved the best ITL on A100 in the initial benchmarks.
|
||||
A smaller `max_num_batched_tokens` achieves better ITL because fewer prefills interrupt decodes.
|
||||
A larger `max_num_batched_tokens` achieves better TTFT because more prefill tokens can be put in each batch.
|
||||
If `max_num_batched_tokens` is the same as `max_model_len`, that's almost equivalent to the default scheduling policy (except that it still prioritizes decodes).
|
||||
Note that the default value (512) is optimized for ITL and may yield lower throughput than the default scheduler. We recommend setting `max_num_batched_tokens > 2048` for throughput.
|
||||
|
||||
See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369).
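As a rough illustration of the tuning advice above (a sketch only, not a benchmarked recipe; the value ``4096`` is a placeholder consistent with the ``> 2048`` recommendation):

.. code-block:: python

    from vllm import LLM

    # Latency-oriented: keep the default token budget (512) so that decodes
    # are interrupted only by small prefill chunks.
    llm_latency = LLM(model="meta-llama/Llama-2-7b-hf",
                      enable_chunked_prefill=True,
                      max_num_batched_tokens=512)

    # Throughput-oriented: a larger budget (> 2048) packs more prefill tokens
    # into each batch at the cost of higher inter-token latency.
    llm_throughput = LLM(model="meta-llama/Llama-2-7b-hf",
                         enable_chunked_prefill=True,
                         max_num_batched_tokens=4096)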
|
@ -101,7 +101,7 @@ Alongside each architecture, we include some popular models that use it.
|
||||
-
|
||||
* - :code:`OLMoForCausalLM`
|
||||
- OLMo
|
||||
- :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
|
||||
- :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
|
||||
-
|
||||
* - :code:`OPTForCausalLM`
|
||||
- OPT, OPT-IML
|
||||
@ -115,6 +115,10 @@ Alongside each architecture, we include some popular models that use it.
|
||||
- Phi
|
||||
- :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
|
||||
-
|
||||
* - :code:`Phi3ForCausalLM`
|
||||
- Phi-3
|
||||
- :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
|
||||
-
|
||||
* - :code:`QWenLMHeadModel`
|
||||
- Qwen
|
||||
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
|
||||
|
@ -49,3 +49,6 @@ To run vLLM:
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
vllm/vllm-openai <args...>
|
||||
|
||||
.. note::
|
||||
|
||||
The vLLM docker image is currently designed to be run as the root user (contributions to change this are welcome!). It will try to load a library at runtime under the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`. If you are running the container as a different user, you may need to change the permissions of the library (and all the parent directories) to allow that user to access it. Then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
|
||||
|
9
docs/source/serving/env_vars.rst
Normal file
9
docs/source/serving/env_vars.rst
Normal file
@ -0,0 +1,9 @@
|
||||
Environment Variables
|
||||
========================
|
||||
|
||||
vLLM uses the following environment variables to configure the system:
|
||||
|
||||
.. literalinclude:: ../../../vllm/envs.py
|
||||
:language: python
|
||||
:start-after: begin-env-vars-definition
|
||||
:end-before: end-env-vars-definition
|
@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat
|
||||
|
||||
You can start the server using Python, or using [Docker](deploying_with_docker.rst):
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
|
||||
python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
|
||||
```
|
||||
|
||||
To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
|
||||
@ -16,7 +16,7 @@ client = OpenAI(
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
@ -37,7 +37,7 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
|
||||
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
model="NousResearch/Meta-Llama-3-8B-Instruct",
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
@ -87,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
|
||||
a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
|
||||
specifies how roles, messages, and other chat-specific tokens are encoded in the input.
|
||||
|
||||
An example chat template for `mistralai/Mistral-7B-Instruct-v0.2` can be found [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format)
|
||||
An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
|
||||
|
||||
Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
|
||||
you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat
|
||||
|
178
examples/logging_configuration.md
Normal file
178
examples/logging_configuration.md
Normal file
@ -0,0 +1,178 @@
|
||||
# Logging Configuration
|
||||
|
||||
vLLM leverages Python's `logging.config.dictConfig` functionality to enable
|
||||
robust and flexible configuration of the various loggers used by vLLM.
|
||||
|
||||
vLLM offers two environment variables that can be used to accommodate a range
|
||||
of logging configurations that range from simple-and-inflexible to
|
||||
more-complex-and-more-flexible.
|
||||
|
||||
- No vLLM logging (simple and inflexible)
|
||||
- Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
|
||||
- vLLM's default logging configuration (simple and inflexible)
|
||||
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
|
||||
- Fine-grained custom logging configuration (more complex, more flexible)
|
||||
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
|
||||
set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
|
||||
|
||||
|
||||
## Logging Configuration Environment Variables
|
||||
|
||||
### `VLLM_CONFIGURE_LOGGING`
|
||||
|
||||
`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
|
||||
configure the loggers used by vLLM. This functionality is enabled by default,
|
||||
but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.
|
||||
|
||||
If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
|
||||
`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
|
||||
configure the root vLLM logger. By default, no other vLLM loggers are
|
||||
configured and, as such, all vLLM loggers defer to the root vLLM logger to make
|
||||
all logging decisions.
|
||||
|
||||
If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
|
||||
`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.
|
||||
|
||||
### `VLLM_LOGGING_CONFIG_PATH`
|
||||
|
||||
`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
|
||||
alternative, custom logging configuration that will be used instead of vLLM's
|
||||
built-in default logging configuration. The logging configuration should be
|
||||
provided in JSON format following the schema specified by Python's [logging
|
||||
configuration dictionary
|
||||
schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).
|
||||
|
||||
If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
|
||||
disabled, an error will occur while starting vLLM.
|
||||
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Customize vLLM root logger
|
||||
|
||||
For this example, we will customize the vLLM root logger to use
|
||||
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
|
||||
STDOUT of the console in JSON format with a log level of `INFO`.
|
||||
|
||||
First, create an appropriate JSON logging configuration file:
|
||||
|
||||
**/path/to/logging_config.json:**
|
||||
|
||||
```json
|
||||
{
|
||||
"formatters": {
|
||||
"json": {
|
||||
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
|
||||
}
|
||||
},
|
||||
"handlers": {
|
||||
"console": {
|
||||
"class" : "logging.StreamHandler",
|
||||
"formatter": "json",
|
||||
"level": "INFO",
|
||||
"stream": "ext://sys.stdout"
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"vllm": {
|
||||
"handlers": ["console"],
|
||||
"level": "INFO",
|
||||
"propagate": false
|
||||
}
|
||||
},
|
||||
"version": 1
|
||||
}
|
||||
```
|
||||
|
||||
Next, install the `python-json-logger` package if it's not already installed:
|
||||
|
||||
```bash
|
||||
pip install python-json-logger
|
||||
```
|
||||
|
||||
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
|
||||
to the path of the custom logging configuration JSON file:
|
||||
|
||||
```bash
|
||||
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--max-model-len 2048 \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
|
||||
### Example 2: Silence a particular vLLM logger
|
||||
|
||||
To silence a particular vLLM logger, it is necessary to provide custom logging
|
||||
configuration for the target logger so that it won't
|
||||
propagate its log messages to the root vLLM logger.
|
||||
|
||||
When custom configuration is provided for any logger, it is also necessary to
|
||||
provide configuration for the root vLLM logger since any custom logger
|
||||
configuration overrides the built-in default logging configuration used by vLLM.
|
||||
|
||||
First, create an appropriate JSON logging configuration file that includes
|
||||
configuration for the root vLLM logger and for the logger you wish to silence:
|
||||
|
||||
**/path/to/logging_config.json:**
|
||||
|
||||
```json
|
||||
{
|
||||
"formatters": {
|
||||
"vllm": {
|
||||
"class": "vllm.logging.NewLineFormatter",
|
||||
"datefmt": "%m-%d %H:%M:%S",
|
||||
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
|
||||
}
|
||||
},
|
||||
"handlers": {
|
||||
"vllm": {
|
||||
"class" : "logging.StreamHandler",
|
||||
"formatter": "vllm",
|
||||
"level": "INFO",
|
||||
"stream": "ext://sys.stdout"
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"vllm": {
|
||||
"handlers": ["vllm"],
|
||||
"level": "DEBUG",
|
||||
"propagage": false
|
||||
},
|
||||
"vllm.example_noisy_logger": {
|
||||
"propagate": false
|
||||
}
|
||||
},
|
||||
"version": 1
|
||||
}
|
||||
```
|
||||
|
||||
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
|
||||
to the path of the custom logging configuration JSON file:
|
||||
|
||||
```bash
|
||||
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--max-model-len 2048 \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
|
||||
### Example 3: Disable vLLM default logging configuration
|
||||
|
||||
To disable vLLM's default logging configuration and silence all vLLM loggers,
|
||||
simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This will prevent vLLM
|
||||
from configuring the root vLLM logger, which, in turn, silences all other vLLM
|
||||
loggers.
|
||||
|
||||
```bash
|
||||
VLLM_CONFIGURE_LOGGING=0 \
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--max-model-len 2048 \
|
||||
--model mistralai/Mistral-7B-v0.1
|
||||
```
|
||||
|
||||
|
||||
## Additional resources
|
||||
|
||||
- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
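For orientation, the JSON files above follow the standard `logging.config` dictionary schema, so applying one by hand looks roughly like the sketch below (standard library only; the path is a placeholder, and this is not vLLM's internal loading code):

```python
import json
import logging
import logging.config

# Load a dictConfig-style logging configuration from disk and apply it.
with open("/path/to/logging_config.json") as f:
    logging.config.dictConfig(json.load(f))

# Loggers under the "vllm" namespace now use the configured handlers.
logging.getLogger("vllm").info("logging configured")
```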
|
@ -873,6 +873,289 @@
|
||||
],
|
||||
"title": "Cache Utilization",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"type": "heatmap",
|
||||
"title": "Request Prompt Length",
|
||||
"description": "Heatmap of request prompt length",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 24,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"datasource": {
|
||||
"uid": "prometheus",
|
||||
"type": "prometheus"
|
||||
},
|
||||
"id": 12,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"refId": "A",
|
||||
"expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
|
||||
"range": true,
|
||||
"instant": false,
|
||||
"editorMode": "builder",
|
||||
"legendFormat": "{{le}}",
|
||||
"useBackend": false,
|
||||
"disableTextWrap": false,
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"format": "heatmap"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "none",
|
||||
"axisLabel": "Prompt Length"
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto",
|
||||
"value": "Request count"
|
||||
},
|
||||
"color": {
|
||||
"mode": "scheme",
|
||||
"fill": "dark-orange",
|
||||
"scale": "exponential",
|
||||
"exponent": 0.5,
|
||||
"scheme": "Spectral",
|
||||
"steps": 64,
|
||||
"reverse": false,
|
||||
"min": 0
|
||||
},
|
||||
"cellGap": 1,
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": true
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"cellValues": {
|
||||
"unit": "none"
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"pluginVersion": "10.2.0"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"uid": "prometheus",
|
||||
"type": "prometheus"
|
||||
},
|
||||
"type": "heatmap",
|
||||
"title": "Request Generation Length",
|
||||
"description": "Heatmap of request generation length",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 24,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"id": 13,
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"refId": "A",
|
||||
"expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
|
||||
"range": true,
|
||||
"instant": false,
|
||||
"editorMode": "builder",
|
||||
"legendFormat": "{{le}}",
|
||||
"useBackend": false,
|
||||
"disableTextWrap": false,
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"format": "heatmap"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"calculate": false,
|
||||
"yAxis": {
|
||||
"axisPlacement": "left",
|
||||
"reverse": false,
|
||||
"unit": "none",
|
||||
"axisLabel": "Generation Length"
|
||||
},
|
||||
"rowsFrame": {
|
||||
"layout": "auto",
|
||||
"value": "Request count"
|
||||
},
|
||||
"color": {
|
||||
"mode": "scheme",
|
||||
"fill": "dark-orange",
|
||||
"scale": "exponential",
|
||||
"exponent": 0.5,
|
||||
"scheme": "Spectral",
|
||||
"steps": 64,
|
||||
"reverse": false,
|
||||
"min": 0
|
||||
},
|
||||
"cellGap": 1,
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"yHistogram": true
|
||||
},
|
||||
"legend": {
|
||||
"show": true
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,0,255,0.7)"
|
||||
},
|
||||
"cellValues": {
|
||||
"unit": "none"
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"pluginVersion": "10.2.0"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "linear",
|
||||
"barAlignment": 0,
|
||||
"lineWidth": 1,
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"spanNulls": false,
|
||||
"insertNulls": false,
|
||||
"showPoints": "auto",
|
||||
"pointSize": 5,
|
||||
"stacking": {
|
||||
"mode": "none",
|
||||
"group": "A"
|
||||
},
|
||||
"axisPlacement": "auto",
|
||||
"axisLabel": "",
|
||||
"axisColorMode": "text",
|
||||
"axisBorderShow": false,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"axisCenteredZero": false,
|
||||
"hideFrom": {
|
||||
"tooltip": false,
|
||||
"viz": false,
|
||||
"legend": false
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
},
|
||||
"legend": {
|
||||
"showLegend": true,
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"calcs": []
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Finish Reason",
|
||||
"description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "",
|
||||
|
@ -95,7 +95,7 @@ echo 'vLLM yapf: Done'
|
||||
# Run mypy
|
||||
echo 'vLLM mypy:'
|
||||
mypy vllm/attention --config-file pyproject.toml
|
||||
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/core --config-file pyproject.toml
|
||||
mypy vllm/distributed --config-file pyproject.toml
|
||||
mypy vllm/entrypoints --config-file pyproject.toml
|
||||
mypy vllm/executor --config-file pyproject.toml
|
||||
@ -105,8 +105,10 @@ mypy vllm/transformers_utils --config-file pyproject.toml
|
||||
mypy vllm/engine --config-file pyproject.toml
|
||||
mypy vllm/worker --config-file pyproject.toml
|
||||
mypy vllm/spec_decode --config-file pyproject.toml
|
||||
mypy vllm/model_executor/*.py --config-file pyproject.toml
|
||||
# mypy vllm/lora/*.py --config-file pyproject.toml
|
||||
mypy vllm/model_executor --config-file pyproject.toml
|
||||
mypy vllm/lora --config-file pyproject.toml
|
||||
mypy vllm/logging --config-file pyproject.toml
|
||||
mypy vllm/model_executor --config-file pyproject.toml
|
||||
|
||||
|
||||
CODESPELL_EXCLUDES=(
|
||||
|
@ -5,7 +5,7 @@ requires = [
|
||||
"ninja",
|
||||
"packaging",
|
||||
"setuptools >= 49.4.0",
|
||||
"torch == 2.2.1",
|
||||
"torch == 2.3.0",
|
||||
"wheel",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
@ -32,6 +32,7 @@ select = [
|
||||
"SIM",
|
||||
# isort
|
||||
# "I",
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
|
@ -3,5 +3,5 @@ cmake>=3.21
|
||||
ninja
|
||||
packaging
|
||||
setuptools>=49.4.0
|
||||
torch==2.2.1
|
||||
torch==2.3.0
|
||||
wheel
|
||||
|
@ -8,9 +8,11 @@ py-cpuinfo
|
||||
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
|
||||
tokenizers >= 0.19.1 # Required for Llama 3.
|
||||
fastapi
|
||||
openai
|
||||
uvicorn[standard]
|
||||
pydantic >= 2.0 # Required for OpenAI server.
|
||||
prometheus_client >= 0.18.0
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken == 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer == 0.9.8
|
||||
outlines == 0.0.34 # Requires torch >= 2.1.0
|
||||
|
@ -2,5 +2,5 @@
|
||||
-r requirements-common.txt
|
||||
|
||||
# Dependencies for x86_64 CPUs
|
||||
torch == 2.2.1+cpu
|
||||
torch == 2.3.0+cpu
|
||||
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
|
@ -5,5 +5,5 @@
|
||||
ray >= 2.9
|
||||
nvidia-ml-py # for pynvml package
|
||||
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
|
||||
torch == 2.2.1
|
||||
xformers == 0.0.25 # Requires PyTorch 2.2.1
|
||||
torch == 2.3.0
|
||||
xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
|
||||
|
@ -14,19 +14,17 @@ types-setuptools
|
||||
|
||||
# testing
|
||||
pytest
|
||||
tensorizer==2.9.0a0
|
||||
tensorizer==2.9.0
|
||||
pytest-forked
|
||||
pytest-asyncio
|
||||
pytest-rerunfailures
|
||||
pytest-shard
|
||||
httpx
|
||||
einops # required for MPT
|
||||
openai
|
||||
requests
|
||||
ray
|
||||
peft
|
||||
awscli
|
||||
ai2-olmo # required for OLMo
|
||||
|
||||
# Benchmarking
|
||||
aiohttp
|
||||
|
45
setup.py
45
setup.py
@ -1,3 +1,4 @@
|
||||
import importlib.util
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
@ -13,10 +14,23 @@ from setuptools import Extension, find_packages, setup
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from torch.utils.cpp_extension import CUDA_HOME
|
||||
|
||||
|
||||
def load_module_from_path(module_name, path):
|
||||
spec = importlib.util.spec_from_file_location(module_name, path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
sys.modules[module_name] = module
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
ROOT_DIR = os.path.dirname(__file__)
|
||||
logger = logging.getLogger(__name__)
|
||||
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
|
||||
VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda")
|
||||
|
||||
# cannot import envs directly because it depends on vllm,
|
||||
# which is not installed yet
|
||||
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
|
||||
|
||||
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
|
||||
|
||||
# vLLM only supports Linux platform
|
||||
assert sys.platform.startswith(
|
||||
@ -60,10 +74,10 @@ class cmake_build_ext(build_ext):
|
||||
def compute_num_jobs(self):
|
||||
# `num_jobs` is either the value of the MAX_JOBS environment variable
|
||||
# (if defined) or the number of CPUs available.
|
||||
num_jobs = os.environ.get("MAX_JOBS", None)
|
||||
num_jobs = envs.MAX_JOBS
|
||||
if num_jobs is not None:
|
||||
num_jobs = int(num_jobs)
|
||||
logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
|
||||
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
|
||||
else:
|
||||
try:
|
||||
# os.sched_getaffinity() isn't universally available, so fall
|
||||
@ -78,11 +92,12 @@ class cmake_build_ext(build_ext):
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = os.getenv("NVCC_THREADS", None)
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
|
||||
" of nvcc threads.")
|
||||
logger.info(
|
||||
"Using NVCC_THREADS=%d as the number of nvcc threads.",
|
||||
nvcc_threads)
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
@ -103,7 +118,7 @@ class cmake_build_ext(build_ext):
|
||||
# Select the build type.
|
||||
# Note: optimization level + debug info are set by the build type
|
||||
default_cfg = "Debug" if self.debug else "RelWithDebInfo"
|
||||
cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
|
||||
cfg = envs.CMAKE_BUILD_TYPE or default_cfg
|
||||
|
||||
# where .so files will be written, should be the same for all extensions
|
||||
# that use the same CMakeLists.txt.
|
||||
@ -117,7 +132,7 @@ class cmake_build_ext(build_ext):
|
||||
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
|
||||
]
|
||||
|
||||
verbose = bool(int(os.getenv('VERBOSE', '0')))
|
||||
verbose = envs.VERBOSE
|
||||
if verbose:
|
||||
cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
|
||||
|
||||
@ -204,8 +219,7 @@ def _is_neuron() -> bool:
|
||||
subprocess.run(["neuron-ls"], capture_output=True, check=True)
|
||||
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
|
||||
torch_neuronx_installed = False
|
||||
return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
|
||||
False)
|
||||
return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON
|
||||
|
||||
|
||||
def _is_cpu() -> bool:
|
||||
@ -213,7 +227,7 @@ def _is_cpu() -> bool:
|
||||
|
||||
|
||||
def _install_punica() -> bool:
|
||||
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
|
||||
return envs.VLLM_INSTALL_PUNICA_KERNELS
|
||||
|
||||
|
||||
def get_hipcc_rocm_version():
|
||||
@ -376,7 +390,8 @@ if not _is_neuron():
|
||||
package_data = {
|
||||
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
|
||||
}
|
||||
if os.environ.get("VLLM_USE_PRECOMPILED"):
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
ext_modules = []
|
||||
package_data["vllm"].append("*.so")
|
||||
|
||||
setup(
|
||||
@ -402,12 +417,12 @@ setup(
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
],
|
||||
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
|
||||
"tests")),
|
||||
"tests*")),
|
||||
python_requires=">=3.8",
|
||||
install_requires=get_requirements(),
|
||||
ext_modules=ext_modules,
|
||||
extras_require={
|
||||
"tensorizer": ["tensorizer==2.9.0a1"],
|
||||
"tensorizer": ["tensorizer==2.9.0"],
|
||||
},
|
||||
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
|
||||
package_data=package_data,
|
||||
|
@ -91,4 +91,6 @@ async def test_new_requests_event():
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
|
||||
assert engine.get_model_config() is not None
|
||||
assert engine.get_tokenizer() is not None
|
||||
assert engine.get_decoding_config() is not None
|
||||
|
@ -60,12 +60,13 @@ class MockServingChat:
|
||||
tokenizer: MockTokenizer
|
||||
|
||||
|
||||
def test_load_chat_template():
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_chat_template():
|
||||
# Testing chatml template
|
||||
tokenizer = MockTokenizer()
|
||||
mock_serving_chat = MockServingChat(tokenizer)
|
||||
OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
chat_template=chatml_jinja_path)
|
||||
await OpenAIServingChat._load_chat_template(
|
||||
mock_serving_chat, chat_template=chatml_jinja_path)
|
||||
|
||||
template_content = tokenizer.chat_template
|
||||
|
||||
@ -76,7 +77,8 @@ def test_load_chat_template():
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
|
||||
|
||||
|
||||
def test_no_load_chat_template_filelike():
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_load_chat_template_filelike():
|
||||
# Testing chatml template
|
||||
template = "../../examples/does_not_exist"
|
||||
tokenizer = MockTokenizer()
|
||||
@ -84,17 +86,18 @@ def test_no_load_chat_template_filelike():
|
||||
mock_serving_chat = MockServingChat(tokenizer)
|
||||
|
||||
with pytest.raises(ValueError, match="looks like a file path"):
|
||||
OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
await OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
chat_template=template)
|
||||
|
||||
|
||||
def test_no_load_chat_template_literallike():
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_load_chat_template_literallike():
|
||||
# Testing chatml template
|
||||
template = "{{ messages }}"
|
||||
tokenizer = MockTokenizer()
|
||||
|
||||
mock_serving_chat = MockServingChat(tokenizer)
|
||||
OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
await OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
chat_template=template)
|
||||
template_content = tokenizer.chat_template
|
||||
|
||||
@ -110,7 +113,7 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
|
||||
# Initialize the tokenizer
|
||||
tokenizer = get_tokenizer(tokenizer_name=model)
|
||||
mock_serving_chat = MockServingChat(tokenizer)
|
||||
OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
await OpenAIServingChat._load_chat_template(mock_serving_chat,
|
||||
chat_template=template)
|
||||
|
||||
# Create a mock request object using keyword arguments
|
||||
|
41
tests/async_engine/test_merge_async_iterators.py
Normal file
41
tests/async_engine/test_merge_async_iterators.py
Normal file
@ -0,0 +1,41 @@
|
||||
import asyncio
|
||||
from typing import AsyncIterator, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import merge_async_iterators
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_merge_async_iterators():
|
||||
|
||||
async def mock_async_iterator(idx: int) -> AsyncIterator[str]:
|
||||
try:
|
||||
while True:
|
||||
yield f"item from iterator {idx}"
|
||||
await asyncio.sleep(0.1)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
iterators = [mock_async_iterator(i) for i in range(3)]
|
||||
merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators(
|
||||
*iterators)
|
||||
|
||||
async def stream_output(generator: AsyncIterator[Tuple[int, str]]):
|
||||
async for idx, output in generator:
|
||||
print(f"idx: {idx}, output: {output}")
|
||||
|
||||
task = asyncio.create_task(stream_output(merged_iterator))
|
||||
await asyncio.sleep(0.5)
|
||||
task.cancel()
|
||||
with pytest.raises(asyncio.CancelledError):
|
||||
await task
|
||||
|
||||
for iterator in iterators:
|
||||
try:
|
||||
await asyncio.wait_for(anext(iterator), 1)
|
||||
except StopAsyncIteration:
|
||||
# All iterators should be cancelled and print this message.
|
||||
print("Iterator was cancelled normally")
|
||||
except (Exception, asyncio.CancelledError) as e:
|
||||
raise AssertionError() from e
|
157
tests/async_engine/test_openapi_server_ray.py
Normal file
157
tests/async_engine/test_openapi_server_ray.py
Normal file
@ -0,0 +1,157 @@
|
||||
# imports for guided decoding tests
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
# using Ray for overall ease of process management, parallel requests,
|
||||
# and debugging.
|
||||
import ray
|
||||
import requests
|
||||
|
||||
MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class ServerRunner:
|
||||
|
||||
def __init__(self, args):
|
||||
env = os.environ.copy()
|
||||
env["PYTHONUNBUFFERED"] = "1"
|
||||
self.proc = subprocess.Popen(
|
||||
["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
|
||||
env=env,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stderr,
|
||||
)
|
||||
self._wait_for_server()
|
||||
|
||||
def ready(self):
|
||||
return True
|
||||
|
||||
def _wait_for_server(self):
|
||||
# run health check
|
||||
start = time.time()
|
||||
while True:
|
||||
try:
|
||||
if requests.get(
|
||||
"http://localhost:8000/health").status_code == 200:
|
||||
break
|
||||
except Exception as err:
|
||||
if self.proc.poll() is not None:
|
||||
raise RuntimeError("Server exited unexpectedly.") from err
|
||||
|
||||
time.sleep(0.5)
|
||||
if time.time() - start > MAX_SERVER_START_WAIT_S:
|
||||
raise RuntimeError(
|
||||
"Server failed to start in time.") from err
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, "proc"):
|
||||
self.proc.terminate()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def server():
|
||||
ray.init()
|
||||
server_runner = ServerRunner.remote([
|
||||
"--model",
|
||||
MODEL_NAME,
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--enforce-eager",
|
||||
"--engine-use-ray"
|
||||
])
|
||||
ray.get(server_runner.ready.remote())
|
||||
yield server_runner
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def client():
|
||||
client = openai.AsyncOpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
yield client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_models(server, client: openai.AsyncOpenAI):
|
||||
models = await client.models.list()
|
||||
models = models.data
|
||||
served_model = models[0]
|
||||
assert served_model.id == MODEL_NAME
|
||||
assert all(model.root == MODEL_NAME for model in models)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_completion(server, client: openai.AsyncOpenAI):
|
||||
completion = await client.completions.create(model=MODEL_NAME,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
assert completion.choices[0].text is not None and len(
|
||||
completion.choices[0].text) >= 5
|
||||
assert completion.choices[0].finish_reason == "length"
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=6, total_tokens=11)
|
||||
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert completion.choices[0].text is not None and len(
|
||||
completion.choices[0].text) >= 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
assert chat_completion.id is not None
|
||||
assert chat_completion.choices is not None and len(
|
||||
chat_completion.choices) == 1
|
||||
assert chat_completion.choices[0].message is not None
|
||||
assert chat_completion.choices[0].logprobs is not None
|
||||
assert chat_completion.choices[0].logprobs.top_logprobs is not None
|
||||
assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
@ -2,12 +2,15 @@
|
||||
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
]
|
||||
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@ -23,11 +26,18 @@ def test_models(
|
||||
max_tokens: int,
|
||||
enforce_eager: bool,
|
||||
) -> None:
|
||||
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
|
||||
if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
|
||||
pytest.skip("Skipping non-eager test for FlashInferBackend.")
|
||||
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
|
||||
vllm_model = vllm_runner(model,
|
||||
dtype=dtype,
|
||||
enforce_eager=enforce_eager,
|
||||
gpu_memory_utilization=0.7)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
del vllm_model
|
||||
|
||||
|
@ -55,7 +55,6 @@ def test_models(
|
||||
)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
del vllm_model
|
||||
print(vllm_outputs[0])
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, hf_output_str = hf_outputs[i]
|
||||
|
223
tests/basic_correctness/test_preemption.py
Normal file
223
tests/basic_correctness/test_preemption.py
Normal file
@ -0,0 +1,223 @@
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
|
||||
|
||||
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
|
||||
pytest tests/basic_correctness/test_preemption.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
|
||||
ENABLE_ARTIFICIAL_PREEMPT)
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
]
|
||||
|
||||
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
|
||||
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
|
||||
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
|
||||
"tests/basic_correctness/test_preemption.py`")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
|
||||
def test_chunked_prefill_recompute(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
chunked_prefill_token_size: int,
|
||||
) -> None:
|
||||
"""Ensure that chunked prefill works with preemption."""
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_seqs=max_num_seqs,
|
||||
)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
del vllm_model
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, hf_output_str = hf_outputs[i]
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
assert hf_output_str == vllm_output_str, (
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""By default, recompute preemption is enabled"""
|
||||
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
del vllm_model
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, hf_output_str = hf_outputs[i]
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
assert hf_output_str == vllm_output_str, (
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("beam_width", [4])
|
||||
def test_swap(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
beam_width: int,
|
||||
) -> None:
|
||||
"""Use beam search enables swapping."""
|
||||
example_prompts = example_prompts[:1]
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
|
||||
max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(model, dtype=dtype, swap_space=10)
|
||||
vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
|
||||
max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
del vllm_model
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, _ = hf_outputs[i]
|
||||
vllm_output_ids, _ = vllm_outputs[i]
|
||||
assert len(hf_output_ids) == len(vllm_output_ids)
|
||||
for j in range(len(hf_output_ids)):
|
||||
assert hf_output_ids[j] == vllm_output_ids[j], (
|
||||
f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
|
||||
f"vLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("beam_width", [4])
|
||||
def test_swap_infeasible(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
beam_width: int,
|
||||
) -> None:
|
||||
"""Verify infeasible swap request will be ignored."""
|
||||
BLOCK_SIZE = 16
|
||||
prefill_blocks = 2
|
||||
decode_blocks = max_tokens // BLOCK_SIZE
|
||||
example_prompts = example_prompts[:1]
|
||||
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
swap_space=10,
|
||||
block_size=BLOCK_SIZE,
|
||||
# Since beam search have more than 1 sequence, prefill + decode blocks
|
||||
# are not enough to finish.
|
||||
num_gpu_blocks_override=prefill_blocks + decode_blocks,
|
||||
max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
|
||||
)
|
||||
sampling_params = SamplingParams(n=beam_width,
|
||||
use_beam_search=True,
|
||||
temperature=0.0,
|
||||
max_tokens=max_tokens,
|
||||
ignore_eos=True)
|
||||
req_outputs = vllm_model.model.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
del vllm_model
|
||||
# Verify the request is ignored and does not hang.
|
||||
assert req_outputs[0].outputs[0].finish_reason == "length"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption_infeasible(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Verify infeasible preemption request will be ignored."""
|
||||
BLOCK_SIZE = 16
|
||||
prefill_blocks = 2
|
||||
decode_blocks = max_tokens // BLOCK_SIZE
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
block_size=BLOCK_SIZE,
|
||||
# Not enough gpu blocks to complete a single sequence.
|
||||
# preemption should happen, and the sequence should be
|
||||
# ignored instead of hanging forever.
|
||||
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
|
||||
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
|
||||
)
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
|
||||
req_outputs = vllm_model.model.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
del vllm_model
|
||||
# Verify the request is ignored and not hang.
|
||||
for req_output in req_outputs:
|
||||
outputs = req_output.outputs
|
||||
assert len(outputs) == 1
|
||||
assert outputs[0].finish_reason == "length"
|
@ -296,6 +296,7 @@ class VllmRunner:
|
||||
tensor_parallel_size: int = 1,
|
||||
block_size: int = 16,
|
||||
enable_chunked_prefill: bool = False,
|
||||
swap_space=4,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.model = LLM(
|
||||
@ -303,7 +304,7 @@ class VllmRunner:
|
||||
tokenizer=tokenizer_name,
|
||||
trust_remote_code=True,
|
||||
dtype=dtype,
|
||||
swap_space=0,
|
||||
swap_space=swap_space,
|
||||
disable_log_stats=disable_log_stats,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
max_model_len=max_model_len,
|
||||
|
@ -300,6 +300,152 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
|
||||
# Enable prefill cache
|
||||
"enable_prefix_caching": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"use_v2_block_manager": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
|
||||
baseline_llm_generator, test_llm_generator, batch_size):
|
||||
"""Verify block manager v2 produces same outputs as block manager v1, even
|
||||
when there is preemption.
|
||||
|
||||
This constructs two LLMs, each with a limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that the KV
|
||||
cache is not corrupted in the v2 block manager.
|
||||
|
||||
NOTE: We want a significant number of generated tokens so that any incorrect
|
||||
KV mapping has time to build up error.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids from block manager v1')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids from block manager v2')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
|
||||
# Test APC in v2 block
|
||||
"use_v2_block_manager": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"enable_prefix_caching": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify block manager v2 with auto prefix caching enabled produces same
|
||||
outputs as auto prefix caching disabled, even when there is preemption.
|
||||
|
||||
This constructs two LLMs, each with a limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that auto
|
||||
prefix caching itself at least doesn't cause errors in the results.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with APC disabled')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with APC enabled')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
|
||||
for llm in llm_generator:
|
||||
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
|
@ -358,6 +358,131 @@ class TestPrefixCachingBlockAllocator:
|
||||
i)
|
||||
allocator.free(block)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
|
||||
seed: int):
|
||||
"""Verify get_common_computed_block_ids could get correct result
|
||||
by creating two immutable chains that share a prefix at a specified position,
|
||||
and comparing whether we get the right result
|
||||
from get_common_computed_block_ids.
|
||||
"""
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = random.randint(1, num_blocks - 1)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
blocks = list(range(num_blocks_to_consume))
|
||||
|
||||
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# mark all blocks in first chain as computed
|
||||
allocator.mark_blocks_as_computed(blocks)
|
||||
|
||||
# After zero_point, second_chain's token_ids are set to -1, which
|
||||
# makes them differ from first_chain's token_ids.
|
||||
zero_point = random.randint(1, len(token_ids) - 1)
|
||||
zero_point_blocks = zero_point // block_size
|
||||
token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
|
||||
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
first_computed_ids = [
|
||||
first_chain[i].block_id for i in range(num_blocks_to_consume)
|
||||
]
|
||||
second_computed_ids = [
|
||||
second_chain[i].block_id for i in range(num_blocks_to_consume)
|
||||
]
|
||||
res = allocator.get_common_computed_block_ids(
|
||||
[first_computed_ids, second_computed_ids])
|
||||
|
||||
assert (len(res) == zero_point_blocks)
|
||||
|
||||
# Test case where two last accessed times are equal
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
|
||||
"""This test case simulate the two chain created and free in order,
|
||||
and together they exhaust the initially free blocks.
|
||||
|
||||
So the next block created after those two chains shall reuse a block
|
||||
from the first chain, as that block has the older access time.
|
||||
While the first chain has two blocks, it shall pick the last one, as
|
||||
it holds more tokens.
|
||||
"""
|
||||
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = num_blocks + 1
|
||||
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
|
||||
num_blocks_in_first_chain = 2
|
||||
num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
|
||||
# First chain takes the first block
|
||||
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[:num_tokens_in_first_chain],
|
||||
allocator=allocator,
|
||||
)
|
||||
# Only the first chain's blocks should be allocated at this point
|
||||
assert allocator.get_num_free_blocks() == (num_blocks -
|
||||
num_blocks_in_first_chain)
|
||||
|
||||
# Set the last accessed time of the first block to 1
|
||||
blocks_ids = [block.block_id for block in first_chain]
|
||||
allocator.mark_blocks_as_accessed(blocks_ids, 1)
|
||||
|
||||
# Second chain takes the rest of the blocks
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[num_tokens_in_first_chain:-block_size],
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# There shouldn't be any blocks left at this point
|
||||
assert allocator.get_num_free_blocks() == (0)
|
||||
|
||||
assert len(first_chain) == num_blocks_in_first_chain
|
||||
last_block_id = first_chain[-1].block_id
|
||||
# Free each block in the first chain.
|
||||
for i, block in enumerate(first_chain):
|
||||
allocator.free(block)
|
||||
|
||||
# Set the last accessed time on all of the blocks in the second chain
|
||||
# to 2
|
||||
blocks_ids = [block.block_id for block in second_chain]
|
||||
allocator.mark_blocks_as_accessed(blocks_ids, 2)
|
||||
|
||||
# Free each block in the second chain.
|
||||
for i, block in enumerate(second_chain):
|
||||
allocator.free(block)
|
||||
|
||||
# Allocate a new block and check that it's the least recently used block
|
||||
# from the first chain.
|
||||
new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[-block_size:],
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
assert new_block[0].block_id == last_block_id
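# Illustrative sketch (not part of this diff; attribute names are assumptions)
# of the eviction ordering exercised above: among freed blocks the evictor is
# assumed to prefer the oldest last-access time first and, on ties, the block
# holding more hashed tokens.
from collections import namedtuple

_Block = namedtuple("_Block", ["last_accessed", "num_hashed_tokens"])

def _pick_eviction_candidate(freed_blocks):
    # Smallest last_accessed wins; ties go to the larger hashed-token count.
    return min(freed_blocks,
               key=lambda b: (b.last_accessed, -b.num_hashed_tokens))

assert _pick_eviction_candidate(
    [_Block(2, 16), _Block(1, 16), _Block(1, 32)]) == _Block(1, 32)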
|
||||
|
||||
@staticmethod
|
||||
def create_immutable_chain(
|
||||
block_size: int,
|
||||
|
@@ -224,7 +224,7 @@ def test_swap():
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
cpu_blocks = block_manager.get_block_table(prompt)
|
||||
assert block_manager.can_swap_in(seq_group)
|
||||
assert block_manager.can_swap_in(seq_group) == AllocStatus.OK
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_in(seq_group)
|
||||
|
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import Logprob, SequenceGroup
|
||||
|
||||
@@ -410,7 +411,7 @@ def test_running_prefill_prioritized_over_swap():
|
||||
|
||||
# Add 1 more task. Swap is not possible, so prefill is running.
|
||||
scheduler.block_manager.can_swap_in = MagicMock()
|
||||
scheduler.block_manager.can_swap_in.return_value = False
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
|
||||
|
||||
_, seq_group2 = create_dummy_prompt("2", prompt_length=60)
|
||||
scheduler.add_seq_group(seq_group2)
|
||||
@@ -423,7 +424,7 @@ def test_running_prefill_prioritized_over_swap():
|
||||
assert out.scheduled_seq_groups[0].seq_group == seq_group2
|
||||
|
||||
# Now although swap is possible, running prefill is prioritized.
|
||||
scheduler.block_manager.can_swap_in.return_value = True
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(out.scheduled_seq_groups) == 1
|
||||
# 3 decodes. It is swapped in.
|
||||
|
@@ -791,7 +791,7 @@ def test_schedule_swapped_cannot_swap_in():
|
||||
|
||||
# The last request should be swapped out.
|
||||
scheduler.block_manager.can_swap_in = MagicMock()
|
||||
scheduler.block_manager.can_swap_in.return_value = False
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
|
||||
# Since we cannot swap in, none of the requests are swapped in.
|
||||
budget = create_token_budget()
|
||||
remaining_swapped, output = scheduler._schedule_swapped(
|
||||
@@ -803,6 +803,34 @@ def test_schedule_swapped_cannot_swap_in():
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_infeasible_swap():
|
||||
scheduler = initialize_scheduler()
|
||||
swapped = deque()
|
||||
policy = PolicyFactory.get_policy(policy_name="fcfs")
|
||||
curr_loras = None
|
||||
blocks_to_swap_out = {}
|
||||
for _ in range(2):
|
||||
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._swap_out(seq_group, blocks_to_swap_out)
|
||||
swapped.append(seq_group)
|
||||
|
||||
# The last request should be swapped out.
|
||||
scheduler.block_manager.can_swap_in = MagicMock()
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
|
||||
# Since we cannot swap in, none of the requests are swapped in.
|
||||
budget = create_token_budget()
|
||||
remaining_swapped, output = scheduler._schedule_swapped(
|
||||
swapped, budget, curr_loras, policy)
|
||||
assert len(remaining_swapped) == 0
|
||||
assert len(output.infeasible_seq_groups) == 2
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(output.decode_seq_groups) == 0
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_schedule_swapped_blocks_to_copy():
|
||||
scheduler = initialize_scheduler()
|
||||
swapped = deque()
|
||||
|
@@ -18,6 +18,7 @@ import torch
|
||||
MODELS = [
|
||||
os.environ["TEST_DIST_MODEL"],
|
||||
]
|
||||
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
@@ -33,16 +34,19 @@ def test_models(
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
enforce_eager = False
|
||||
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
|
||||
if backend_by_env_var == "FLASHINFER":
|
||||
enforce_eager = True
|
||||
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
vllm_model = vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
)
|
||||
enforce_eager=enforce_eager)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
del vllm_model
|
||||
|
||||
|
@@ -3,9 +3,13 @@ import multiprocessing
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
|
||||
ncclGetUniqueId)
|
||||
from vllm.distributed.parallel_state import init_distributed_environment
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group,
|
||||
init_distributed_environment, with_pynccl_for_all_reduce)
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
@@ -58,6 +62,65 @@ def test_pynccl():
|
||||
distributed_run(worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_tp_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
|
||||
comm = NCCLCommunicator(group=group, device=device)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
# two groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
comm.all_reduce(tensor)
|
||||
comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_tp():
|
||||
# this tests pynccl for multiple tp groups, in a standalone way
|
||||
# i.e. call `comm.all_reduce` directly
|
||||
distributed_run(multiple_tp_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_tp_with_vllm_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
torch.cuda.set_device(torch.distributed.get_rank())
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
pynccl_utils.init_process_group(
|
||||
group=get_tensor_model_parallel_cpu_group())
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with with_pynccl_for_all_reduce():
|
||||
# two tp groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_tp_with_vllm():
|
||||
# this tests pynccl for multiple tp groups, together with vllm
|
||||
# i.e. call `tensor_model_parallel_all_reduce`
|
||||
distributed_run(multiple_tp_with_vllm_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn_with_cudagraph():
|
||||
with torch.no_grad():
|
||||
|
tests/engine/test_multiproc_workers.py (new file, 176 lines)
@@ -0,0 +1,176 @@
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from time import sleep
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
ResultHandler, WorkerMonitor)
|
||||
|
||||
|
||||
class DummyWorker:
|
||||
"""Dummy version of vllm.worker.worker.Worker"""
|
||||
|
||||
def __init__(self, rank: int):
|
||||
self.rank = rank
|
||||
|
||||
def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
|
||||
sleep(0.05)
|
||||
|
||||
if isinstance(worker_input, Exception):
|
||||
# simulate error case
|
||||
raise worker_input
|
||||
|
||||
return self.rank, input
|
||||
|
||||
|
||||
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
|
||||
result_handler = ResultHandler()
|
||||
workers = [
|
||||
ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
|
||||
for rank in range(8)
|
||||
]
|
||||
|
||||
worker_monitor = WorkerMonitor(workers, result_handler)
|
||||
assert not worker_monitor.is_alive()
|
||||
|
||||
result_handler.start()
|
||||
worker_monitor.start()
|
||||
assert worker_monitor.is_alive()
|
||||
|
||||
return workers, worker_monitor
|
||||
|
||||
|
||||
def test_local_workers() -> None:
|
||||
"""Test workers with sync task submission"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
def execute_workers(worker_input: str) -> None:
|
||||
worker_outputs = [
|
||||
worker.execute_method("worker_method", worker_input)
|
||||
for worker in workers
|
||||
]
|
||||
|
||||
for rank, output in enumerate(worker_outputs):
|
||||
assert output.get() == (rank, input)
|
||||
|
||||
executor = ThreadPoolExecutor(max_workers=4)
|
||||
|
||||
# Test concurrent submission from different threads
|
||||
futures = [
|
||||
executor.submit(partial(execute_workers, f"thread {thread_num}"))
|
||||
for thread_num in range(4)
|
||||
]
|
||||
|
||||
for future in futures:
|
||||
future.result()
|
||||
|
||||
# Test error case
|
||||
exception = ValueError("fake error")
|
||||
result = workers[0].execute_method("worker_method", exception)
|
||||
try:
|
||||
result.get()
|
||||
pytest.fail("task should have failed")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ValueError)
|
||||
assert str(e) == "fake error"
|
||||
|
||||
# Test cleanup when a worker fails
|
||||
assert worker_monitor.is_alive()
|
||||
workers[3].process.kill()
|
||||
|
||||
# Other workers should get shut down here
|
||||
worker_monitor.join(2)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = workers[0].execute_method("worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
||||
|
||||
|
||||
def test_local_workers_clean_shutdown() -> None:
|
||||
"""Test clean shutdown"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
assert worker_monitor.is_alive()
|
||||
assert all(worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Clean shutdown
|
||||
worker_monitor.close()
|
||||
|
||||
worker_monitor.join(5)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = workers[0].execute_method("worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_local_workers_async() -> None:
|
||||
"""Test local workers with async task submission"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
async def execute_workers(worker_input: str) -> None:
|
||||
worker_coros = [
|
||||
worker.execute_method_async("worker_method", worker_input)
|
||||
for worker in workers
|
||||
]
|
||||
|
||||
results = await asyncio.gather(*worker_coros)
|
||||
for rank, result in enumerate(results):
|
||||
assert result == (rank, input)
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(execute_workers(f"task {task_num}"))
|
||||
for task_num in range(4)
|
||||
]
|
||||
|
||||
for task in tasks:
|
||||
await task
|
||||
|
||||
# Test error case
|
||||
exception = ValueError("fake error")
|
||||
try:
|
||||
_result = await workers[0].execute_method_async(
|
||||
"worker_method", exception)
|
||||
pytest.fail("task should have failed")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ValueError)
|
||||
assert str(e) == "fake error"
|
||||
|
||||
# Test cleanup when a worker fails
|
||||
assert worker_monitor.is_alive()
|
||||
workers[3].process.kill()
|
||||
|
||||
# Other workers should get shut down here
|
||||
worker_monitor.join(2)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = await workers[0].execute_method_async(
|
||||
"worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
tests/entrypoints/openai/test_serving_chat.py (new file, 37 lines)
@@ -0,0 +1,37 @@
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
CHAT_TEMPLATE = "Dummy chat template for testing {}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
tokenizer = MODEL_NAME
|
||||
trust_remote_code = False
|
||||
tokenizer_mode = "auto"
|
||||
max_model_len = 100
|
||||
tokenizer_revision = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockEngine:
|
||||
|
||||
async def get_model_config(self):
|
||||
return MockModelConfig
|
||||
|
||||
|
||||
async def _async_serving_chat_init():
|
||||
serving_completion = OpenAIServingChat(MockEngine(),
|
||||
served_model_names=[MODEL_NAME],
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE)
|
||||
return serving_completion
|
||||
|
||||
|
||||
def test_async_serving_chat_init():
|
||||
serving_completion = asyncio.run(_async_serving_chat_init())
|
||||
assert serving_completion.tokenizer is not None
|
||||
assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
|
@@ -57,7 +57,9 @@ def test_guided_logits_processors():
|
||||
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
|
||||
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
|
||||
regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
|
||||
json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer)
|
||||
json_LP = JSONLogitsProcessor(TEST_SCHEMA,
|
||||
tokenizer,
|
||||
whitespace_pattern=None)
|
||||
|
||||
regex_LP.init_state()
|
||||
token_ids = tokenizer.encode(
|
||||
|
@@ -13,8 +13,10 @@ import pytest
|
||||
# and debugging.
|
||||
import ray
|
||||
import requests
|
||||
import torch
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
from openai import BadRequestError
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
@@ -148,7 +150,7 @@ def server(zephyr_lora_files):
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@pytest.fixture(scope="module")
|
||||
def client():
|
||||
client = openai.AsyncOpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
@@ -770,6 +772,40 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
|
||||
assert loaded == {"result": 2}, loaded
|
||||
|
||||
|
||||
async def test_extra_fields(server, client: openai.AsyncOpenAI):
|
||||
with pytest.raises(BadRequestError) as exc_info:
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
"extra_field": "0",
|
||||
}], # type: ignore
|
||||
temperature=0,
|
||||
seed=0)
|
||||
|
||||
assert "extra_forbidden" in exc_info.value.message
|
||||
|
||||
|
||||
async def test_complex_message_content(server, client: openai.AsyncOpenAI):
|
||||
resp = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
"what is 1+1? please provide the result without any other text."
|
||||
}]
|
||||
}],
|
||||
temperature=0,
|
||||
seed=0)
|
||||
content = resp.choices[0].message.content
|
||||
assert content == "2"
|
||||
|
||||
|
||||
async def test_guided_grammar(server, client: openai.AsyncOpenAI):
|
||||
simple_sql_grammar = """
|
||||
start: select_statement
|
||||
@@ -835,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
|
||||
assert len(logprobs.tokens) > 5
|
||||
|
||||
|
||||
async def test_long_seed(server, client: openai.AsyncOpenAI):
|
||||
for seed in [
|
||||
torch.iinfo(torch.long).min - 1,
|
||||
torch.iinfo(torch.long).max + 1
|
||||
]:
|
||||
with pytest.raises(BadRequestError) as exc_info:
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
}],
|
||||
temperature=0,
|
||||
seed=seed)
|
||||
|
||||
assert ("greater_than_equal" in exc_info.value.message
|
||||
or "less_than_equal" in exc_info.value.message)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
|
@@ -1,8 +1,14 @@
|
||||
import pytest
|
||||
|
||||
from vllm.utils import create_kv_caches_with_random
|
||||
from vllm.utils import (create_kv_caches_with_random,
|
||||
create_kv_caches_with_random_flash)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def kv_cache_factory():
|
||||
return create_kv_caches_with_random
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def kv_cache_factory_flashinfer():
|
||||
return create_kv_caches_with_random_flash
|
||||
|
@@ -61,7 +61,7 @@ def ref_single_query_cached_kv_attention(
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
context_lens: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_slopes: Optional[torch.Tensor],
|
||||
) -> None:
|
||||
@@ -72,15 +72,15 @@ def ref_single_query_cached_kv_attention(
|
||||
num_seqs = query.shape[0]
|
||||
|
||||
block_tables = block_tables.cpu().tolist()
|
||||
context_lens = context_lens.cpu().tolist()
|
||||
seq_lens = seq_lens.cpu().tolist()
|
||||
for i in range(num_seqs):
|
||||
q = query[i].unsqueeze(0)
|
||||
block_table = block_tables[i]
|
||||
context_len = int(context_lens[i])
|
||||
seq_len = int(seq_lens[i])
|
||||
|
||||
keys = []
|
||||
values = []
|
||||
for j in range(context_len):
|
||||
for j in range(seq_len):
|
||||
block_number = int(block_table[j // block_size])
|
||||
block_offset = j % block_size
|
||||
|
||||
@@ -100,8 +100,8 @@ def ref_single_query_cached_kv_attention(
|
||||
alibi_bias = None
|
||||
if alibi_slopes is not None:
|
||||
# Create the ALiBi bias used in the paged attention kernel.
|
||||
position_ids = torch.arange(context_len).int()
|
||||
alibi_bias = (position_ids - context_len + 1).float()
|
||||
position_ids = torch.arange(seq_len).int()
|
||||
alibi_bias = (position_ids - seq_len + 1).float()
|
||||
alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
|
||||
1, 1, -1)
|
||||
|
||||
@@ -149,13 +149,13 @@ def test_paged_attention(
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
|
||||
|
||||
context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||
context_lens[-1] = MAX_SEQ_LEN
|
||||
max_context_len = max(context_lens)
|
||||
context_lens = torch.tensor(context_lens, dtype=torch.int)
|
||||
seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||
seq_lens[-1] = MAX_SEQ_LEN
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = torch.tensor(seq_lens, dtype=torch.int)
|
||||
|
||||
# Create the block tables.
|
||||
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
|
||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||
block_tables = []
|
||||
for _ in range(num_seqs):
|
||||
block_table = [
|
||||
@@ -186,16 +186,15 @@ def test_paged_attention(
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
context_lens,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_context_len,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
kv_scale,
|
||||
)
|
||||
elif version == "v2":
|
||||
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
|
||||
PARTITION_SIZE)
|
||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||
assert PARTITION_SIZE % block_size == 0
|
||||
num_seqs, num_heads, head_size = output.shape
|
||||
tmp_output = torch.empty(
|
||||
@@ -218,9 +217,9 @@ def test_paged_attention(
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
context_lens,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_context_len,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
kv_scale,
|
||||
@@ -255,7 +254,7 @@ def test_paged_attention(
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_tables,
|
||||
context_lens,
|
||||
seq_lens,
|
||||
scale,
|
||||
alibi_slopes,
|
||||
)
|
||||
|
@@ -5,6 +5,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm._C import cache_ops
|
||||
from vllm.utils import is_hip
|
||||
|
||||
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
||||
@@ -191,6 +192,82 @@ def test_reshape_and_cache(
|
||||
assert torch.allclose(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_reshape_and_cache_flash(
|
||||
kv_cache_factory_flashinfer,
|
||||
num_tokens: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8":
|
||||
pytest.skip()
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
torch.cuda.manual_seed(seed)
|
||||
|
||||
# Create a random slot mapping.
|
||||
num_slots = block_size * num_blocks
|
||||
slot_mapping = random.sample(range(num_slots), num_tokens)
|
||||
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device='cuda')
|
||||
|
||||
qkv = torch.randn(num_tokens,
|
||||
3,
|
||||
num_heads,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
_, key, value = qkv.unbind(dim=1)
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory_flashinfer(
|
||||
num_blocks,
|
||||
block_size,
|
||||
1,
|
||||
num_heads,
|
||||
head_size,
|
||||
kv_cache_dtype,
|
||||
dtype,
|
||||
)
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Clone the KV caches.
|
||||
cloned_key_cache = key_cache.clone()
|
||||
cloned_value_cache = value_cache.clone()
|
||||
|
||||
# Call the reshape_and_cache kernel.
|
||||
cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
||||
slot_mapping, kv_cache_dtype)
|
||||
|
||||
# Run the reference implementation.
|
||||
block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor')
|
||||
block_indicies = block_indicies.cpu().tolist()
|
||||
block_offsets = slot_mapping % block_size
|
||||
block_offsets = block_offsets.cpu().tolist()
|
||||
for i in range(num_tokens):
|
||||
block_idx = block_indicies[i]
|
||||
block_offset = block_offsets[i]
|
||||
cloned_key_cache[block_idx, block_offset, :, :] = key[i]
|
||||
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
|
||||
|
||||
assert torch.allclose(key_cache, cloned_key_cache)
|
||||
assert torch.allclose(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
|
@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
|
||||
for i in range(config.num_local_experts):
|
||||
weights = (hf_moe.experts[i].w1.weight.data,
|
||||
hf_moe.experts[i].w3.weight.data)
|
||||
vllm_moe.ws[i][:] = torch.cat(weights, dim=0)
|
||||
vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data
|
||||
vllm_moe.w13_weight[i][:] = torch.cat(weights, dim=0)
|
||||
vllm_moe.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
|
||||
|
||||
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
|
||||
hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
|
||||
|
@@ -15,6 +15,7 @@ DTYPES = [torch.float16]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@@ -22,11 +23,13 @@ CUDA_DEVICES = [
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention(
|
||||
num_heads: int,
|
||||
num_queries_per_kv: int,
|
||||
head_size: int,
|
||||
sliding_window: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
) -> None:
|
||||
@@ -48,12 +51,12 @@ def test_contexted_kv_attention(
|
||||
cache_size = 640
|
||||
block_size = 32
|
||||
max_block_per_request = 64
|
||||
subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
|
||||
query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
|
||||
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
|
||||
seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
|
||||
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
|
||||
num_kv_heads = num_heads // num_queries_per_kv
|
||||
|
||||
num_tokens = sum(subquery_lens)
|
||||
num_tokens = sum(query_lens)
|
||||
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-1e-3, 1e-3)
|
||||
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
@@ -72,15 +75,15 @@ def test_contexted_kv_attention(
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
values = torch.arange(0, cache_size, dtype=torch.long)
|
||||
values = values[torch.randperm(cache_size)]
|
||||
block_table = values[:BS * max_block_per_request].view(
|
||||
BS, max_block_per_request)
|
||||
b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
|
||||
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
|
||||
b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
|
||||
b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
max_input_len = MAX_SEQ_LEN
|
||||
@@ -89,7 +92,7 @@ def test_contexted_kv_attention(
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
for i in range(BS):
|
||||
for j in range(subquery_lens[i]):
|
||||
for j in range(query_lens[i]):
|
||||
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
|
||||
j])
|
||||
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
|
||||
@@ -123,12 +126,32 @@ def test_contexted_kv_attention(
|
||||
|
||||
# Warm up the Triton kernel by calling it once before actually measuring
|
||||
# generation time
|
||||
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
|
||||
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
|
||||
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
|
||||
@@ -155,7 +178,10 @@ def test_contexted_kv_attention(
|
||||
value = value.unsqueeze(0)
|
||||
|
||||
attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
||||
subquery_lens, seq_lens)
|
||||
query_lens, seq_lens)
|
||||
if sliding_window > 0:
|
||||
attn_bias = attn_bias.make_local_attention_from_bottomright(
|
||||
sliding_window)
|
||||
output_ref = xops.memory_efficient_attention_forward(
|
||||
query,
|
||||
key,
|
||||
|
@ -8,6 +8,10 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.fully_sharded_layers import (
|
||||
ColumnParallelLinearWithShardedLoRA,
|
||||
MergedColumnParallelLinearWithShardedLoRA,
|
||||
MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
@@ -524,13 +528,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
|
||||
@pytest.mark.parametrize("orientation", ["row", "column"])
|
||||
@pytest.mark.parametrize("fully_shard", [True, False])
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
|
||||
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
|
||||
device) -> None:
|
||||
|
||||
torch.set_default_device(device)
|
||||
max_loras = 8
|
||||
lora_config = LoRAConfig(max_loras=max_loras,
|
||||
max_lora_rank=8,
|
||||
fully_sharded_loras=fully_shard,
|
||||
lora_dtype=torch.float16)
|
||||
|
||||
def create_random_linear_parallel_layer():
|
||||
@@ -540,14 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
|
||||
bias=False,
|
||||
params_dtype=torch.float16)
|
||||
linear.weight.data = torch.rand_like(linear.weight.data)
|
||||
lora_linear = RowParallelLinearWithLoRA(linear)
|
||||
lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard
|
||||
else RowParallelLinearWithShardedLoRA(linear))
|
||||
else:
|
||||
linear = ColumnParallelLinear(4096,
|
||||
4096,
|
||||
bias=False,
|
||||
params_dtype=torch.float16)
|
||||
linear.weight.data = torch.rand_like(linear.weight.data)
|
||||
lora_linear = ColumnParallelLinearWithLoRA(linear)
|
||||
lora_linear = (ColumnParallelLinearWithLoRA(linear)
|
||||
if not fully_shard else
|
||||
ColumnParallelLinearWithShardedLoRA(linear))
|
||||
lora_linear.create_lora_weights(max_loras, lora_config)
|
||||
|
||||
return linear, lora_linear
|
||||
@@ -629,13 +639,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
|
||||
@pytest.mark.parametrize("repeats", [1, 2, 3])
|
||||
@pytest.mark.parametrize("fully_shard", [True, False])
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
|
||||
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
|
||||
device) -> None:
|
||||
|
||||
torch.set_default_device(device)
|
||||
max_loras = 8
|
||||
lora_config = LoRAConfig(max_loras=max_loras,
|
||||
max_lora_rank=8,
|
||||
fully_sharded_loras=fully_shard,
|
||||
lora_dtype=torch.float16)
|
||||
|
||||
def create_column_parallel_packed_layer():
|
||||
@@ -644,7 +657,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
|
||||
bias=False,
|
||||
params_dtype=torch.float16)
|
||||
linear.weight.data = torch.rand_like(linear.weight.data)
|
||||
lora_linear = MergedColumnParallelLinearWithLoRA(linear)
|
||||
lora_linear = (MergedColumnParallelLinearWithLoRA(linear)
|
||||
if not fully_shard else
|
||||
MergedColumnParallelLinearWithShardedLoRA(linear))
|
||||
elif repeats == 3:
|
||||
linear = QKVParallelLinear(4096,
|
||||
64,
|
||||
@@ -652,7 +667,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
|
||||
bias=False,
|
||||
params_dtype=torch.float16)
|
||||
linear.weight.data = torch.rand_like(linear.weight.data)
|
||||
lora_linear = MergedQKVParallelLinearWithLora(linear)
|
||||
lora_linear = (MergedQKVParallelLinearWithLora(linear)
|
||||
if not fully_shard else
|
||||
MergedQKVParallelLinearWithShardedLora(linear))
|
||||
else:
|
||||
linear = QKVParallelLinear(4096,
|
||||
64,
|
||||
|
@ -34,11 +34,14 @@ def _lora_ref_impl(
|
||||
for i, lora_idx in zip(range(bs), indicies.cpu().tolist()):
|
||||
xi = x[i].unsqueeze(0).to(torch.float32)
|
||||
wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
|
||||
wb = wb_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
|
||||
if wb_T_all is not None:
|
||||
wb = wb_T_all[lora_idx, layer_idx].transpose(-1,
|
||||
-2).to(torch.float32)
|
||||
|
||||
tmp = xi @ wa
|
||||
y_stage_1[i] = tmp.squeeze(0)
|
||||
y_final[i] += (tmp @ wb).squeeze(0) * s
|
||||
y_final[i] += ((tmp @ wb).squeeze(0) *
|
||||
s if wb_T_all is not None else y_stage_1[i])
|
||||
return y_final, y_stage_1
|
||||
|
||||
|
||||
@@ -91,12 +94,56 @@ H1 = H2 = [
|
||||
128000,
|
||||
128256,
|
||||
]
|
||||
H2 = [64] + H2
|
||||
R = [1, 2, 4]
|
||||
SEED = [0xabcdabcd987]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
|
||||
@pytest.mark.parametrize("h1", H1)
|
||||
@pytest.mark.parametrize("r", R)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
@torch.inference_mode()
|
||||
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
|
||||
torch.manual_seed(seed)
|
||||
num_loras = 4
|
||||
num_layers = 1
|
||||
bs = 32
|
||||
dtype = getattr(torch, dtype_str)
|
||||
device = torch.device("cuda")
|
||||
|
||||
wa_T_all = torch.randn(num_loras,
|
||||
num_layers,
|
||||
r,
|
||||
h1,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)
|
||||
|
||||
for layer_idx in range(num_layers):
|
||||
x = torch.randn(bs, h1, dtype=dtype, device=device)
|
||||
y = torch.randn(bs, r, dtype=dtype, device=device)
|
||||
|
||||
y_ref = y.clone()
|
||||
_lora_ref_impl(
|
||||
y_ref,
|
||||
x,
|
||||
wa_T_all,
|
||||
None,
|
||||
indices,
|
||||
layer_idx,
|
||||
1.0,
|
||||
)
|
||||
|
||||
y_our = y.clone()
|
||||
punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0)
|
||||
|
||||
assert_close(y_ref, y_our)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
|
||||
@pytest.mark.parametrize("h1", H1)
|
||||
@pytest.mark.parametrize("h2", H2)
|
||||
|
@ -1,4 +1,12 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
from vllm import EngineArgs, LLMEngine
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@@ -68,3 +76,119 @@ def test_metric_counter_generation_tokens(
|
||||
assert vllm_generation_count == metric_count, (
|
||||
f"generation token count: {vllm_generation_count!r}\n"
|
||||
f"metric: {metric_count!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize(
|
||||
"served_model_name",
|
||||
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
|
||||
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
|
||||
served_model_name: List[str]) -> None:
|
||||
vllm_model = vllm_runner(model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.3,
|
||||
served_model_name=served_model_name)
|
||||
stat_logger = vllm_model.model.llm_engine.stat_logger
|
||||
metrics_tag_content = stat_logger.labels["model_name"]
|
||||
|
||||
del vllm_model
|
||||
|
||||
if served_model_name is None or served_model_name == []:
|
||||
assert metrics_tag_content == model, (
|
||||
f"Metrics tag model_name is wrong! expect: {model!r}\n"
|
||||
f"actual: {metrics_tag_content!r}")
|
||||
else:
|
||||
assert metrics_tag_content == served_model_name[0], (
|
||||
f"Metrics tag model_name is wrong! expect: "
|
||||
f"{served_model_name[0]!r}\n"
|
||||
f"actual: {metrics_tag_content!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("disable_log_stats", [True, False])
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_engine_log_metrics_regression(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
disable_log_stats: bool,
|
||||
) -> None:
|
||||
"""
|
||||
Regression test ensuring async engine generates metrics
|
||||
when disable_log_stats=False
|
||||
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
|
||||
"""
|
||||
engine_args = AsyncEngineArgs(model=model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=disable_log_stats)
|
||||
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
for i, prompt in enumerate(example_prompts):
|
||||
results = async_engine.generate(
|
||||
prompt,
|
||||
SamplingParams(max_tokens=max_tokens),
|
||||
f"request-id-{i}",
|
||||
)
|
||||
# Exhaust the async iterator to make the async engine work
|
||||
async for _ in results:
|
||||
pass
|
||||
|
||||
assert_metrics(async_engine.engine, disable_log_stats,
|
||||
len(example_prompts))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("disable_log_stats", [True, False])
|
||||
def test_engine_log_metrics_regression(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
disable_log_stats: bool,
|
||||
) -> None:
|
||||
engine_args = EngineArgs(model=model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=disable_log_stats)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
for i, prompt in enumerate(example_prompts):
|
||||
engine.add_request(
|
||||
f"request-id-{i}",
|
||||
prompt,
|
||||
SamplingParams(max_tokens=max_tokens),
|
||||
)
|
||||
while engine.has_unfinished_requests():
|
||||
engine.step()
|
||||
|
||||
assert_metrics(engine, disable_log_stats, len(example_prompts))
|
||||
|
||||
|
||||
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
|
||||
num_requests: int) -> None:
|
||||
if disable_log_stats:
|
||||
with pytest.raises(AttributeError):
|
||||
_ = engine.stat_logger
|
||||
else:
|
||||
assert (engine.stat_logger
|
||||
is not None), "engine.stat_logger should be set"
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
labels = {'model_name': engine.model_config.model}
|
||||
request_histogram_metrics = [
|
||||
"vllm:e2e_request_latency_seconds",
|
||||
"vllm:request_prompt_tokens",
|
||||
"vllm:request_generation_tokens",
|
||||
"vllm:request_params_best_of",
|
||||
"vllm:request_params_n",
|
||||
]
|
||||
for metric_name in request_histogram_metrics:
|
||||
metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
|
||||
labels)
|
||||
assert (
|
||||
metric_value == num_requests), "Metrics should be collected"
|
||||
|
@@ -1,9 +1,12 @@
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import huggingface_hub.constants
|
||||
import pytest
|
||||
from huggingface_hub.utils import LocalEntryNotFoundError
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf, enable_hf_transfer)
|
||||
|
||||
|
||||
def test_hf_transfer_auto_activation():
|
||||
@@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation():
|
||||
HF_TRANFER_ACTIVE)
|
||||
|
||||
|
||||
def test_download_weights_from_hf():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# assert LocalEntryNotFoundError is raised
|
||||
# if offline is set and model is not cached
|
||||
huggingface_hub.constants.HF_HUB_OFFLINE = True
|
||||
with pytest.raises(LocalEntryNotFoundError):
|
||||
download_weights_from_hf("facebook/opt-125m",
|
||||
allow_patterns=["*.safetensors", "*.bin"],
|
||||
cache_dir=tmpdir)
|
||||
|
||||
# download the model
|
||||
huggingface_hub.constants.HF_HUB_OFFLINE = False
|
||||
download_weights_from_hf("facebook/opt-125m",
|
||||
allow_patterns=["*.safetensors", "*.bin"],
|
||||
cache_dir=tmpdir)
|
||||
|
||||
# now it should work offline
|
||||
huggingface_hub.constants.HF_HUB_OFFLINE = True
|
||||
assert download_weights_from_hf(
|
||||
"facebook/opt-125m",
|
||||
allow_patterns=["*.safetensors", "*.bin"],
|
||||
cache_dir=tmpdir) is not None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_hf_transfer_auto_activation()
|
||||
test_download_weights_from_hf()
|
||||
|
@ -43,3 +43,18 @@ def test_models(
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_model_print(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
vllm_model = vllm_runner(model, dtype=dtype)
|
||||
# This test is for verifying whether the model's extra_repr
|
||||
# can be printed correctly.
|
||||
print(vllm_model.model.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
del vllm_model
|
||||
|
tests/models/test_fp8.py (new file, 90 lines)
@@ -0,0 +1,90 @@
|
||||
# flake8: noqa
|
||||
"""Tests fp8 models against ground truth generation
|
||||
Note: these tests will only pass on L4 GPU.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
MODELS = [
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8",
|
||||
"meta-llama/Meta-Llama-3-8B-Instruct",
|
||||
]
|
||||
|
||||
EXPECTED_STRS_MAP = {
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
|
||||
'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
|
||||
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
|
||||
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
|
||||
'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
|
||||
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
|
||||
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||
'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
|
||||
],
|
||||
"meta-llama/Meta-Llama-3-8B-Instruct": [
|
||||
'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
|
||||
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
|
||||
'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
|
||||
'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
|
||||
'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
|
||||
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
|
||||
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
|
||||
'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
|
||||
],
|
||||
}
|
||||
|
||||
capability = torch.cuda.get_device_capability()
|
||||
capability = capability[0] * 10 + capability[1]
|
||||
fp8_not_supported = (capability <
|
||||
QUANTIZATION_METHODS["fp8"].get_min_capability())
|
||||
|
||||
|
||||
@pytest.mark.skipif(fp8_not_supported,
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
def test_models(
|
||||
example_prompts,
|
||||
model_name,
|
||||
) -> None:
|
||||
model = LLM(model=model_name,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
enforce_eager=True,
|
||||
quantization="fp8")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
formatted_prompts = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
for prompt in example_prompts
|
||||
]
|
||||
|
||||
params = SamplingParams(max_tokens=20, temperature=0)
|
||||
generations = []
|
||||
# Note: these need to be run 1 at a time due to numerical precision,
|
||||
# since the expected strs were generated this way.
|
||||
for prompt in formatted_prompts:
|
||||
outputs = model.generate(prompt, params)
|
||||
generations.append(outputs[0].outputs[0].text)
|
||||
del model
|
||||
|
||||
print(generations)
|
||||
expected_strs = EXPECTED_STRS_MAP[model_name]
|
||||
for i in range(len(example_prompts)):
|
||||
generated_str = generations[i]
|
||||
expected_str = expected_strs[i]
|
||||
assert expected_str == generated_str, (
|
||||
f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
tests/models/test_gptq_marlin.py (new file, 98 lines)
@@ -0,0 +1,98 @@
|
||||
"""Compares the outputs of gptq vs gptq_marlin
|
||||
Note: GPTQ and Marlin do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 3 selections of each other.
|
||||
Note: Marlin internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for Marlin. As a result, we re-run the test
|
||||
up to 3 times to see if we pass.
|
||||
Note: This test currently fails running with --forked with the following:
|
||||
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
|
||||
To use CUDA with multiprocessing, you must use the 'spawn' start method
|
||||
Run `pytest tests/models/test_gptq_marlin.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
capability = torch.cuda.get_device_capability()
|
||||
capability = capability[0] * 10 + capability[1]
|
||||
gptq_marlin_not_supported = (
|
||||
capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
|
||||
|
||||
MODELS = [
|
||||
# act_order==False, group_size=channelwise
|
||||
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
|
||||
# act_order==False, group_size=128
|
||||
("TheBloke/Llama-2-7B-GPTQ", "main"),
|
||||
|
||||
# act_order==True, group_size=128
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
|
||||
# act_order==True, group_size=64
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
|
||||
# act_order==True, group_size=32
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
|
||||
|
||||
# 8-bit, act_order==True, group_size=channelwise
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
|
||||
# 8-bit, act_order==True, group_size=128
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
|
||||
# 8-bit, act_order==True, group_size=32
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
|
||||
@pytest.mark.skipif(gptq_marlin_not_supported,
|
||||
reason="gptq_marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
model_name, revision = model
|
||||
|
||||
# Run marlin.
|
||||
gptq_marlin_model = vllm_runner(model_name=model_name,
|
||||
revision=revision,
|
||||
dtype=dtype,
|
||||
quantization="marlin",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1)
|
||||
|
||||
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
del gptq_marlin_model
|
||||
|
||||
# Run gptq.
|
||||
gptq_model = vllm_runner(model_name=model_name,
|
||||
revision=revision,
|
||||
dtype=dtype,
|
||||
quantization="gptq",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1)
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens,
|
||||
num_logprobs)
|
||||
del gptq_model
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
outputs_1_lst=gptq_marlin_outputs,
|
||||
name_0="gptq",
|
||||
name_1="gptq_marlin",
|
||||
)
|
@@ -10,12 +10,12 @@ up to 3 times to see if we pass.
|
||||
|
||||
Run `pytest tests/models/test_marlin.py`.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
|
||||
capability = torch.cuda.get_device_capability()
|
||||
@@ -55,43 +55,24 @@ def test_models(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype)
|
||||
marlin_model = vllm_runner(model_pair.model_marlin,
|
||||
dtype=dtype,
|
||||
quantization="marlin")
|
||||
marlin_outputs = marlin_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Note: not sure why, but deleting just the model on Ada Lovelace
|
||||
# does not free the GPU memory. On Ampere, deleting just the model
|
||||
# frees the memory.
|
||||
del marlin_model
|
||||
|
||||
gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype)
|
||||
gptq_model = vllm_runner(model_pair.model_gptq,
|
||||
dtype=dtype,
|
||||
quantization="gptq")
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens,
|
||||
num_logprobs)
|
||||
|
||||
# Note: not sure why, but deleting just the model on Ada Lovelace
|
||||
# does not free the GPU memory. On Ampere, deleting just the model
|
||||
# frees the memory.
|
||||
del gptq_model
|
||||
|
||||
# loop through the prompts
|
||||
for prompt_idx in range(len(example_prompts)):
|
||||
gptq_output_ids, gptq_output_str, gptq_logprobs = gptq_outputs[
|
||||
prompt_idx]
|
||||
marlin_output_ids, marlin_output_str, marlin_logprobs = marlin_outputs[
|
||||
prompt_idx]
|
||||
|
||||
for idx, (gptq_output_id, marlin_output_id) in enumerate(
|
||||
zip(gptq_output_ids, marlin_output_ids)):
|
||||
# If sequence is not an exact match,
|
||||
if marlin_output_id != gptq_output_id:
|
||||
# Each predicted token must be in top 5 of the other's
|
||||
assert gptq_output_id in marlin_logprobs[idx], (
|
||||
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
|
||||
f"Marlin:\t{marlin_output_str!r}")
|
||||
assert marlin_output_id in gptq_logprobs[idx], (
|
||||
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
|
||||
f"Marlin:\t{marlin_output_str!r}")
|
||||
|
||||
# Break out since sequences will now diverge.
|
||||
break
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
outputs_1_lst=marlin_outputs,
|
||||
name_0="gptq",
|
||||
name_1="marlin",
|
||||
)
|
||||
|
@@ -49,3 +49,18 @@ def test_models(
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
def test_model_print(
|
||||
vllm_runner,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
vllm_model = vllm_runner(model, dtype=dtype)
|
||||
# This test is for verifying whether the model's extra_repr
|
||||
# can be printed correctly.
|
||||
print(vllm_model.model.llm_engine.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
del vllm_model
|
||||
|
tests/models/utils.py (new file, 29 lines)
@@ -0,0 +1,29 @@
|
||||
def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
|
||||
"""Compare the logprobs of two sequences generated by different models,
|
||||
which should be similar but not necessarily equal.
|
||||
"""
|
||||
# Loop through responses to each prompt.
|
||||
for prompt_idx, (outputs_0,
|
||||
outputs_1) in enumerate(zip(outputs_0_lst,
|
||||
outputs_1_lst)):
|
||||
output_ids_0, output_str_0, logprobs_0 = outputs_0
|
||||
output_ids_1, output_str_1, logprobs_1 = outputs_1
|
||||
|
||||
# Loop through generated tokens.
|
||||
for idx, (output_id_0,
|
||||
output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
|
||||
|
||||
# If generated tokens don't match, then
|
||||
if output_id_0 != output_id_1:
|
||||
# Each predicted token must be in top N logprobs of the other
|
||||
assert output_id_0 in logprobs_1[idx], (
|
||||
f"Test{prompt_idx}:"
|
||||
f"\n{name_0}:\t{output_str_0!r}"
|
||||
f"\n{name_1}:\t{output_str_1!r}")
|
||||
assert output_id_1 in logprobs_0[idx], (
|
||||
f"Test{prompt_idx}:"
|
||||
f"\n{name_0}:\t{output_str_0!r}"
|
||||
f"\n{name_1}:\t{output_str_1!r}")
|
||||
|
||||
# Break out since sequences will now diverge.
|
||||
break
|
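A minimal usage sketch of the helper above, with hand-made outputs in the (token_ids, text, logprobs) shape the tests unpack; the token ids and top-N candidate sets below are invented purely for illustration:

# logprobs[idx] only needs to support `in`, i.e. the top-N candidate token
# ids at position idx (in the real tests it is typically a dict keyed by
# token id).
outputs_a = [([1, 2, 3], "a b c", [{1}, {2, 9}, {3}])]
outputs_b = [([1, 9, 7], "a z q", [{1}, {9, 2}, {7, 3}])]

# Position 1 differs (2 vs. 9), but each model's pick appears in the other's
# top-N candidates, so the check passes and the comparison stops there.
check_logprobs_close(outputs_0_lst=outputs_a,
                     outputs_1_lst=outputs_b,
                     name_0="model_a",
                     name_1="model_b")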
@ -1,64 +0,0 @@
"""Tests whether Marlin models can be loaded from the autogptq config.

Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
"""

from dataclasses import dataclass

import pytest

from vllm.config import ModelConfig


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


# Model Id // Expected Kernel
MODELS_QUANT_TYPE = [
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq")
]


@pytest.mark.parametrize("model_quant_type", MODELS_QUANT_TYPE)
def test_auto_gptq(model_quant_type: str, ) -> None:
    model_path, quant_type = model_quant_type

    model_config_no_quant_arg = ModelConfig(
        model_path,
        model_path,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="float16",
        revision=None,
        quantization=None  # case 1
    )

    model_config_quant_arg = ModelConfig(
        model_path,
        model_path,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="float16",
        revision=None,
        quantization="gptq"  # case 2
    )

    assert model_config_no_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_no_quant_arg.quantization} "
        "for no --quantization None case")

    assert model_config_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_quant_arg.quantization} "
        "for --quantization gptq case")
73
tests/quantization/test_configs.py
Normal file
@ -0,0 +1,73 @@
"""Tests whether Marlin models can be loaded from the autogptq config.

Run `pytest tests/quantization/test_configs.py --forked`.
"""

from dataclasses import dataclass

import pytest

from vllm.config import ModelConfig


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


# Model Id // Quantization Arg // Expected Type
MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),

    # AUTOAWQ
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
]


@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: str) -> None:
    model_path, quantization_arg, expected_type = model_arg_exptype

    try:
        model_config = ModelConfig(model_path,
                                   model_path,
                                   tokenizer_mode="auto",
                                   trust_remote_code=False,
                                   seed=0,
                                   dtype="float16",
                                   revision=None,
                                   quantization=quantization_arg)
        found_quantization_type = model_config.quantization
    except ValueError:
        found_quantization_type = "ERROR"

    assert found_quantization_type == expected_type, (
        f"Expected quant_type == {expected_type} for {model_path}, "
        f"but found {found_quantization_type} "
        f"for --quantization {quantization_arg} case")
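The expected-type column above encodes a resolution rule: the quantization method serialized in the checkpoint takes precedence, GPTQ checkpoints are upgraded to the marlin/gptq_marlin kernels unless the user explicitly pins plain gptq, and an incompatible --quantization argument is an error. A standalone sketch of that rule as implied by the table (the helper name is made up, any hardware-capability checks the real code may apply are ignored, and this is not vLLM's implementation):

from typing import Optional


def resolve_quant_type(serialized: str, quant_arg: Optional[str]) -> str:
    """Toy restatement of MODEL_ARG_EXPTYPES; illustration only."""
    if serialized == "marlin":
        # Marlin-format checkpoints always run the marlin kernel.
        return "marlin" if quant_arg in (None, "marlin", "gptq") else "ERROR"
    if serialized == "gptq":
        # Exllama-format GPTQ checkpoints are upgraded to gptq_marlin
        # unless the user explicitly pins plain gptq.
        if quant_arg in (None, "marlin"):
            return "gptq_marlin"
        return "gptq" if quant_arg == "gptq" else "ERROR"
    if serialized == "awq":
        return "awq" if quant_arg in (None, "awq") else "ERROR"
    return "ERROR"


assert resolve_quant_type("gptq", None) == "gptq_marlin"
assert resolve_quant_type("marlin", "gptq") == "marlin"
assert resolve_quant_type("awq", "marlin") == "ERROR"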
@ -20,5 +20,5 @@ def test_load_fp16_model(vllm_runner) -> None:

    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
    fc1 = model.model.decoder.layers[0].fc1
    assert isinstance(fc1.linear_method, Fp8LinearMethod)
    assert isinstance(fc1.quant_method, Fp8LinearMethod)
    assert fc1.weight.dtype == torch.float8_e4m3fn
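As a side note on the dtype assertion above: torch.float8_e4m3fn is a native dtype in recent PyTorch releases, so the check can be reproduced in isolation. A tiny sketch (assumes a PyTorch build with fp8 support; not tied to vLLM):

import torch

# Cast a weight tensor to fp8 and verify the storage dtype.
w = torch.randn(4, 4).to(torch.float8_e4m3fn)
assert w.dtype == torch.float8_e4m3fn

# fp8 tensors are typically upcast again before compute.
w_fp32 = w.to(torch.float32)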
Some files were not shown because too many files have changed in this diff