Compare commits

..

22 Commits

Author SHA1 Message Date
7097f31955 test
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-15 03:22:32 -08:00
f840b53063 fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-15 03:07:17 -08:00
1ca4298b9b Fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-01 18:44:21 -08:00
ba64a0249f Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-01 18:42:22 -08:00
1260e43230 Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-01-01 03:16:56 -08:00
a6e5d7b5b7 Merge branch 'main' into v1-blocktable-opt 2025-01-01 03:10:50 -08:00
ebfbe1244b ruff
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 20:06:53 -08:00
6ba31aa5f6 Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 19:03:59 -08:00
34d6cc2aea Merge branch 'main' into v1-blocktable-opt 2024-12-26 18:52:19 -08:00
27e8eb2e94 Add kernel test
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 11:23:52 -08:00
ca4f9e69a8 minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 11:13:41 -08:00
52922193cd Add test for uva
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 11:00:19 -08:00
bef68163a0 Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-26 10:48:29 -08:00
ff5b1033dc Merge branch 'main' into v1-blocktable-opt 2024-12-26 10:12:17 -08:00
b938606993 Merge branch 'main' into v1-blocktable-opt 2024-12-25 15:49:02 -08:00
3fdbd8e2f5 comments
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-22 22:39:03 -08:00
0420fb2c7b Merge branch 'main' into v1-blocktable-opt 2024-12-22 22:16:22 -08:00
ee965c9c69 Use default
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-22 22:16:12 -08:00
0a669eed7b Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-21 17:39:13 -08:00
03b1e6fdbd Minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-21 17:28:21 -08:00
8a4180c8b6 yapf
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-21 17:11:00 -08:00
1aaced5830 wip
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-12-21 17:07:46 -08:00
278 changed files with 5073 additions and 7977 deletions

View File

@ -1,6 +1,5 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
@ -11,11 +10,12 @@ steps:
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
plugins:
- kubernetes:
podSpec:
@ -49,7 +49,6 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -74,7 +73,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
depends_on: ~
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

View File

@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
@ -46,26 +46,26 @@ function cpu_tests() {
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1

View File

@ -3,18 +3,6 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e
set -v
image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@ -25,30 +13,41 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker image prune -f
docker system prune -f
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f Dockerfile.neuron .
docker build -t neuron -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

View File

@ -242,7 +242,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
@ -363,14 +363,12 @@ steps:
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
@ -535,7 +533,6 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- label: Weight Loading Multiple GPU Test # 33min

View File

@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

View File

@ -193,6 +193,7 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
@ -200,6 +201,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/prepare_inputs/copy_subranges.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -550,7 +552,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -234,8 +234,8 @@ RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
@ -247,14 +247,5 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################

View File

@ -15,8 +15,8 @@ RUN apt-get update && \
ffmpeg libsm6 libxext6 libgl1
### Mount Point ###
# When launching the container, mount the code directory to /workspace
ARG APP_MOUNT=/workspace
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
@ -25,7 +25,6 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
COPY . .
ARG GIT_REPO_CHECK=0
@ -43,7 +42,4 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
CMD ["/bin/bash"]

View File

@ -77,7 +77,7 @@ pip install vllm
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
## Contributing

View File

@ -4,7 +4,7 @@
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
---

View File

@ -53,7 +53,7 @@ void paged_attention_v1_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
@ -176,7 +176,7 @@ void paged_attention_v1(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

View File

@ -54,7 +54,7 @@ void paged_attention_v2_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
const int blocksparse_vert_stride, const int blocksparse_block_size,
const int blocksparse_head_sliding_step) {
@ -187,7 +187,7 @@ void paged_attention_v2(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& seq_lens, // [num_seqs]
int64_t block_size, int64_t max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

View File

@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
const std::optional<torch::Tensor>& alibi_slopes) {
const c10::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -459,7 +459,7 @@ void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@ -781,7 +781,7 @@ void paged_attention_v2(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

View File

@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& b, // [IC, OC], column-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const std::optional<torch::Tensor>& bias // [OC]
const c10::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
// Checks for conformality
@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const torch::Tensor& azp_adj, // [OC]
const std::optional<torch::Tensor>& azp, // [1] or [M]
const std::optional<torch::Tensor>& bias // [OC]
const c10::optional<torch::Tensor>& azp, // [1] or [M]
const c10::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
// Checks for conformality
@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
const torch::Tensor& scale,
std::optional<torch::Tensor> const& azp) {
c10::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
torch::Tensor& scale, // [..., 1]
std::optional<torch::Tensor> const& azp) {
c10::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());

View File

@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids);
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const std::optional<torch::Tensor>& bias);
const c10::optional<torch::Tensor>& bias);
void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const torch::Tensor& azp_adj,
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& bias);
const c10::optional<torch::Tensor>& azp,
const c10::optional<torch::Tensor>& bias);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops

View File

@ -47,3 +47,11 @@
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif
// #ifndef USE_ROCM
// #define VLLM_cudaHostGetDevicePointer(device_ptr, host_ptr, flags) \
// cudaHostGetDevicePointer(device_ptr, host_ptr, flags)
// #else
// #define VLLM_cudaHostGetDevicePointer(device_ptr, host_ptr, flags) \
// hipHostGetDevicePointer(device_ptr, host_ptr, flags)
// #endif

csrc/cuda_view.cu (new file, 43 lines)
View File

@ -0,0 +1,43 @@
#include <torch/all.h>
#include <torch/cuda.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
TORCH_CHECK(cpu_tensor.is_contiguous(), "Input tensor must be contiguous");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// Construct a CUDA tensor from the device pointer.
// We'll use the same sizes, strides, and dtype as the CPU tensor.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options =
cpu_tensor.options().device(torch::kCUDA); // Change device to CUDA
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
TORCH_CHECK(cuda_tensor.sizes().equals(sizes), "Size mismatch");
TORCH_CHECK(cuda_tensor.strides().equals(strides), "Stride mismatch");
TORCH_CHECK(cuda_tensor.dtype() == cpu_tensor.dtype(), "Dtype mismatch");
return cuda_tensor;
}
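For reference, a minimal C++ usage sketch of the helper above. It is not part of this diff; the function name and signature come from the new csrc/cuda_view.cu, while the wrapper function and tensor shapes are illustrative assumptions. The key requirement is that the CPU tensor is allocated in pinned memory so cudaHostGetDevicePointer can map it.
#include <torch/all.h>
// Declared in the new csrc/cuda_view.cu above.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
void example_uva_view() {
  // Pinned (page-locked) host memory is required for the UVA mapping.
  auto cpu_tensor = torch::zeros(
      {4, 8}, torch::TensorOptions().dtype(torch::kInt32).pinned_memory(true));
  torch::Tensor gpu_view = get_cuda_view_from_cpu_tensor(cpu_tensor);
  // The view aliases the pinned host buffer, so host-side writes become
  // visible to kernels that read gpu_view without an explicit copy.
  cpu_tensor.fill_(42);
  TORCH_CHECK(gpu_view.device().is_cuda());
  TORCH_CHECK(gpu_view.sizes().equals(cpu_tensor.sizes()));
}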

View File

@ -68,7 +68,7 @@ struct ScaledEpilogueBase {
// This overload handles the case where there might not be a tensor, in which
// case a nullptr is passed and a constant (0) is used.
template <typename Descriptor, typename T>
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
using Arguments = typename Descriptor::Arguments;
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
torch::Tensor const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);

View File

@ -67,7 +67,7 @@ struct ScaledEpilogueBase {
// This overload handles the case where there might not be a tensor, in which
// case a nullptr is passed and a constant (0) is used.
template <typename Descriptor, typename T>
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
using Arguments = typename Descriptor::Arguments;
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
torch::Tensor const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);

View File

@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
template <typename Stride>
static inline auto maybe_make_cute_layout(
std::optional<torch::Tensor> const& tensor,
c10::optional<torch::Tensor> const& tensor,
std::string_view name = "tensor") {
using Layout = decltype(make_cute_layout<Stride>(*tensor));

View File

@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase &params,
const at::Tensor x,
const at::Tensor weight,
const at::Tensor out,
const std::optional<at::Tensor>& bias,
const c10::optional<at::Tensor>& bias,
bool silu_activation,
int64_t pad_slot_id,
const std::optional<at::Tensor>& query_start_loc = std::nullopt,
const std::optional<at::Tensor>& cache_indices = std::nullopt,
const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
const c10::optional<at::Tensor>& cache_indices = std::nullopt,
const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
// Reset the parameters
memset(&params, 0, sizeof(params));
@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase &params,
void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
const std::optional<at::Tensor> &bias_,
const std::optional<at::Tensor> &conv_states,
const std::optional<at::Tensor> &query_start_loc,
const std::optional<at::Tensor> &cache_indices,
const std::optional<at::Tensor> &has_initial_state,
const c10::optional<at::Tensor> &bias_,
const c10::optional<at::Tensor> &conv_states,
const c10::optional<at::Tensor> &query_start_loc,
const c10::optional<at::Tensor> &cache_indices,
const c10::optional<at::Tensor> &has_initial_state,
bool silu_activation,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early
@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
void causal_conv1d_update(const at::Tensor &x,
const at::Tensor &conv_state,
const at::Tensor &weight,
const std::optional<at::Tensor> &bias_,
const c10::optional<at::Tensor> &bias_,
bool silu_activation,
const std::optional<at::Tensor> &cache_seqlens_,
const std::optional<at::Tensor> &conv_state_indices_,
const c10::optional<at::Tensor> &cache_seqlens_,
const c10::optional<at::Tensor> &conv_state_indices_,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early
int64_t pad_slot_id) {

View File

@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
const torch::Tensor out,
const torch::Tensor z,
const torch::Tensor out_z,
const std::optional<at::Tensor>& D,
const std::optional<at::Tensor>& delta_bias,
const c10::optional<at::Tensor>& D,
const c10::optional<at::Tensor>& delta_bias,
const torch::Tensor ssm_states,
bool has_z,
bool delta_softplus,
const std::optional<at::Tensor>& query_start_loc,
const std::optional<at::Tensor>& cache_indices,
const std::optional<at::Tensor>& has_initial_state,
const c10::optional<at::Tensor>& query_start_loc,
const c10::optional<at::Tensor>& cache_indices,
const c10::optional<at::Tensor>& has_initial_state,
bool varlen,
int64_t pad_slot_id) {
@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase &params,
void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
const std::optional<torch::Tensor> &D_,
const std::optional<torch::Tensor> &z_,
const std::optional<torch::Tensor> &delta_bias_,
const c10::optional<torch::Tensor> &D_,
const c10::optional<torch::Tensor> &z_,
const c10::optional<torch::Tensor> &delta_bias_,
bool delta_softplus,
const std::optional<torch::Tensor> &query_start_loc,
const std::optional<torch::Tensor> &cache_indices,
const std::optional<torch::Tensor> &has_initial_state,
const c10::optional<torch::Tensor> &query_start_loc,
const c10::optional<torch::Tensor> &cache_indices,
const c10::optional<torch::Tensor> &has_initial_state,
const torch::Tensor &ssm_states,
// used to identify padding entries if cache_indices provided
// in case of padding, the kernel will return early

View File

@ -33,7 +33,7 @@ void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@ -44,7 +44,7 @@ void paged_attention_v2(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale,
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
@ -115,6 +115,11 @@ void advance_step_flashinfer(
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
torch::Tensor& matrix_tgt, int64_t n);
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,
@ -153,15 +158,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);
@ -169,7 +174,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& e,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
torch::Tensor& e, torch::Tensor const& a);
@ -177,11 +182,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor const& scale,
std::optional<torch::Tensor> const& azp);
c10::optional<torch::Tensor> const& azp);
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scales,
std::optional<torch::Tensor> const& azp);
c10::optional<torch::Tensor> const& azp);
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
@ -198,34 +203,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
void dynamic_per_token_scaled_fp8_quant(
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
std::optional<torch::Tensor> const& scale_ub);
c10::optional<torch::Tensor> const& scale_ub);
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const torch::Tensor& A, const torch::Tensor& B,
const torch::Tensor& C,
const std::optional<torch::Tensor>& D_,
const std::optional<torch::Tensor>& z_,
const std::optional<torch::Tensor>& delta_bias_,
const c10::optional<torch::Tensor>& D_,
const c10::optional<torch::Tensor>& z_,
const c10::optional<torch::Tensor>& delta_bias_,
bool delta_softplus,
const std::optional<torch::Tensor>& query_start_loc,
const std::optional<torch::Tensor>& cache_indices,
const std::optional<torch::Tensor>& has_initial_state,
const c10::optional<torch::Tensor>& query_start_loc,
const c10::optional<torch::Tensor>& cache_indices,
const c10::optional<torch::Tensor>& has_initial_state,
const torch::Tensor& ssm_states, int64_t pad_slot_id);
void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
const at::Tensor& weight,
const std::optional<at::Tensor>& bias_,
const c10::optional<at::Tensor>& bias_,
bool silu_activation,
const std::optional<at::Tensor>& cache_seqlens_,
const std::optional<at::Tensor>& conv_state_indices_,
const c10::optional<at::Tensor>& cache_seqlens_,
const c10::optional<at::Tensor>& conv_state_indices_,
int64_t pad_slot_id);
void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
const std::optional<at::Tensor>& bias_,
const std::optional<at::Tensor>& conv_states,
const std::optional<at::Tensor>& query_start_loc,
const std::optional<at::Tensor>& cache_indices,
const std::optional<at::Tensor>& has_initial_state,
const c10::optional<at::Tensor>& bias_,
const c10::optional<at::Tensor>& conv_states,
const c10::optional<at::Tensor>& query_start_loc,
const c10::optional<at::Tensor>& cache_indices,
const c10::optional<at::Tensor>& has_initial_state,
bool silu_activation, int64_t pad_slot_id);
#ifndef USE_ROCM

View File

@ -0,0 +1,75 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
namespace vllm {
__global__ void copy_subranges_kernel(const int* __restrict__ matrix_src,
const int* __restrict__ matrix_diff,
int* __restrict__ matrix_tgt, int64_t M) {
int row_id = blockIdx.x;
int row_offset = row_id * M;
int start = matrix_diff[row_id * 2];
int length = matrix_diff[row_id * 2 + 1];
int end = start + length;
int thread_idx = threadIdx.x;
for (int i = start + thread_idx; i < end; i += blockDim.x) {
int idx = row_offset + i;
matrix_tgt[idx] = matrix_src[idx];
}
}
} // namespace vllm
void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
torch::Tensor& matrix_tgt, int64_t n) {
// NOTE(woosuk): Here, we skip most of the error checking to minimize the
// CPU overheads. We assume that the caller will pass the correct inputs.
// Check tensor properties
// TORCH_CHECK(matrix_src.is_cuda(), "matrix_src must be a CUDA tensor");
// TORCH_CHECK(matrix_diff.is_cuda(), "matrix_diff must be a CUDA tensor");
// TORCH_CHECK(matrix_tgt.is_cuda(), "matrix_tgt must be a CUDA tensor");
// TORCH_CHECK(matrix_src.is_contiguous(), "matrix_src must be contiguous");
// TORCH_CHECK(matrix_diff.is_contiguous(), "matrix_diff must be contiguous");
// TORCH_CHECK(matrix_tgt.is_contiguous(), "matrix_tgt must be contiguous");
auto src_sizes = matrix_src.sizes();
auto diff_sizes = matrix_diff.sizes();
auto tgt_sizes = matrix_tgt.sizes();
// TORCH_CHECK(src_sizes.size() == 2, "matrix_src must be 2D");
// TORCH_CHECK(diff_sizes.size() == 2, "matrix_diff must be 2D");
// TORCH_CHECK(tgt_sizes.size() == 2, "matrix_tgt must be 2D");
int64_t N = src_sizes[0];
int64_t M = src_sizes[1];
// TORCH_CHECK(diff_sizes[0] == N, "matrix_diff first dim must match N");
// TORCH_CHECK(diff_sizes[1] == 2, "matrix_diff second dim must be 2");
// TORCH_CHECK(tgt_sizes[0] == N && tgt_sizes[1] == M,
// "matrix_tgt must have same shape as matrix_src");
// TORCH_CHECK(n <= N, "n must be <= N");
const int* d_matrix_src = matrix_src.data_ptr<int>();
const int* d_matrix_diff = matrix_diff.data_ptr<int>();
int* d_matrix_tgt = matrix_tgt.data_ptr<int>();
// One thread block per row.
int blocks = n;
int threads;
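// Use wider thread blocks when there are few rows so total parallelism
// stays high; with many rows, smaller blocks are sufficient.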
if (blocks < 128) {
threads = 1024;
} else if (blocks < 256) {
threads = 512;
} else if (blocks < 512) {
threads = 256;
} else {
threads = 128;
}
const at::cuda::OptionalCUDAGuard device_guard(device_of(matrix_tgt));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::copy_subranges_kernel<<<blocks, threads, 0, stream>>>(
d_matrix_src, d_matrix_diff, d_matrix_tgt, M);
}
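For reference, a hypothetical caller sketch for copy_subranges (not part of this diff). The declaration matches csrc/ops.h in this change; the tensor shapes are illustrative. matrix_src and matrix_tgt are [N, M] int32 CUDA tensors, matrix_diff is [N, 2] holding a (start, length) pair per row, and only the first n rows are processed.
#include <torch/all.h>
// Declared in csrc/ops.h in this diff.
void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
                    torch::Tensor& matrix_tgt, int64_t n);
void example_copy_subranges() {
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  auto src = torch::arange(4 * 16, opts).reshape({4, 16});
  auto tgt = torch::zeros({4, 16}, opts);
  // Per-row (start, length): row 0 copies columns [2, 7), row 1 copies the
  // whole row, row 2 copies [8, 12), and row 3 copies nothing.
  auto diff = torch::tensor({2, 5, 0, 16, 8, 4, 0, 0}, opts).reshape({4, 2});
  copy_subranges(src, diff, tgt, /*n=*/4);
}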

View File

@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
torch::Tensor const& input, // [..., hidden_size]
torch::Tensor const& scale,
std::optional<torch::Tensor> const& azp) {
c10::optional<torch::Tensor> const& azp) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
torch::Tensor const& input, // [..., hidden_size]
torch::Tensor& scales, std::optional<torch::Tensor> const& azp) {
torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scales.is_contiguous());

View File

@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

View File

@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {
@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

View File

@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
#endif
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias);
#endif
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b, torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
// Checks for conformality
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& azp_adj,
std::optional<torch::Tensor> const& azp,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& azp,
c10::optional<torch::Tensor> const& bias) {
// Checks for conformality
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&

View File

@ -834,7 +834,6 @@ __global__ void Marlin(
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
int4* sh_s = sh_zp + (stages * zp_sh_stage);
int4* sh_red = sh_s + (stages * s_sh_stage);
// Register storage for double buffer of shared memory reads.
FragA frag_a[2][thread_m_blocks];
@ -933,11 +932,11 @@ __global__ void Marlin(
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
if constexpr (group_blocks >= thread_k_blocks) {
if (s_sh_wr_pred) {
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
}
// Only fetch scales if this tile starts a new group
if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
if (pipe % (group_blocks / thread_k_blocks) == 0) {
if (s_sh_wr_pred) {
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
}
s_gl_rd += s_gl_rd_delta;
}
} else {
@ -1039,7 +1038,9 @@ __global__ void Marlin(
// No act-order case
if constexpr (group_blocks != -1) {
if constexpr (group_blocks >= thread_k_blocks) {
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
int4* sh_s_stage =
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
(pipe / (group_blocks / thread_k_blocks)));
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
} else {
int warp_id = threadIdx.x / 32;
@ -1338,15 +1339,15 @@ __global__ void Marlin(
int red_sh_wr =
red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
if (i < red_off) {
float* c_rd = reinterpret_cast<float*>(
&sh_red[red_sh_delta * j + red_sh_rd]);
float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
float* c_rd =
reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
#pragma unroll
for (int k = 0; k < 4; k++)
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
c_rd[k] + c_wr[k];
}
sh_red[red_sh_wr] =
sh[red_sh_wr] =
reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
}
}
@ -1356,7 +1357,7 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < 4 * 2; i++) {
float* c_rd =
reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
#pragma unroll
for (int j = 0; j < 4; j++)
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
@ -1396,7 +1397,7 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < thread_m_blocks * 4; i++) {
cp_async4_pred(
&sh_red[c_sh_wr + c_sh_wr_delta * i],
&sh[c_sh_wr + c_sh_wr_delta * i],
&C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
c_gl_wr_delta_i * (i % 2)],
i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
@ -1409,7 +1410,7 @@ __global__ void Marlin(
for (int i = 0; i < thread_m_blocks * 4; i++) {
if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
if (!first) {
int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
#pragma unroll
for (int j = 0; j < 2 * 4; j++) {
reinterpret_cast<float*>(
@ -1460,10 +1461,10 @@ __global__ void Marlin(
float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
#pragma unroll
for (int k = 0; k < th_size; k++) {
sh_red[threadIdx.x] =
sh[threadIdx.x] =
C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
#pragma unroll
for (int f = 0; f < 4; f++) {
frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
@ -1514,7 +1515,7 @@ __global__ void Marlin(
res = __hmul2(res, s[0]);
}
((scalar_t2*)sh_red)[idx] = res;
((scalar_t2*)sh)[idx] = res;
};
if (threadIdx.x / 32 < thread_n_blocks / 4) {
@ -1542,7 +1543,7 @@ __global__ void Marlin(
i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
i++) {
if (c_gl_wr < c_gl_wr_end) {
C[c_gl_wr] = sh_red[c_sh_rd];
C[c_gl_wr] = sh[c_sh_rd];
c_gl_wr += c_gl_wr_delta;
c_sh_rd += c_sh_rd_delta;
}
@ -1864,12 +1865,9 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
float pipe_size = (a_size + b_size) * pipe_stages;
float reduce_size = max(th_config.num_threads * 32 * 4,
(tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
}
bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,

View File

@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
static inline std::optional<at::ScalarType> maybe_scalartype(
std::optional<at::Tensor> const& t) {
c10::optional<at::Tensor> const& t) {
if (!t) {
return std::nullopt;
} else {

View File

@ -183,11 +183,11 @@ struct MacheteKernelTemplate {
torch::Tensor const& A, // MxK matrix
torch::Tensor const& B, // KxN prepacked matrix
torch::Tensor& D, // MxN matrix
std::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
std::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
std::optional<int64_t> maybe_group_size,
std::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
std::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
c10::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
c10::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
c10::optional<int64_t> maybe_group_size,
c10::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
c10::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
{
static_assert(!with_group_zeropoints || with_group_scales);

View File

@ -13,23 +13,23 @@ struct MMArgs {
torch::Tensor const& A;
torch::Tensor const& B;
vllm::ScalarType const& b_type;
std::optional<at::ScalarType> const& maybe_out_type;
std::optional<torch::Tensor> const& maybe_group_scales;
std::optional<torch::Tensor> const& maybe_group_zeros;
std::optional<int64_t> maybe_group_size;
std::optional<torch::Tensor> const& maybe_channel_scales;
std::optional<torch::Tensor> const& maybe_token_scales;
std::optional<std::string> maybe_schedule;
c10::optional<at::ScalarType> const& maybe_out_type;
c10::optional<torch::Tensor> const& maybe_group_scales;
c10::optional<torch::Tensor> const& maybe_group_zeros;
c10::optional<int64_t> maybe_group_size;
c10::optional<torch::Tensor> const& maybe_channel_scales;
c10::optional<torch::Tensor> const& maybe_token_scales;
c10::optional<std::string> maybe_schedule;
};
struct SupportedSchedulesArgs {
at::ScalarType a_type;
vllm::ScalarType b_type;
std::optional<at::ScalarType> maybe_group_scales_type;
std::optional<at::ScalarType> maybe_group_zeros_type;
std::optional<at::ScalarType> maybe_channel_scales_type;
std::optional<at::ScalarType> maybe_token_scales_type;
std::optional<at::ScalarType> maybe_out_type;
c10::optional<at::ScalarType> maybe_group_scales_type;
c10::optional<at::ScalarType> maybe_group_zeros_type;
c10::optional<at::ScalarType> maybe_channel_scales_type;
c10::optional<at::ScalarType> maybe_token_scales_type;
c10::optional<at::ScalarType> maybe_out_type;
};
torch::Tensor mm_dispatch(MMArgs args);

View File

@ -10,7 +10,7 @@ struct PrepackBArgs {
torch::Tensor const& B;
at::ScalarType a_type;
vllm::ScalarType b_type;
std::optional<at::ScalarType> maybe_group_scales_type;
c10::optional<at::ScalarType> maybe_group_scales_type;
};
template <typename PrepackedLayoutB>

View File

@ -10,11 +10,11 @@ using namespace vllm;
std::vector<std::string> supported_schedules(
at::ScalarType a_type, int64_t b_type_id,
std::optional<at::ScalarType> maybe_group_scales_type,
std::optional<at::ScalarType> maybe_group_zeros_type,
std::optional<at::ScalarType> maybe_channel_scales_type,
std::optional<at::ScalarType> maybe_token_scales_type,
std::optional<at::ScalarType> maybe_out_type) {
c10::optional<at::ScalarType> maybe_group_scales_type,
c10::optional<at::ScalarType> maybe_group_zeros_type,
c10::optional<at::ScalarType> maybe_channel_scales_type,
c10::optional<at::ScalarType> maybe_token_scales_type,
c10::optional<at::ScalarType> maybe_out_type) {
ScalarType const b_type = ScalarType::from_id(b_type_id);
return supported_schedules_dispatch({
.a_type = a_type,
@ -29,13 +29,13 @@ std::vector<std::string> supported_schedules(
torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
int64_t b_type_id,
std::optional<at::ScalarType> const& maybe_out_type,
std::optional<torch::Tensor> const& maybe_group_scales,
std::optional<torch::Tensor> const& maybe_group_zeros,
std::optional<int64_t> maybe_group_size,
std::optional<torch::Tensor> const& maybe_channel_scales,
std::optional<torch::Tensor> const& maybe_token_scales,
std::optional<std::string> maybe_schedule) {
c10::optional<at::ScalarType> const& maybe_out_type,
c10::optional<torch::Tensor> const& maybe_group_scales,
c10::optional<torch::Tensor> const& maybe_group_zeros,
c10::optional<int64_t> maybe_group_size,
c10::optional<torch::Tensor> const& maybe_channel_scales,
c10::optional<torch::Tensor> const& maybe_token_scales,
c10::optional<std::string> maybe_schedule) {
ScalarType const b_type = ScalarType::from_id(b_type_id);
return mm_dispatch({.A = A,
.B = B,
@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
torch::Tensor prepack_B(
torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
std::optional<at::ScalarType> const& maybe_group_scales_type) {
c10::optional<at::ScalarType> const& maybe_group_scales_type) {
ScalarType const b_type = ScalarType::from_id(b_type_id);
return prepack_B_dispatch(
{.B = B,

View File

@ -928,7 +928,7 @@ void paged_attention_custom_launcher(
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, const int num_kv_heads, float scale,
torch::Tensor& block_tables, torch::Tensor& context_lens,
int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
int max_context_len, const c10::optional<torch::Tensor>& alibi_slopes,
float k_scale, float v_scale) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
@ -1086,7 +1086,7 @@ void paged_attention(
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
torch::Tensor& context_lens, // [num_seqs]
int64_t block_size, int64_t max_context_len,
const std::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale, double v_scale) {
const int head_size = query.size(2);
if (kv_cache_dtype == "auto") {

View File

@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
double scale, torch::Tensor& block_tables,
torch::Tensor& context_lens, int64_t block_size,
int64_t max_context_len,
const std::optional<torch::Tensor>& alibi_slopes,
const c10::optional<torch::Tensor>& alibi_slopes,
const std::string& kv_cache_dtype, double k_scale,
double v_scale);

View File

@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& bt_meta,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
if (bias) {

View File

@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& e,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias);
c10::optional<torch::Tensor> const& bias);
#endif
void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& bt_meta,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
std::optional<torch::Tensor> const& bias) {
c10::optional<torch::Tensor> const& bias) {
// Checks for conformality
TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&

View File

@ -21,6 +21,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
&get_cuda_view_from_cpu_tensor);
// Attention ops
// Compute the attention between an input query and the cached
// keys/values using PagedAttention.
@ -98,6 +102,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
ops.def(
"copy_subranges(Tensor matrix_src, Tensor matrix_diff, Tensor! "
"matrix_tgt, int n) -> ()");
ops.impl("copy_subranges", torch::kCUDA, &copy_subranges);
// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(

View File

Image file changed (102 KiB before and after).

View File

Image file changed (173 KiB before and after).

View File

@ -1,13 +1,13 @@
(automatic-prefix-caching)=
(apc)=
# Automatic Prefix Caching
# Introduction
## Introduction
## What is Automatic Prefix Caching
Automatic Prefix Caching (APC for short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
```{note}
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
Technical details on how vLLM implements APC are in the next page.
```
## Enabling APC in vLLM
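For illustration, here is a minimal sketch of enabling APC in an offline script by passing `enable_prefix_caching=True` to the `LLM` constructor; the model name is just an example:

```python
from vllm import LLM, SamplingParams

# Enable automatic prefix caching so repeated prompt prefixes reuse cached KV blocks.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", enable_prefix_caching=True)

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
shared_prefix = "You are a helpful assistant. Here is a long document: ..."

# The second prompt shares its prefix with the first, so its prefill
# can skip recomputing the KV cache of the shared part.
outputs = llm.generate(
    [shared_prefix + " Question 1?", shared_prefix + " Question 2?"],
    sampling_params,
)
```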

View File

@ -1,8 +1,6 @@
(design-automatic-prefix-caching)=
# Implementation
# Automatic Prefix Caching
The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
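As a rough sketch of this observation (illustrative only, not vLLM's actual code), each block's identity can be folded into a running hash of the prefix plus the block's own tokens:

```python
from typing import List, Optional, Tuple

def block_hash(prefix_hash: Optional[int], block_tokens: Tuple[int, ...]) -> int:
    # A block is identified by everything before it (folded into prefix_hash)
    # together with the tokens it contains.
    return hash((prefix_hash, block_tokens))

def hash_full_blocks(token_ids: List[int], block_size: int = 16) -> List[int]:
    hashes: List[int] = []
    prefix_hash: Optional[int] = None
    num_full_blocks = len(token_ids) // block_size
    for i in range(num_full_blocks):
        block = tuple(token_ids[i * block_size:(i + 1) * block_size])
        prefix_hash = block_hash(prefix_hash, block)
        hashes.append(prefix_hash)
    return hashes
```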

View File

@ -1,7 +1,7 @@
# Dockerfile
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](#deployment-docker).
More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

View File

@ -1,115 +0,0 @@
(new-model-basic)=
# Basic Implementation
This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code
First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
```{warning}
Make sure to review and adhere to the original code's copyright and licensing terms!
```
## 2. Make your code compatible with vLLM
To ensure compatibility with vLLM, your model must meet the following requirements:
### Initialization Code
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.attn = Attention(prefix=f"{prefix}.attn")
class MyDecoderLayer(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
class MyModel(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.layers = nn.ModuleList(
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyModelForCausalLM(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```
### Computation Code
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
```python
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
...
```
```{note}
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
```
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
## 3. (Optional) Implement tensor parallelism and quantization support
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:
- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. It also handles the weight loading and replication of the weight matrices.
Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
## 4. Implement the weight loading logic
You now need to implement the `load_weights` method in your `*ForCausalLM` class.
This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
## 5. Register your model
See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
## Frequently Asked Questions
### How to support models with interleaving sliding windows?
For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.
To support a model with interleaving sliding windows, we need to take care of the following details:
- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
With these two steps, interleaved sliding windows should work with the model.
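A hedged sketch of the second step, assuming a hypothetical pattern where even-numbered layers use a sliding window (the real pattern is model-specific):

```python
from torch import nn
from vllm.attention import Attention
from vllm.config import VllmConfig

class MyAttention(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str, layer_idx: int):
        super().__init__()
        hf_config = vllm_config.model_config.hf_config
        # Hypothetical: even layers use sliding-window attention, odd layers use
        # full attention. Parse whatever pattern your model actually defines.
        sliding_window = (getattr(hf_config, "interleaved_sliding_window", None)
                          if layer_idx % 2 == 0 else None)
        self.attn = Attention(
            per_layer_sliding_window=sliding_window,
            prefix=f"{prefix}.attn",
        )
```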

View File

@ -1,26 +0,0 @@
(new-model)=
# Adding a New Model
This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
```{toctree}
:caption: Contents
:maxdepth: 1
basic
registration
multimodal
```
```{note}
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
```
```{tip}
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
or ask on our [developer slack](https://slack.vllm.ai).
We will be happy to help you out!
```

View File

@ -1,56 +0,0 @@
(new-model-registration)=
# Model Registration
vLLM relies on a model registry to determine how to run each model.
A list of pre-registered architectures can be found [here](#supported-models).
If your model is not on this list, you must register it to vLLM.
This page provides detailed instructions on how to do so.
## Built-in models
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
This gives you the ability to modify the codebase and test your model.
After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
Finally, update our [list of supported models](#supported-models) to promote your model!
```{important}
The list of models in each section should be maintained in alphabetical order.
```
## Out-of-tree models
You can load an external model using a plugin without modifying the vLLM codebase.
```{seealso}
[vLLM's Plugin System](#plugin-system)
```
To register the model, use the following code:
```python
from vllm import ModelRegistry
from your_code import YourModelForCausalLM
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
```
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
```python
from vllm import ModelRegistry
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
```
```{important}
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
Read more about that [here](#enabling-multimodal-inputs).
```
```{note}
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
```

View File

@ -1,43 +0,0 @@
# Vulnerability Management
## Reporting Vulnerabilities
As mentioned in the [security
policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
vulnerabilities may be reported privately to the project via
[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
## Vulnerability Management Team
Once a vulnerability has been reported to the project, the Vulnerability
Management Team (VMT) is responsible for managing the vulnerability. The VMT is
responsible for:
- Triaging the vulnerability.
- Coordinating with reporters and project maintainers on vulnerability analysis
and resolution.
- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
- Coordination with project maintainers on a coordinated release of the fix and
security advisory.
### Security Advisories
Advisories are published via GitHub through the same system used to report
vulnerabilities. More information on the process can be found in the [GitHub
documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
### Team Members
We prefer to keep all vulnerability-related communication on the security report
on GitHub. However, if you need to contact the VMT directly for an urgent issue,
you may contact the following individuals:
- Simon Mo - simon.mo@hey.com
- Russell Bryant - rbryant@redhat.com
## Slack Discussion
You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately.

View File

@ -1,13 +0,0 @@
# Using other frameworks
```{toctree}
:maxdepth: 1
bentoml
cerebrium
dstack
helm
lws
skypilot
triton
```

View File

@ -1,9 +0,0 @@
# External Integrations
```{toctree}
:maxdepth: 1
kserve
kubeai
llamastack
```

View File

@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
### OpenAI-Compatible API Server
### OpenAI-compatible API server
The second primary interface to vLLM is via its OpenAI-compatible API server.
This server can be started using the `vllm serve` command.
@ -77,7 +77,8 @@ python -m vllm.entrypoints.openai.api_server --model <model>
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
More details on the API server can be found in the {doc}`OpenAI Compatible
Server </serving/openai_compatible_server>` document.
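For context, a minimal client-side sketch against a locally running server (assuming the default port 8000 and that the served model name matches):

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Llama-2-7b-chat-hf",  # must match the model being served
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```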
## LLM Engine

View File

@ -1,5 +1,3 @@
(design-paged-attention)=
# vLLM Paged Attention
- Currently, vLLM utilizes its own implementation of a multi-head query

View File

@ -2,7 +2,7 @@
## Debugging
Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
Please see the [Debugging Tips](#debugging-python-multiprocessing)
page for information on known issues and how to solve them.
## Introduction

View File

@ -1,7 +1,6 @@
# Offline Inference
```{toctree}
:caption: Contents
:maxdepth: 1
llm

View File

@ -1,19 +0,0 @@
(quantization-index)=
# Quantization
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
```{toctree}
:caption: Contents
:maxdepth: 1
supported_hardware
auto_awq
bnb
gguf
int8
fp8
fp8_e5m2_kvcache
fp8_e4m3_kvcache
```

View File

@ -1,6 +1,6 @@
(installation-rocm)=
# Installation for ROCm
# Installation with ROCm
vLLM supports AMD GPUs with ROCm 6.2.
@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
$ python3 setup.py develop
```
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
```{tip}
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.

View File

@ -2,7 +2,7 @@
# Installation for ARM CPUs
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
- CPU backend inference capabilities
- Relevant runtime environment variables

View File

@ -1,6 +1,6 @@
(installation-x86)=
(installation-cpu)=
# Installation for x86 CPUs
# Installation with CPU
vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
@ -151,4 +151,4 @@ $ python examples/offline_inference.py
$ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```
- Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node, along with one additional load balancer to dispatch requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. The Anyscale Ray project provides this feature for LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of setting up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
- Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node, along with one additional load balancer to dispatch requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. The Anyscale Ray project provides this feature for LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is an example of setting up scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).

View File

@ -1,8 +1,8 @@
(troubleshooting)=
(debugging)=
# Troubleshooting
# Debugging Tips
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
```{note}
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
@ -47,7 +47,6 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
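For example, in an offline script this is a one-line change (the model name is a placeholder):

```python
from vllm import LLM

# Disable CUDAGraph capture so the failing CUDA op surfaces with a clearer stack trace.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", enforce_eager=True)
```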
(troubleshooting-incorrect-hardware-driver)=
## Incorrect hardware/driver
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
@ -140,7 +139,7 @@ A multi-node environment is more complicated than a single-node one. If you see
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
```
(troubleshooting-python-multiprocessing)=
(debugging-python-multiprocessing)=
## Python multiprocessing
### `RuntimeError` Exception
@ -151,7 +150,7 @@ If you have seen a warning in your logs like this:
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
initialized. We must use the `spawn` multiprocessing start method. Setting
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing
https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
for more information.
```

View File

@ -1,6 +1,4 @@
(installation-gaudi)=
# Installation for Intel® Gaudi®
# Installation with Intel® Gaudi® AI Accelerators
This README provides instructions on running vLLM with Intel Gaudi devices.
@ -82,7 +80,7 @@ $ python setup.py develop
## Supported Features
- [Offline inference](#offline-inference)
- [Offline batched inference](#offline-batched-inference)
- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
- HPU autodetection - no need to manually select device within vLLM
- Paged KV cache with algorithms enabled for Intel Gaudi accelerators

View File

@ -1,8 +1,8 @@
(installation-cuda)=
(installation)=
# Installation for CUDA
# Installation
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) binaries.
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
## Requirements
@ -12,43 +12,24 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) bin
## Install released versions
### Create a new Python environment
You can create a new Python environment using `conda`:
You can install vLLM using pip:
```console
$ # (Recommended) Create a new conda environment.
$ conda create -n myenv python=3.12 -y
$ conda activate myenv
$ # Install vLLM with CUDA 12.1.
$ pip install vllm
```
```{note}
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
```
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
```console
$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
$ uv venv myenv --python 3.12 --seed
$ source myenv/bin/activate
```
In order to be performant, vLLM has to compile many CUDA kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different build configurations.
Therefore, it is recommended to install vLLM in a **fresh** environment. If you have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
### Install vLLM
You can install vLLM using either `pip` or `uv pip`:
```console
$ # Install vLLM with CUDA 12.4.
$ pip install vllm # If you are using pip.
$ uv pip install vllm # If you are using uv.
```
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
````{note}
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
```console
$ # Install vLLM with CUDA 11.8.
@ -57,47 +38,29 @@ $ export PYTHON_VERSION=310
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
```
In order to be performant, vLLM has to compile many CUDA kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different build configurations.
Therefore, it is recommended to install vLLM in a **fresh** conda environment. If you have a different CUDA version or want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
````
(install-the-latest-code)=
## Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
### Install the latest code using `pip`
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
```console
$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
`--pre` is required for `pip` to consider pre-released versions.
If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), then due to a limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
```console
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels; the actual versions of the wheels are contained in the wheel metadata (the wheels listed in the extra index URL have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
### Install the latest code using `uv`
Another way to install the latest code is to use `uv`:
```console
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
```
If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), you can specify the commit hash in the URL:
```console
$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
```
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
### Install the latest code using `docker`
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
Another way to access the latest code is to use the docker images:
@ -126,7 +89,7 @@ $ cd vllm
$ VLLM_USE_PRECOMPILED=1 pip install --editable .
```
This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
This will download the latest nightly wheel and use the compiled libraries from there in the install.
The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):

View File

@ -1,19 +0,0 @@
(installation-index)=
# Installation
vLLM supports the following hardware platforms:
```{toctree}
:maxdepth: 1
gpu-cuda
gpu-rocm
cpu-x86
cpu-arm
hpu-gaudi
tpu
xpu
openvino
neuron
```

View File

@ -1,6 +1,6 @@
(installation-neuron)=
# Installation for Neuron
# Installation with Neuron
vLLM 0.3.3 onwards supports model inference and serving on AWS Trainium/Inferentia via the Neuron SDK, with continuous batching.
Paged Attention and Chunked Prefill are currently in development and will be available soon.

View File

@ -1,8 +1,8 @@
(installation-openvino)=
# Installation for OpenVINO
# Installation with OpenVINO
vLLM powered by OpenVINO supports all LLM models from the [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). The OpenVINO vLLM backend supports the following advanced vLLM features:
vLLM powered by OpenVINO supports all LLM models from the {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). The OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)

View File

@ -2,20 +2,20 @@
# Quickstart
This guide will help you quickly get started with vLLM to perform:
This guide will help you quickly get started with vLLM to:
- [Offline batched inference](#quickstart-offline)
- [Online inference using OpenAI-compatible server](#quickstart-online)
- [Run offline batched inference](#offline-batched-inference)
- [Run OpenAI-compatible inference](#openai-compatible-server)
## Prerequisites
- OS: Linux
- Python: 3.9 -- 3.12
- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
## Installation
If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
```console
$ conda create -n myenv python=3.10 -y
@ -23,11 +23,9 @@ $ conda activate myenv
$ pip install vllm
```
```{note}
For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
```
Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM.
(quickstart-offline)=
(offline-batched-inference)=
## Offline Batched Inference
@ -75,7 +73,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
(quickstart-online)=
(openai-compatible-server)=
## OpenAI-Compatible Server

View File

@ -1,6 +1,6 @@
(installation-tpu)=
# Installation for TPUs
# Installation with TPU
Tensor Processing Units (TPUs) are Google's custom-developed application-specific
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs

View File

@ -1,6 +1,6 @@
(installation-xpu)=
# Installation for XPUs
# Installation with XPU
vLLM initially supports basic model inferencing and serving on Intel GPU platform.

View File

@ -50,7 +50,7 @@ For more information, check out the following:
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
- [vLLM Meetups](#meetups)
- {ref}`vLLM Meetups <meetups>`.
## Documentation
@ -58,68 +58,91 @@ For more information, check out the following:
:caption: Getting Started
:maxdepth: 1
getting_started/installation/index
getting_started/installation
getting_started/amd-installation
getting_started/openvino-installation
getting_started/cpu-installation
getting_started/gaudi-installation
getting_started/arm-installation
getting_started/neuron-installation
getting_started/tpu-installation
getting_started/xpu-installation
getting_started/quickstart
getting_started/debugging
getting_started/examples/examples_index
getting_started/troubleshooting
getting_started/faq
```
```{toctree}
:caption: Serving
:maxdepth: 1
serving/openai_compatible_server
serving/deploying_with_docker
serving/deploying_with_k8s
serving/deploying_with_helm
serving/deploying_with_nginx
serving/distributed_serving
serving/metrics
serving/integrations
serving/tensorizer
serving/runai_model_streamer
```
```{toctree}
:caption: Models
:maxdepth: 1
models/supported_models
models/generative_models
models/pooling_models
models/supported_models
models/extensions/index
models/adding_model
models/enabling_multimodal_inputs
```
```{toctree}
:caption: Features
:caption: Usage
:maxdepth: 1
features/quantization/index
features/lora
features/tool_calling
features/structured_outputs
features/automatic_prefix_caching
features/disagg_prefill
features/spec_decode
features/compatibility_matrix
usage/lora
usage/multimodal_inputs
usage/tool_calling
usage/structured_outputs
usage/spec_decode
usage/compatibility_matrix
usage/performance
usage/faq
usage/engine_args
usage/env_vars
usage/usage_stats
usage/disagg_prefill
```
```{toctree}
:caption: Inference and Serving
:caption: Quantization
:maxdepth: 1
serving/offline_inference
serving/openai_compatible_server
serving/multimodal_inputs
serving/distributed_serving
serving/metrics
serving/engine_args
serving/env_vars
serving/usage_stats
serving/integrations/index
quantization/supported_hardware
quantization/auto_awq
quantization/bnb
quantization/gguf
quantization/int8
quantization/fp8
quantization/fp8_e5m2_kvcache
quantization/fp8_e4m3_kvcache
```
```{toctree}
:caption: Deployment
:caption: Automatic Prefix Caching
:maxdepth: 1
deployment/docker
deployment/k8s
deployment/nginx
deployment/frameworks/index
deployment/integrations/index
automatic_prefix_caching/apc
automatic_prefix_caching/details
```
```{toctree}
:caption: Performance
:maxdepth: 1
performance/optimization
performance/benchmarks
```
@ -133,8 +156,10 @@ community/meetups
community/sponsors
```
% API Documentation: API reference aimed at vllm library usage
```{toctree}
:caption: API Reference
:caption: API Documentation
:maxdepth: 2
dev/sampling_params
@ -143,33 +168,30 @@ dev/offline_inference/offline_index
dev/engine/engine_index
```
% Design Documents: Details about vLLM internals
% Design: docs about vLLM internals
```{toctree}
:caption: Design Documents
:caption: Design
:maxdepth: 2
design/arch_overview
design/huggingface_integration
design/plugin_system
design/kernel/paged_attention
design/input_processing/model_inputs_index
design/kernel/paged_attention
design/multimodal/multimodal_index
design/automatic_prefix_caching
design/multiprocessing
```
% Developer Guide: How to contribute to the vLLM project
% For Developers: contributing to the vLLM project
```{toctree}
:caption: Developer Guide
:caption: For Developers
:maxdepth: 2
contributing/overview
contributing/profiling/profiling_index
contributing/dockerfile/dockerfile
contributing/model/index
contributing/vulnerability_management
```
# Indices and tables

View File

@ -0,0 +1,155 @@
(adding-a-new-model)=
# Adding a New Model
This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
```{note}
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
```
```{note}
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
please follow [this guide](#enabling-multimodal-inputs) after implementing the model here.
```
```{tip}
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository.
We will be happy to help you out!
```
## 0. Fork the vLLM repository
Start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
This gives you the ability to modify the codebase and test your model.
```{tip}
If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
```
## 1. Bring your model code
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the <gh-dir:vllm/model_executor/models> directory.
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
```{warning}
When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
```
## 2. Make your code compatible with vLLM
To ensure compatibility with vLLM, your model must meet the following requirements:
### Initialization Code
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.attn = Attention(prefix=f"{prefix}.attn")
class MyDecoderLayer(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
class MyModel(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
super().__init__()
self.layers = nn.ModuleList(
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
)
class MyModelForCausalLM(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
```
### Computation Code
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
```python
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
...
```
```{note}
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
```
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
## 3. (Optional) Implement tensor parallelism and quantization support
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:
- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. It also handles the weight loading and replication of the weight matrices.
Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
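As a minimal sketch of how these layers are typically combined (quantization- and weight-loading-related arguments such as {code}`quant_config` and {code}`prefix` are omitted here):

```python
from torch import nn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               RowParallelLinear)

class MyMLP(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Column-parallel: the weight is sharded along the output dimension.
        self.up_proj = ColumnParallelLinear(hidden_size, intermediate_size, bias=False)
        self.act = nn.SiLU()
        # Row-parallel: the weight is sharded along the input dimension and an
        # all-reduce combines the partial results across tensor-parallel ranks.
        self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False)

    def forward(self, x):
        x, _ = self.up_proj(x)    # vLLM linear layers return (output, bias)
        x = self.act(x)
        x, _ = self.down_proj(x)
        return x
```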
## 4. Implement the weight loading logic
You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class.
This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
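A sketch of what this method can look like, loosely modeled on existing implementations such as Llama (the {code}`stacked_params_mapping` entries are illustrative and depend on your layer names):

```python
from typing import Iterable, Tuple

import torch

from vllm.model_executor.model_loader.weight_utils import default_weight_loader

def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    # Checkpoint weights that were merged into a single vLLM layer:
    # (vllm_param_name, checkpoint_param_name, shard_id). Illustrative only.
    stacked_params_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        for vllm_name, ckpt_name, shard_id in stacked_params_mapping:
            if ckpt_name in name:
                name = name.replace(ckpt_name, vllm_name)
                param = params_dict[name]
                # Merged layers know how to place each shard.
                param.weight_loader(param, loaded_weight, shard_id)
                break
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
```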
## 5. Register your model
Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py>.
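The entry maps the architecture name from {code}`config.json` to the module and class implementing it; a hypothetical entry might look like the following (the exact table layout in {code}`registry.py` may differ):

```python
# vllm/model_executor/models/registry.py (hypothetical entry)
"MyModelForCausalLM": ("my_model", "MyModelForCausalLM"),
```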
## 6. Out-of-Tree Model Integration
You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see [plugin-system](#plugin-system).
To register the model, use the following code:
```python
from vllm import ModelRegistry
from your_code import YourModelForCausalLM
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
```
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
```python
from vllm import ModelRegistry
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
```
```{important}
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
Read more about that [here](#enabling-multimodal-inputs).
```
```{note}
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
```
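A hedged sketch of such a plugin package, assuming the hypothetical package name `vllm_add_my_model` and the `vllm.general_plugins` entry-point group described in the plugin system docs:

```python
# setup.py of the hypothetical out-of-tree package `vllm_add_my_model`
from setuptools import setup

setup(
    name="vllm_add_my_model",
    version="0.1",
    packages=["vllm_add_my_model"],
    entry_points={
        "vllm.general_plugins": [
            # Points to a register() function that calls
            # ModelRegistry.register_model(...) as shown above.
            "register_my_model = vllm_add_my_model:register",
        ]
    },
)
```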

View File

@ -2,11 +2,15 @@
# Enabling Multimodal Inputs
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
```{seealso}
[Adding a New Model](adding-a-new-model)
```
## 1. Update the base vLLM model
It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
Further update the model as follows:
- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.

View File

@ -1,8 +0,0 @@
# Built-in Extensions
```{toctree}
:maxdepth: 1
runai_model_streamer
tensorizer
```

View File

@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
## Online Inference
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.

View File

@ -106,7 +106,7 @@ A code example can be found here: <gh-file:examples/offline_inference_scoring.py
## Online Inference
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.

View File

@ -1,9 +1,9 @@
(supported-models)=
# List of Supported Models
# Supported Models
vLLM supports generative and pooling models across various tasks.
If a model supports more than one task, you can set the task via the `--task` argument.
If a model supports more than one task, you can set the task via the {code}`--task` argument.
For each task, we list the model architectures that have been implemented in vLLM.
Alongside each architecture, we include some popular models that use it.
@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it.
By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
To determine whether a given model is supported, you can check the `config.json` file inside the HF repository.
If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory.
To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository.
If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
````{tip}
The easiest way to check if your model is really supported at runtime is to run the program below:
@ -37,7 +37,7 @@ print(output)
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
````
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM.
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
### ModelScope
@ -48,7 +48,7 @@ To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFac
$ export VLLM_USE_MODELSCOPE=True
```
And use with `trust_remote_code=True`.
And use with {code}`trust_remote_code=True`.
```python
from vllm import LLM
@ -420,19 +420,16 @@ you should explicitly specify the task type to ensure that the model is used in
```
```{note}
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`.
```
```{note}
Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
despite being described otherwise on its model card.
Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
```
If your model is not in the above list, we will try to automatically convert the model using
@ -471,8 +468,8 @@ If your model is not in the above list, we will try to automatically convert the
{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
```{important}
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
```
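Putting the note above into a command, a serving sketch might look like the following; the model name and the placeholder token IDs are copied verbatim from the example and are not a tuned configuration:

```console
$ vllm serve peiyi9979/math-shepherd-mistral-7b-prm \
$     --override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'
```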
#### Classification (`--task classify`)
@ -540,13 +537,13 @@ The following modalities are supported depending on the model:
- **V**ideo
- **A**udio
Any combination of modalities joined by `+` is supported.
Any combination of modalities joined by {code}`+` is supported.
- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
On the other hand, modalities separated by `/` are mutually exclusive.
On the other hand, modalities separated by {code}`/` are mutually exclusive.
- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
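As a brief offline sketch of what that looks like (the model, chat template, and image path here are illustrative assumptions; the linked page is authoritative):

```python
from PIL import Image
from vllm import LLM

# Hypothetical example: a text + image prompt for a vision-language model.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # placeholder path

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```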
@ -643,14 +640,14 @@ See [this page](#generative-models) for more information on how to use generativ
- `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-
- ✅︎
- ✅︎
-
* - `LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- T + I<sup>+</sup> + V<sup>+</sup>
- `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
- ✅︎
-
* - `MiniCPMV`
- MiniCPM-V
- T + I<sup>E+</sup>
@ -713,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `Qwen/Qwen2-Audio-7B-Instruct`
-
- ✅︎
- ✅︎
-
* - `Qwen2VLForConditionalGeneration`
- Qwen2-VL
- T + I<sup>E+</sup> + V<sup>E+</sup>
@ -727,15 +724,15 @@ See [this page](#generative-models) for more information on how to use generativ
- `fixie-ai/ultravox-v0_3`
-
- ✅︎
- ✅︎
-
```
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
````{important}
To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
```python
llm = LLM(
@ -754,11 +751,11 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
```
```{note}
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
```
```{note}
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
```
@ -773,7 +770,7 @@ you should explicitly specify the task type to ensure that the model is used in
#### Text Embedding (`--task embed`)
Any text generation model can be converted into an embedding model by passing `--task embed`.
Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
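A minimal offline sketch of this conversion (the model name is only an example, and the pooling behavior of a converted model may not match a purpose-trained embedder):

```python
from vllm import LLM

# Hypothetical example: load a text generation checkpoint as an embedding model.
llm = LLM(model="facebook/opt-125m", task="embed")

(output,) = llm.embed("Hello, my name is")
print(len(output.outputs.embedding))  # embedding dimensionality
```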
```{note}
To get the best results, you should use pooling models that are specifically trained as such.
@ -821,7 +818,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
```{tip}
When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
```
3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.

View File

@ -37,10 +37,3 @@ model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
```
## OpenAI Compatible Server
Append the following to your 4bit model arguments:
```
--quantization bitsandbytes --load-format bitsandbytes
```
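Put together with the offline example above, a full server invocation could look like this (the model name is carried over from that example and is only an illustration):

```console
$ vllm serve huggyllama/llama-7b \
$     --quantization bitsandbytes \
$     --load-format bitsandbytes
```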

View File

@ -1,6 +1,6 @@
(quantization-supported-hardware)=
(supported-hardware-for-quantization)=
# Supported Hardware
# Supported Hardware for Quantization Kernels
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations
- ✗
```
## Notes:
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- "✅︎" indicates that the quantization method is supported on the specified hardware.
- "✗" indicates that the quantization method is not supported on the specified hardware.
```{note}
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
```

View File

Before / After: image file, 968 KiB (binary image change not rendered in this diff).

View File

@ -1,6 +1,6 @@
(deployment-bentoml)=
(deploying-with-bentoml)=
# BentoML
# Deploying with BentoML
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.

View File

@ -1,6 +1,6 @@
(deployment-cerebrium)=
(deploying-with-cerebrium)=
# Cerebrium
# Deploying with Cerebrium
```{raw} html
<p align="center">

View File

@ -1,6 +1,6 @@
(deployment-docker)=
(deploying-with-docker)=
# Using Docker
# Deploying with Docker
## Use vLLM's Official Docker Image
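For orientation, a typical invocation of the official image looks roughly like the following; the cache mount, token, port, and model are assumptions, and the page itself remains the authoritative reference:

```console
$ docker run --runtime nvidia --gpus all \
$     -v ~/.cache/huggingface:/root/.cache/huggingface \
$     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
$     -p 8000:8000 \
$     --ipc=host \
$     vllm/vllm-openai:latest \
$     --model mistralai/Mistral-7B-v0.1
```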

View File

@ -1,6 +1,6 @@
(deployment-dstack)=
(deploying-with-dstack)=
# dstack
# Deploying with dstack
```{raw} html
<p align="center">

View File

@ -1,6 +1,6 @@
(deployment-helm)=
(deploying-with-helm)=
# Helm
# Deploying with Helm
A Helm chart to deploy vLLM for Kubernetes
@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
## Architecture
```{image} /assets/deployment/architecture_helm_deployment.png
```{image} architecture_helm_deployment.png
```
## Values

View File

@ -1,6 +1,6 @@
(deployment-k8s)=
(deploying-with-k8s)=
# Using Kubernetes
# Deploying with Kubernetes
Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
@ -43,7 +43,7 @@ metadata:
name: hf-token-secret
namespace: default
type: Opaque
stringData:
data:
token: "REPLACE_WITH_TOKEN"
```
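Keep in mind that `data:` expects base64-encoded values, while `stringData:` accepts plaintext. As an equivalent sketch, the same secret can be created imperatively (the name and namespace mirror the manifest above):

```console
$ kubectl create secret generic hf-token-secret \
$     --namespace default \
$     --from-literal=token="REPLACE_WITH_TOKEN"
```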

View File

@ -1,6 +1,6 @@
(deployment-kserve)=
(deploying-with-kserve)=
# KServe
# Deploying with KServe
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.

View File

@ -1,6 +1,6 @@
(deployment-kubeai)=
(deploying-with-kubeai)=
# KubeAI
# Deploying with KubeAI
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.

View File

@ -1,6 +1,6 @@
(deployment-lws)=
(deploying-with-lws)=
# LWS
# Deploying with LWS
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
A major use case is for multi-host/multi-node distributed inference.

View File

@ -1,6 +1,6 @@
(nginxloadbalancer)=
# Using Nginx
# Deploying with Nginx Loadbalancer
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.

View File

@ -1,5 +1,5 @@
(deployment-triton)=
(deploying-with-triton)=
# NVIDIA Triton
# Deploying with NVIDIA Triton
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.

View File

@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
```
## Running vLLM on a single node
## Details for Distributed Inference and Serving
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or Python native multiprocessing. Multiprocessing can be used when deploying on a single node; multi-node inference currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
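A small sketch of overriding that default from Python (the model and parallel size are placeholders):

```python
from vllm import LLM

# Hypothetical example: force multiprocessing instead of Ray on a single node.
llm = LLM(
    "facebook/opt-13b",
    tensor_parallel_size=2,
    distributed_executor_backend="mp",
)
```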
To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
```python
from vllm import LLM
@ -32,14 +32,14 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
output = llm.generate("San Francisco is a")
```
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
```console
$ vllm serve facebook/opt-13b \
$ --tensor-parallel-size 4
```
You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
```console
$ vllm serve gpt2 \
@ -47,7 +47,7 @@ $ --tensor-parallel-size 4 \
$ --pipeline-parallel-size 2
```
## Running vLLM on multiple nodes
## Multi-Node Inference and Serving
If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use Docker images to ensure the same environment, and to hide the heterogeneity of the host machines by mapping them into the same Docker configuration.
@ -95,7 +95,7 @@ $ --tensor-parallel-size 16
To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
```{warning}
After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for processes on the same node, not for processes on other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for processes on the same node, not for processes on other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
```
```{warning}

View File

@ -0,0 +1,17 @@
# Integrations
```{toctree}
:maxdepth: 1
run_on_sky
deploying_with_kserve
deploying_with_kubeai
deploying_with_triton
deploying_with_bentoml
deploying_with_cerebrium
deploying_with_lws
deploying_with_dstack
serving_with_langchain
serving_with_llamaindex
serving_with_llamastack
```

View File

@ -1,8 +0,0 @@
# External Integrations
```{toctree}
:maxdepth: 1
langchain
llamaindex
```

View File

@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI compatible API server.
You can start the server using Python, or using [Docker](#deployment-docker):
You can start the server using Python, or using [Docker](deploying_with_docker.md):
```console
$ vllm serve unsloth/Llama-3.2-1B-Instruct
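$ # Hypothetical follow-up, not part of the original snippet: once the server is up,
$ # the Prometheus-format metrics can be scraped from the default port.
$ curl http://localhost:8000/metrics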

View File

@ -1,79 +0,0 @@
(offline-inference)=
# Offline Inference
You can run vLLM in your own code on a list of prompts.
The offline API is based on the {class}`~vllm.LLM` class.
To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
and runs it in vLLM using the default configuration.
```python
llm = LLM(model="facebook/opt-125m")
```
After initializing the `LLM` instance, you can perform model inference using various APIs.
The available APIs depend on the type of model that is being run:
- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text.
- [Pooling models](#pooling-models) output their hidden states directly.
Please refer to the above pages for more details about each API.
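For example, continuing from the initialization above, a generative model is driven through `LLM.generate` (a minimal sketch; the prompt is arbitrary):

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # as initialized above
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```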
```{seealso}
[API Reference](/dev/offline_inference/offline_index)
```
## Configuration Options
This section lists the most common options for running the vLLM engine.
For a full list, refer to the [Engine Arguments](#engine-args) page.
### Reducing memory usage
Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
#### Tensor Parallelism (TP)
Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
The following code splits the model across 2 GPUs.
```python
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
```
```{important}
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
```
#### Quantization
Quantized models take less memory at the cost of lower precision.
Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic))
and used directly without extra configuration.
Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details.
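For instance, the in-flight bitsandbytes quantization shown elsewhere in these docs can be requested directly through that option (a sketch, not a recommendation for this particular model):

```python
from vllm import LLM

# Hypothetical example: quantize the weights at load time with bitsandbytes.
llm = LLM(model="huggyllama/llama-7b",
          quantization="bitsandbytes",
          load_format="bitsandbytes")
```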
#### Context length and batch size
You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
and the maximum batch size (`max_num_seqs` option).
```python
llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
```
### Performance optimization and tuning
You can potentially improve the performance of vLLM by tuning various options.
Please refer to [this guide](#optimization-and-tuning) for more details.

View File

@ -1,10 +1,8 @@
(openai-compatible-server)=
# OpenAI Compatible Server
# OpenAI-Compatible Server
vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more!
vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md):
```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
```
@ -219,7 +217,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information.
- *Note: `image_url.detail` parameter is not supported.*
Code example: <gh-file:examples/openai_chat_completion_client.py>
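A hedged sketch of such a multimodal request with the official client (the base URL, API key, model, and image URL are assumptions about your deployment):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",  # placeholder vision-capable model
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }],
)
print(response.choices[0].message.content)
```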

Some files were not shown because too many files have changed in this diff.