Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: correct-do ... v1-blockta (22 commits)
Commits (22): 7097f31955, f840b53063, 1ca4298b9b, ba64a0249f, 1260e43230, a6e5d7b5b7, ebfbe1244b, 6ba31aa5f6, 34d6cc2aea, 27e8eb2e94, ca4f9e69a8, 52922193cd, bef68163a0, ff5b1033dc, b938606993, 3fdbd8e2f5, 0420fb2c7b, ee965c9c69, 0a669eed7b, 03b1e6fdbd, 8a4180c8b6, 1aaced5830
@@ -1,6 +1,5 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
@@ -11,11 +10,12 @@ steps:
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh

- wait

- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
plugins:
- kubernetes:
podSpec:
@@ -49,7 +49,6 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -74,7 +73,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
depends_on: ~
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
|
||||
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio \
|
||||
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||
@ -46,26 +46,26 @@ function cpu_tests() {
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online inference
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
|
@ -3,18 +3,6 @@
|
||||
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
|
||||
set -e
|
||||
set -v
|
||||
|
||||
image_name="neuron/vllm-ci"
|
||||
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||
@ -25,30 +13,41 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
||||
last_build=$(cat /tmp/neuron-docker-build-timestamp)
|
||||
current_time=$(date +%s)
|
||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||
docker image prune -f
|
||||
docker system prune -f
|
||||
rm -rf "${HF_MOUNT:?}/*"
|
||||
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
|
||||
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||
docker build -t neuron -f Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
docker image rm -f "${image_name}" || true;
|
||||
}
|
||||
remove_docker_container() { docker rm -f neuron || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image
|
||||
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
${image_name} \
|
||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
|
||||
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
|
||||
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
|
||||
|
||||
# Wait for the server to start
|
||||
wait_for_server_to_start() {
|
||||
timeout=300
|
||||
counter=0
|
||||
|
||||
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
|
||||
sleep 1
|
||||
counter=$((counter + 1))
|
||||
if [ $counter -ge $timeout ]; then
|
||||
echo "Timeout after $timeout seconds"
|
||||
break
|
||||
fi
|
||||
done
|
||||
}
|
||||
wait_for_server_to_start
|
||||
|
||||
# Test a simple prompt
|
||||
curl -X POST -H "Content-Type: application/json" \
|
||||
localhost:8000/generate \
|
||||
-d '{"prompt": "San Francisco is a"}'
|
||||
|
@ -242,7 +242,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||
@ -363,14 +363,12 @@ steps:
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/audio_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
@ -535,7 +533,6 @@ steps:
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
.github/ISSUE_TEMPLATE/600-new-model.yml (vendored, 2 changes)
@@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

@ -193,6 +193,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
@ -200,6 +201,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/prepare_inputs/copy_subranges.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@ -550,7 +552,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
|
||||
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
Dockerfile (13 changes)
@ -234,8 +234,8 @@ RUN mv vllm test_docs/
|
||||
#################### TEST IMAGE ####################
|
||||
|
||||
#################### OPENAI API SERVER ####################
|
||||
# base openai image with additional requirements, for any subsequent openai-style images
|
||||
FROM vllm-base AS vllm-openai-base
|
||||
# openai api server alternative
|
||||
FROM vllm-base AS vllm-openai
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
@ -247,14 +247,5 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
# define sagemaker first, so it is not default from `docker build`
|
||||
FROM vllm-openai-base AS vllm-sagemaker
|
||||
|
||||
COPY examples/sagemaker-entrypoint.sh .
|
||||
RUN chmod +x sagemaker-entrypoint.sh
|
||||
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
||||
|
||||
FROM vllm-openai-base AS vllm-openai
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
#################### OPENAI API SERVER ####################
|
||||
|
@ -15,8 +15,8 @@ RUN apt-get update && \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
### Mount Point ###
|
||||
# When launching the container, mount the code directory to /workspace
|
||||
ARG APP_MOUNT=/workspace
|
||||
# When launching the container, mount the code directory to /app
|
||||
ARG APP_MOUNT=/app
|
||||
VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
@ -25,7 +25,6 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@ -43,7 +42,4 @@ RUN --mount=type=bind,source=.git,target=.git \
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# overwrite entrypoint to run bash script
|
||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
@@ -77,7 +77,7 @@ pip install vllm
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

## Contributing

@@ -4,7 +4,7 @@

If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/).
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).

---

@@ -53,7 +53,7 @@ void paged_attention_v1_launcher(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int num_kv_heads, float scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
    const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
    const int blocksparse_vert_stride, const int blocksparse_block_size,
    const int blocksparse_head_sliding_step) {
@@ -176,7 +176,7 @@ void paged_attention_v1(
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
    int64_t block_size, int64_t max_seq_len,
    const std::optional<torch::Tensor>& alibi_slopes,
    const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, double k_scale, double v_scale,
    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,

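Most of the C++ hunks in this comparison only swap `std::optional` and `c10::optional` in kernel signatures. As a minimal sketch (assuming a recent libtorch release where `c10::optional` is an alias of `std::optional`; on older releases it was a distinct type and this check fails), the two spellings can be verified to be interchangeable at call sites:

```cpp
// Sketch: confirm that c10::optional and std::optional are the same type on
// this libtorch build, so the signature churn above is purely cosmetic.
// Assumption: a recent libtorch release where the alias holds.
#include <c10/util/Optional.h>
#include <optional>
#include <type_traits>
#include <iostream>

int main() {
  static_assert(std::is_same_v<c10::optional<int>, std::optional<int>>,
                "c10::optional is a distinct type on this libtorch build");
  c10::optional<int> a = 42;   // usable wherever std::optional<int> is expected
  std::optional<int> b = a;    // no conversion needed when the alias holds
  std::cout << *b << "\n";     // prints 42
}
```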
@ -54,7 +54,7 @@ void paged_attention_v2_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||
const int blocksparse_head_sliding_step) {
|
||||
@ -187,7 +187,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
|
@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes) {
|
||||
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -459,7 +459,7 @@ void paged_attention_v1(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
||||
int max_seq_len, const std::optional<torch::Tensor>& alibi_slopes) {
|
||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -781,7 +781,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
|
@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& b, // [IC, OC], column-major
|
||||
const torch::Tensor& a_scales, // [1] or [M]
|
||||
const torch::Tensor& b_scales, // [1] or [OC]
|
||||
const std::optional<torch::Tensor>& bias // [OC]
|
||||
const c10::optional<torch::Tensor>& bias // [OC]
|
||||
) {
|
||||
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
|
||||
// Checks for conformality
|
||||
@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
|
||||
const torch::Tensor& a_scales, // [1] or [M]
|
||||
const torch::Tensor& b_scales, // [1] or [OC]
|
||||
const torch::Tensor& azp_adj, // [OC]
|
||||
const std::optional<torch::Tensor>& azp, // [1] or [M]
|
||||
const std::optional<torch::Tensor>& bias // [OC]
|
||||
const c10::optional<torch::Tensor>& azp, // [1] or [M]
|
||||
const c10::optional<torch::Tensor>& bias // [OC]
|
||||
) {
|
||||
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
|
||||
// Checks for conformality
|
||||
@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major
|
||||
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
const torch::Tensor& input, // [..., hidden_size]
|
||||
const torch::Tensor& scale,
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant(
|
||||
torch::Tensor& out, // [..., hidden_size]
|
||||
const torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& scale, // [..., 1]
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
|
@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids);
|
||||
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
|
||||
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||
const torch::Tensor& b_scales,
|
||||
const std::optional<torch::Tensor>& bias);
|
||||
const c10::optional<torch::Tensor>& bias);
|
||||
|
||||
void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
|
||||
const torch::Tensor& b, const torch::Tensor& a_scales,
|
||||
const torch::Tensor& b_scales,
|
||||
const torch::Tensor& azp_adj,
|
||||
const std::optional<torch::Tensor>& azp,
|
||||
const std::optional<torch::Tensor>& bias);
|
||||
const c10::optional<torch::Tensor>& azp,
|
||||
const c10::optional<torch::Tensor>& bias);
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
@@ -47,3 +47,11 @@
  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
    hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif

// #ifndef USE_ROCM
//   #define VLLM_cudaHostGetDevicePointer(device_ptr, host_ptr, flags) \
//     cudaHostGetDevicePointer(device_ptr, host_ptr, flags)
// #else
//   #define VLLM_cudaHostGetDevicePointer(device_ptr, host_ptr, flags) \
//     hipHostGetDevicePointer(device_ptr, host_ptr, flags)
// #endif

csrc/cuda_view.cu (new file, 43 lines)
@@ -0,0 +1,43 @@
#include <torch/all.h>
#include <torch/cuda.h>

// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
  TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
  TORCH_CHECK(cpu_tensor.is_contiguous(), "Input tensor must be contiguous");

  // Get raw host pointer from CPU tensor
  void* host_ptr = cpu_tensor.data_ptr();

  // Get a device pointer corresponding to the pinned host memory
  void* device_ptr = nullptr;
  cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
  TORCH_CHECK(err == cudaSuccess,
              "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));

  // Construct a CUDA tensor from the device pointer.
  // We'll use the same sizes, strides, and dtype as the CPU tensor.
  auto sizes = cpu_tensor.sizes();
  auto strides = cpu_tensor.strides();
  auto options =
      cpu_tensor.options().device(torch::kCUDA);  // Change device to CUDA

  // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
  // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
  // memory, so we don't free it here.
  auto deleter = [](void*) {
    // no-op, since the memory is owned by the original CPU tensor
  };

  torch::Tensor cuda_tensor =
      torch::from_blob(device_ptr, sizes, strides, deleter, options);

  TORCH_CHECK(cuda_tensor.device().is_cuda(),
              "Resulting tensor is not on CUDA device");
  TORCH_CHECK(cuda_tensor.sizes().equals(sizes), "Size mismatch");
  TORCH_CHECK(cuda_tensor.strides().equals(strides), "Stride mismatch");
  TORCH_CHECK(cuda_tensor.dtype() == cpu_tensor.dtype(), "Dtype mismatch");

  return cuda_tensor;
}

|
||||
// This overload handles the case where there might not be a tensor, in which
|
||||
// case a nullptr is passed and a constant (0) is used.
|
||||
template <typename Descriptor, typename T>
|
||||
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
|
||||
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
|
||||
static_assert(std::is_same_v<Descriptor, RowOrZeroLoad<T>>);
|
||||
using Arguments = typename Descriptor::Arguments;
|
||||
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
|
||||
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
|
||||
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
torch::Tensor const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
@ -67,7 +67,7 @@ struct ScaledEpilogueBase {
|
||||
// This overload handles the case where there might not be a tensor, in which
|
||||
// case a nullptr is passed and a constant (0) is used.
|
||||
template <typename Descriptor, typename T>
|
||||
static auto args_from_tensor(std::optional<torch::Tensor> const& tensor) {
|
||||
static auto args_from_tensor(c10::optional<torch::Tensor> const& tensor) {
|
||||
using Arguments = typename Descriptor::Arguments;
|
||||
auto* data_ptr = tensor ? static_cast<T*>(tensor->data_ptr()) : nullptr;
|
||||
static_assert(std::is_same_v<Descriptor, ColLoad<T, true>> ||
|
||||
@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp
|
||||
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
torch::Tensor const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
|
||||
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
|
||||
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
|
||||
|
@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
|
||||
|
||||
template <typename Stride>
|
||||
static inline auto maybe_make_cute_layout(
|
||||
std::optional<torch::Tensor> const& tensor,
|
||||
c10::optional<torch::Tensor> const& tensor,
|
||||
std::string_view name = "tensor") {
|
||||
using Layout = decltype(make_cute_layout<Stride>(*tensor));
|
||||
|
||||
|
@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
|
||||
const at::Tensor x,
|
||||
const at::Tensor weight,
|
||||
const at::Tensor out,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const c10::optional<at::Tensor>& bias,
|
||||
bool silu_activation,
|
||||
int64_t pad_slot_id,
|
||||
const std::optional<at::Tensor>& query_start_loc = std::nullopt,
|
||||
const std::optional<at::Tensor>& cache_indices = std::nullopt,
|
||||
const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
|
||||
const c10::optional<at::Tensor>& query_start_loc = std::nullopt,
|
||||
const c10::optional<at::Tensor>& cache_indices = std::nullopt,
|
||||
const c10::optional<at::Tensor>& has_initial_state = std::nullopt) {
|
||||
|
||||
// Reset the parameters
|
||||
memset(¶ms, 0, sizeof(params));
|
||||
@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms,
|
||||
|
||||
|
||||
void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||
const std::optional<at::Tensor> &bias_,
|
||||
const std::optional<at::Tensor> &conv_states,
|
||||
const std::optional<at::Tensor> &query_start_loc,
|
||||
const std::optional<at::Tensor> &cache_indices,
|
||||
const std::optional<at::Tensor> &has_initial_state,
|
||||
const c10::optional<at::Tensor> &bias_,
|
||||
const c10::optional<at::Tensor> &conv_states,
|
||||
const c10::optional<at::Tensor> &query_start_loc,
|
||||
const c10::optional<at::Tensor> &cache_indices,
|
||||
const c10::optional<at::Tensor> &has_initial_state,
|
||||
bool silu_activation,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||
void causal_conv1d_update(const at::Tensor &x,
|
||||
const at::Tensor &conv_state,
|
||||
const at::Tensor &weight,
|
||||
const std::optional<at::Tensor> &bias_,
|
||||
const c10::optional<at::Tensor> &bias_,
|
||||
bool silu_activation,
|
||||
const std::optional<at::Tensor> &cache_seqlens_,
|
||||
const std::optional<at::Tensor> &conv_state_indices_,
|
||||
const c10::optional<at::Tensor> &cache_seqlens_,
|
||||
const c10::optional<at::Tensor> &conv_state_indices_,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
int64_t pad_slot_id) {
|
||||
|
@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
|
||||
const torch::Tensor out,
|
||||
const torch::Tensor z,
|
||||
const torch::Tensor out_z,
|
||||
const std::optional<at::Tensor>& D,
|
||||
const std::optional<at::Tensor>& delta_bias,
|
||||
const c10::optional<at::Tensor>& D,
|
||||
const c10::optional<at::Tensor>& delta_bias,
|
||||
const torch::Tensor ssm_states,
|
||||
bool has_z,
|
||||
bool delta_softplus,
|
||||
const std::optional<at::Tensor>& query_start_loc,
|
||||
const std::optional<at::Tensor>& cache_indices,
|
||||
const std::optional<at::Tensor>& has_initial_state,
|
||||
const c10::optional<at::Tensor>& query_start_loc,
|
||||
const c10::optional<at::Tensor>& cache_indices,
|
||||
const c10::optional<at::Tensor>& has_initial_state,
|
||||
bool varlen,
|
||||
int64_t pad_slot_id) {
|
||||
|
||||
@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
|
||||
|
||||
void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C,
|
||||
const std::optional<torch::Tensor> &D_,
|
||||
const std::optional<torch::Tensor> &z_,
|
||||
const std::optional<torch::Tensor> &delta_bias_,
|
||||
const c10::optional<torch::Tensor> &D_,
|
||||
const c10::optional<torch::Tensor> &z_,
|
||||
const c10::optional<torch::Tensor> &delta_bias_,
|
||||
bool delta_softplus,
|
||||
const std::optional<torch::Tensor> &query_start_loc,
|
||||
const std::optional<torch::Tensor> &cache_indices,
|
||||
const std::optional<torch::Tensor> &has_initial_state,
|
||||
const c10::optional<torch::Tensor> &query_start_loc,
|
||||
const c10::optional<torch::Tensor> &cache_indices,
|
||||
const c10::optional<torch::Tensor> &has_initial_state,
|
||||
const torch::Tensor &ssm_states,
|
||||
// used to identify padding entries if cache_indices provided
|
||||
// in case of padding, the kernel will return early
|
||||
|
csrc/ops.h (51 changes)
@ -33,7 +33,7 @@ void paged_attention_v1(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -44,7 +44,7 @@ void paged_attention_v2(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
@ -115,6 +115,11 @@ void advance_step_flashinfer(
|
||||
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
|
||||
|
||||
void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
|
||||
torch::Tensor& matrix_tgt, int64_t n);
|
||||
|
||||
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||
const torch::Tensor& codebooks,
|
||||
@ -153,15 +158,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability);
|
||||
|
||||
@ -169,7 +174,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& e,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
|
||||
torch::Tensor& e, torch::Tensor const& a);
|
||||
@ -177,11 +182,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed,
|
||||
|
||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> const& azp);
|
||||
c10::optional<torch::Tensor> const& azp);
|
||||
|
||||
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
torch::Tensor& scales,
|
||||
std::optional<torch::Tensor> const& azp);
|
||||
c10::optional<torch::Tensor> const& azp);
|
||||
|
||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||
torch::Tensor b_gptq_qzeros,
|
||||
@ -198,34 +203,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
|
||||
void dynamic_per_token_scaled_fp8_quant(
|
||||
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
|
||||
std::optional<torch::Tensor> const& scale_ub);
|
||||
c10::optional<torch::Tensor> const& scale_ub);
|
||||
|
||||
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
||||
const torch::Tensor& A, const torch::Tensor& B,
|
||||
const torch::Tensor& C,
|
||||
const std::optional<torch::Tensor>& D_,
|
||||
const std::optional<torch::Tensor>& z_,
|
||||
const std::optional<torch::Tensor>& delta_bias_,
|
||||
const c10::optional<torch::Tensor>& D_,
|
||||
const c10::optional<torch::Tensor>& z_,
|
||||
const c10::optional<torch::Tensor>& delta_bias_,
|
||||
bool delta_softplus,
|
||||
const std::optional<torch::Tensor>& query_start_loc,
|
||||
const std::optional<torch::Tensor>& cache_indices,
|
||||
const std::optional<torch::Tensor>& has_initial_state,
|
||||
const c10::optional<torch::Tensor>& query_start_loc,
|
||||
const c10::optional<torch::Tensor>& cache_indices,
|
||||
const c10::optional<torch::Tensor>& has_initial_state,
|
||||
const torch::Tensor& ssm_states, int64_t pad_slot_id);
|
||||
|
||||
void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state,
|
||||
const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& bias_,
|
||||
const c10::optional<at::Tensor>& bias_,
|
||||
bool silu_activation,
|
||||
const std::optional<at::Tensor>& cache_seqlens_,
|
||||
const std::optional<at::Tensor>& conv_state_indices_,
|
||||
const c10::optional<at::Tensor>& cache_seqlens_,
|
||||
const c10::optional<at::Tensor>& conv_state_indices_,
|
||||
int64_t pad_slot_id);
|
||||
|
||||
void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& bias_,
|
||||
const std::optional<at::Tensor>& conv_states,
|
||||
const std::optional<at::Tensor>& query_start_loc,
|
||||
const std::optional<at::Tensor>& cache_indices,
|
||||
const std::optional<at::Tensor>& has_initial_state,
|
||||
const c10::optional<at::Tensor>& bias_,
|
||||
const c10::optional<at::Tensor>& conv_states,
|
||||
const c10::optional<at::Tensor>& query_start_loc,
|
||||
const c10::optional<at::Tensor>& cache_indices,
|
||||
const c10::optional<at::Tensor>& has_initial_state,
|
||||
bool silu_activation, int64_t pad_slot_id);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
csrc/prepare_inputs/copy_subranges.cu (new file, 75 lines)
@@ -0,0 +1,75 @@
#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

namespace vllm {
__global__ void copy_subranges_kernel(const int* __restrict__ matrix_src,
                                      const int* __restrict__ matrix_diff,
                                      int* __restrict__ matrix_tgt, int64_t M) {
  int row_id = blockIdx.x;
  int row_offset = row_id * M;

  int start = matrix_diff[row_id * 2];
  int length = matrix_diff[row_id * 2 + 1];
  int end = start + length;
  int thread_idx = threadIdx.x;
  for (int i = start + thread_idx; i < end; i += blockDim.x) {
    int idx = row_offset + i;
    matrix_tgt[idx] = matrix_src[idx];
  }
}
}  // namespace vllm

void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
                    torch::Tensor& matrix_tgt, int64_t n) {
  // NOTE(woosuk): Here, we skip most of the error checking to minimize the
  // CPU overheads. We assume that the caller will pass the correct inputs.

  // Check tensor properties
  // TORCH_CHECK(matrix_src.is_cuda(), "matrix_src must be a CUDA tensor");
  // TORCH_CHECK(matrix_diff.is_cuda(), "matrix_diff must be a CUDA tensor");
  // TORCH_CHECK(matrix_tgt.is_cuda(), "matrix_tgt must be a CUDA tensor");
  // TORCH_CHECK(matrix_src.is_contiguous(), "matrix_src must be contiguous");
  // TORCH_CHECK(matrix_diff.is_contiguous(), "matrix_diff must be contiguous");
  // TORCH_CHECK(matrix_tgt.is_contiguous(), "matrix_tgt must be contiguous");

  auto src_sizes = matrix_src.sizes();
  auto diff_sizes = matrix_diff.sizes();
  auto tgt_sizes = matrix_tgt.sizes();

  // TORCH_CHECK(src_sizes.size() == 2, "matrix_src must be 2D");
  // TORCH_CHECK(diff_sizes.size() == 2, "matrix_diff must be 2D");
  // TORCH_CHECK(tgt_sizes.size() == 2, "matrix_tgt must be 2D");

  int64_t N = src_sizes[0];
  int64_t M = src_sizes[1];

  // TORCH_CHECK(diff_sizes[0] == N, "matrix_diff first dim must match N");
  // TORCH_CHECK(diff_sizes[1] == 2, "matrix_diff second dim must be 2");
  // TORCH_CHECK(tgt_sizes[0] == N && tgt_sizes[1] == M,
  //             "matrix_tgt must have same shape as matrix_src");

  // TORCH_CHECK(n <= N, "n must be <= N");

  const int* d_matrix_src = matrix_src.data_ptr<int>();
  const int* d_matrix_diff = matrix_diff.data_ptr<int>();
  int* d_matrix_tgt = matrix_tgt.data_ptr<int>();

  // One thread block per row.
  int blocks = n;
  int threads;
  if (blocks < 128) {
    threads = 1024;
  } else if (blocks < 256) {
    threads = 512;
  } else if (blocks < 512) {
    threads = 256;
  } else {
    threads = 128;
  }
  const at::cuda::OptionalCUDAGuard device_guard(device_of(matrix_tgt));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  vllm::copy_subranges_kernel<<<blocks, threads, 0, stream>>>(
      d_matrix_src, d_matrix_diff, d_matrix_tgt, M);
}

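A host-side sketch of how the new op could be invoked (the shapes and values are made up for illustration, not taken from vLLM's Python bindings; `copy_subranges` is declared in csrc/ops.h above): each row `i` of `matrix_diff` holds a `(start, length)` pair, and the kernel copies `matrix_src[i, start:start+length]` into `matrix_tgt` for the first `n` rows, one thread block per row.

```cpp
// Sketch: exercise copy_subranges on small int32 CUDA tensors.
// Assumes this translation unit is compiled into the same extension as the
// kernel above; the 4x16 shapes below are illustrative.
#include <torch/all.h>

void copy_subranges(torch::Tensor& matrix_src, torch::Tensor& matrix_diff,
                    torch::Tensor& matrix_tgt, int64_t n);

void demo_copy_subranges() {
  const int64_t N = 4, M = 16;
  auto opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
  auto src = torch::arange(N * M, opts).reshape({N, M});
  auto tgt = torch::zeros({N, M}, opts);
  // One (start, length) pair per row: copy columns [2, 2+5) of every row.
  auto diff = torch::tensor({2, 5, 2, 5, 2, 5, 2, 5}, opts).reshape({N, 2});
  copy_subranges(src, diff, tgt, /*n=*/N);  // launches one block per row
}
```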
@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel(
|
||||
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor const& input, // [..., hidden_size]
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> const& azp) {
|
||||
c10::optional<torch::Tensor> const& azp) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(scale.numel() == 1);
|
||||
@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||
void dynamic_scaled_int8_quant(
|
||||
torch::Tensor& out, // [..., hidden_size]
|
||||
torch::Tensor const& input, // [..., hidden_size]
|
||||
torch::Tensor& scales, std::optional<torch::Tensor> const& azp) {
|
||||
torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
|
||||
TORCH_CHECK(input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(scales.is_contiguous());
|
||||
|
@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
|
@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
|
||||
|
@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
#if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X
|
||||
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
|
||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
||||
void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
||||
@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
||||
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
|
||||
@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
torch::Tensor const& azp_adj,
|
||||
std::optional<torch::Tensor> const& azp,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& azp,
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
|
||||
|
@ -834,7 +834,6 @@ __global__ void Marlin(
|
||||
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
|
||||
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
|
||||
int4* sh_s = sh_zp + (stages * zp_sh_stage);
|
||||
int4* sh_red = sh_s + (stages * s_sh_stage);
|
||||
|
||||
// Register storage for double buffer of shared memory reads.
|
||||
FragA frag_a[2][thread_m_blocks];
|
||||
@ -933,11 +932,11 @@ __global__ void Marlin(
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
// Only fetch scales if this tile starts a new group
|
||||
if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
|
||||
if (pipe % (group_blocks / thread_k_blocks) == 0) {
|
||||
if (s_sh_wr_pred) {
|
||||
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
|
||||
}
|
||||
s_gl_rd += s_gl_rd_delta;
|
||||
}
|
||||
} else {
|
||||
@ -1039,7 +1038,9 @@ __global__ void Marlin(
|
||||
// No act-order case
|
||||
if constexpr (group_blocks != -1) {
|
||||
if constexpr (group_blocks >= thread_k_blocks) {
|
||||
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
|
||||
int4* sh_s_stage =
|
||||
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
|
||||
(pipe / (group_blocks / thread_k_blocks)));
|
||||
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
|
||||
} else {
|
||||
int warp_id = threadIdx.x / 32;
|
||||
@ -1338,15 +1339,15 @@ __global__ void Marlin(
|
||||
int red_sh_wr =
|
||||
red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
|
||||
if (i < red_off) {
|
||||
float* c_rd = reinterpret_cast<float*>(
|
||||
&sh_red[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
|
||||
float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 4; k++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
|
||||
c_rd[k] + c_wr[k];
|
||||
}
|
||||
sh_red[red_sh_wr] =
|
||||
sh[red_sh_wr] =
|
||||
reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
|
||||
}
|
||||
}
|
||||
@ -1356,7 +1357,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4 * 2; i++) {
|
||||
float* c_rd =
|
||||
reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
|
||||
reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 4; j++)
|
||||
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
|
||||
@ -1396,7 +1397,7 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
cp_async4_pred(
|
||||
&sh_red[c_sh_wr + c_sh_wr_delta * i],
|
||||
&sh[c_sh_wr + c_sh_wr_delta * i],
|
||||
&C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
|
||||
c_gl_wr_delta_i * (i % 2)],
|
||||
i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
|
||||
@ -1409,7 +1410,7 @@ __global__ void Marlin(
|
||||
for (int i = 0; i < thread_m_blocks * 4; i++) {
|
||||
if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
|
||||
if (!first) {
|
||||
int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
|
||||
int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 2 * 4; j++) {
|
||||
reinterpret_cast<float*>(
|
||||
@ -1460,10 +1461,10 @@ __global__ void Marlin(
|
||||
float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < th_size; k++) {
|
||||
sh_red[threadIdx.x] =
|
||||
sh[threadIdx.x] =
|
||||
C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
|
||||
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
|
||||
float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
|
||||
#pragma unroll
|
||||
for (int f = 0; f < 4; f++) {
|
||||
frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
|
||||
@ -1514,7 +1515,7 @@ __global__ void Marlin(
|
||||
res = __hmul2(res, s[0]);
|
||||
}
|
||||
|
||||
((scalar_t2*)sh_red)[idx] = res;
|
||||
((scalar_t2*)sh)[idx] = res;
|
||||
};
|
||||
|
||||
if (threadIdx.x / 32 < thread_n_blocks / 4) {
|
||||
@ -1542,7 +1543,7 @@ __global__ void Marlin(
|
||||
i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
|
||||
i++) {
|
||||
if (c_gl_wr < c_gl_wr_end) {
|
||||
C[c_gl_wr] = sh_red[c_sh_rd];
|
||||
C[c_gl_wr] = sh[c_sh_rd];
|
||||
c_gl_wr += c_gl_wr_delta;
|
||||
c_sh_rd += c_sh_rd_delta;
|
||||
}
|
||||
@ -1864,12 +1865,9 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
||||
float pipe_size = (a_size + b_size) * pipe_stages;
|
||||
|
||||
float reduce_size = max(th_config.num_threads * 32 * 4,
|
||||
(tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
|
||||
|
||||
TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
|
||||
|
||||
return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
|
||||
}
|
||||
|
||||
bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
|
||||
|
@ -63,7 +63,7 @@ torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) {
|
||||
|
||||
|
||||
static inline std::optional<at::ScalarType> maybe_scalartype(
|
||||
std::optional<at::Tensor> const& t) {
|
||||
c10::optional<at::Tensor> const& t) {
|
||||
if (!t) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
|
@ -183,11 +183,11 @@ struct MacheteKernelTemplate {
|
||||
torch::Tensor const& A, // MxK matrix
|
||||
torch::Tensor const& B, // KxN prepacked matrix
|
||||
torch::Tensor& D, // MxN matrix
|
||||
std::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
std::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
std::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
c10::optional<torch::Tensor> const& maybe_g_scales, // scale_KxN matrix
|
||||
c10::optional<torch::Tensor> const& maybe_g_zeros, // scale_KxN matrix
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_ch_scales, // len N vector
|
||||
c10::optional<torch::Tensor> const& maybe_tok_scales) // len M vector
|
||||
{
|
||||
static_assert(!with_group_zeropoints || with_group_scales);
|
||||
|
||||
|
@ -13,23 +13,23 @@ struct MMArgs {
|
||||
torch::Tensor const& A;
|
||||
torch::Tensor const& B;
|
||||
vllm::ScalarType const& b_type;
|
||||
std::optional<at::ScalarType> const& maybe_out_type;
|
||||
std::optional<torch::Tensor> const& maybe_group_scales;
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
std::optional<int64_t> maybe_group_size;
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
std::optional<torch::Tensor> const& maybe_token_scales;
|
||||
std::optional<std::string> maybe_schedule;
|
||||
c10::optional<at::ScalarType> const& maybe_out_type;
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros;
|
||||
c10::optional<int64_t> maybe_group_size;
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales;
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales;
|
||||
c10::optional<std::string> maybe_schedule;
|
||||
};
|
||||
|
||||
struct SupportedSchedulesArgs {
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
std::optional<at::ScalarType> maybe_token_scales_type;
|
||||
std::optional<at::ScalarType> maybe_out_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type;
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_out_type;
|
||||
};
|
||||
|
||||
torch::Tensor mm_dispatch(MMArgs args);
|
||||
|
@ -10,7 +10,7 @@ struct PrepackBArgs {
|
||||
torch::Tensor const& B;
|
||||
at::ScalarType a_type;
|
||||
vllm::ScalarType b_type;
|
||||
std::optional<at::ScalarType> maybe_group_scales_type;
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type;
|
||||
};
|
||||
|
||||
template <typename PrepackedLayoutB>
|
||||
|
@ -10,11 +10,11 @@ using namespace vllm;
|
||||
|
||||
std::vector<std::string> supported_schedules(
|
||||
at::ScalarType a_type, int64_t b_type_id,
|
||||
std::optional<at::ScalarType> maybe_group_scales_type,
|
||||
std::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
std::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
std::optional<at::ScalarType> maybe_token_scales_type,
|
||||
std::optional<at::ScalarType> maybe_out_type) {
|
||||
c10::optional<at::ScalarType> maybe_group_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_group_zeros_type,
|
||||
c10::optional<at::ScalarType> maybe_channel_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_token_scales_type,
|
||||
c10::optional<at::ScalarType> maybe_out_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return supported_schedules_dispatch({
|
||||
.a_type = a_type,
|
||||
@ -29,13 +29,13 @@ std::vector<std::string> supported_schedules(
|
||||
|
||||
torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
int64_t b_type_id,
|
||||
std::optional<at::ScalarType> const& maybe_out_type,
|
||||
std::optional<torch::Tensor> const& maybe_group_scales,
|
||||
std::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
std::optional<int64_t> maybe_group_size,
|
||||
std::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
std::optional<torch::Tensor> const& maybe_token_scales,
|
||||
std::optional<std::string> maybe_schedule) {
|
||||
c10::optional<at::ScalarType> const& maybe_out_type,
|
||||
c10::optional<torch::Tensor> const& maybe_group_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_group_zeros,
|
||||
c10::optional<int64_t> maybe_group_size,
|
||||
c10::optional<torch::Tensor> const& maybe_channel_scales,
|
||||
c10::optional<torch::Tensor> const& maybe_token_scales,
|
||||
c10::optional<std::string> maybe_schedule) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return mm_dispatch({.A = A,
|
||||
.B = B,
|
||||
@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
|
||||
|
||||
torch::Tensor prepack_B(
|
||||
torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
|
||||
std::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
c10::optional<at::ScalarType> const& maybe_group_scales_type) {
|
||||
ScalarType const b_type = ScalarType::from_id(b_type_id);
|
||||
return prepack_B_dispatch(
|
||||
{.B = B,
|
||||
|
@ -928,7 +928,7 @@ void paged_attention_custom_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int max_context_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
float k_scale, float v_scale) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
@ -1086,7 +1086,7 @@ void paged_attention(
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& context_lens, // [num_seqs]
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale, double v_scale) {
|
||||
const int head_size = query.size(2);
|
||||
if (kv_cache_dtype == "auto") {
|
||||
|
@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
|
||||
double scale, torch::Tensor& block_tables,
|
||||
torch::Tensor& context_lens, int64_t block_size,
|
||||
int64_t max_context_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double k_scale,
|
||||
double v_scale);
|
||||
|
@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||
if (bias) {
|
||||
|
@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& e,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
c10::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
torch::Tensor const& bt_meta,
|
||||
torch::Tensor const& a_scales,
|
||||
torch::Tensor const& b_scales,
|
||||
std::optional<torch::Tensor> const& bias) {
|
||||
c10::optional<torch::Tensor> const& bias) {
|
||||
// Checks for conformality
|
||||
TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2);
|
||||
TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&
|
||||
|
@ -21,6 +21,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);

  ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
  ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
           &get_cuda_view_from_cpu_tensor);

  // Attention ops
  // Compute the attention between an input query and the cached
  // keys/values using PagedAttention.
@ -98,6 +102,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

  ops.def(
      "copy_subranges(Tensor matrix_src, Tensor matrix_diff, Tensor! "
      "matrix_tgt, int n) -> ()");
  ops.impl("copy_subranges", torch::kCUDA, &copy_subranges);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@ -1,13 +1,13 @@
(automatic-prefix-caching)=
(apc)=

# Automatic Prefix Caching
# Introduction

## Introduction
## What is Automatic Prefix Caching

Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
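For illustration, here is a minimal sketch of turning APC on from the offline Python API. It assumes the `enable_prefix_caching` engine argument covered below; the model name is only a placeholder:

```python
from vllm import LLM, SamplingParams

# Enable APC so that requests sharing a long common prefix (e.g. a system
# prompt or a large document) reuse the cached KV blocks of that prefix.
llm = LLM(model="<your-model>", enable_prefix_caching=True)

long_document = "<a long shared document>"
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

# The second prompt shares the document prefix with the first one, so its
# prefill can skip recomputing the KV cache for the shared part.
outputs = llm.generate(
    [long_document + "\n\nQ: Summarize the document.",
     long_document + "\n\nQ: List three key points."],
    sampling_params,
)
```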

```{note}
Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
Technical details on how vLLM implements APC are in the next page.
```

## Enabling APC in vLLM
@ -1,8 +1,6 @@
(design-automatic-prefix-caching)=
# Implementation

# Automatic Prefix Caching

The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.

To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
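To make the observation concrete, here is a small conceptual sketch (not vLLM's actual hashing code) of how a block identity could be derived from the prefix tokens plus the tokens inside the block:

```python
from typing import Tuple

BLOCK_SIZE = 16  # tokens per KV block (illustrative)

def block_key(prefix_tokens: Tuple[int, ...],
              block_tokens: Tuple[int, ...]) -> int:
    """A KV block is identified by the tokens inside it plus all tokens that
    precede it, since those are exactly the tokens its keys/values depend on."""
    return hash((prefix_tokens, block_tokens))

prompt_a = tuple(range(48))                    # 3 full blocks
prompt_b = tuple(range(32)) + (999,) * 16      # shares the first 2 blocks with prompt_a

keys_a = [block_key(prompt_a[:i], prompt_a[i:i + BLOCK_SIZE])
          for i in range(0, len(prompt_a), BLOCK_SIZE)]
keys_b = [block_key(prompt_b[:i], prompt_b[i:i + BLOCK_SIZE])
          for i in range(0, len(prompt_b), BLOCK_SIZE)]

assert keys_a[:2] == keys_b[:2]   # shared prefix -> shared (cacheable) blocks
assert keys_a[2] != keys_b[2]     # blocks diverge once the tokens differ
```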
@ -1,7 +1,7 @@
|
||||
# Dockerfile
|
||||
|
||||
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
More information about deploying with Docker can be found [here](#deployment-docker).
|
||||
More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md).
|
||||
|
||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||
|
||||
|
@ -1,115 +0,0 @@
|
||||
(new-model-basic)=
|
||||
|
||||
# Basic Implementation
|
||||
|
||||
This guide walks you through the steps to implement a basic vLLM model.
|
||||
|
||||
## 1. Bring your model code
|
||||
|
||||
First, clone the PyTorch model code from the source repository.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
```{warning}
|
||||
Make sure to review and adhere to the original code's copyright and licensing terms!
|
||||
```
|
||||
|
||||
## 2. Make your code compatible with vLLM
|
||||
|
||||
To ensure compatibility with vLLM, your model must meet the following requirements:
|
||||
|
||||
### Initialization Code
|
||||
|
||||
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
|
||||
|
||||
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
|
||||
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
|
||||
|
||||
The initialization code should look like this:
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||
|
||||
class MyDecoderLayer(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
||||
|
||||
class MyModel(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList(
|
||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||
)
|
||||
|
||||
class MyModelForCausalLM(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||
```
|
||||
|
||||
### Computation Code
|
||||
|
||||
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
|
||||
|
||||
```python
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
```
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support

If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:

- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.

Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.

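As a rough illustration, an MLP block might be parallelized along the lines of the sketch below, modeled loosely on the Llama implementation. Exact constructor signatures and the quantization/`linear_method` plumbing vary between vLLM versions, so treat this as a sketch rather than a drop-in module:

```python
from torch import nn
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                               RowParallelLinear)

class MyParallelMLP(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Gate and up projections merged into one column-parallel layer;
        # each GPU holds a column shard of the merged weight.
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size, [intermediate_size] * 2, bias=False)
        # Row-parallel layer: the input is already sharded, and an
        # all-reduce combines the partial results across GPUs.
        self.down_proj = RowParallelLinear(
            intermediate_size, hidden_size, bias=False)
        self.act_fn = SiluAndMul()

    def forward(self, x):
        # vLLM's parallel linear layers return (output, optional bias).
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x
```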
## 4. Implement the weight loading logic

You now need to implement the `load_weights` method in your `*ForCausalLM` class.
This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.

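A minimal sketch of such a `load_weights` method is shown below. It assumes the common stacked-parameter pattern used by models like Llama; names such as `qkv_proj` and `gate_up_proj` are illustrative, so compare against an existing model in <gh-dir:vllm/model_executor/models> for the exact details:

```python
from typing import Iterable, Tuple

import torch

from vllm.model_executor.model_loader.weight_utils import default_weight_loader

# Method of your *ForCausalLM class.
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    # Checkpoint weights that must be folded into a single fused vLLM
    # parameter: (vllm_param_name, checkpoint_name, shard_id).
    stacked_params_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(self.named_parameters())
    for name, loaded_weight in weights:
        for param_name, ckpt_name, shard_id in stacked_params_mapping:
            if ckpt_name not in name:
                continue
            name = name.replace(ckpt_name, param_name)
            param = params_dict[name]
            # Sharded layers attach a weight_loader that knows which slice
            # of the fused parameter this checkpoint tensor should fill.
            param.weight_loader(param, loaded_weight, shard_id)
            break
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
```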
## 5. Register your model
|
||||
|
||||
See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
|
||||
|
||||
## Frequently Asked Questions

### How to support models with interleaving sliding windows?

For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation.

To support a model with interleaving sliding windows, we need to take care of the following details:

- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument (a sketch follows below). For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).

With these two steps, interleaved sliding windows should work with the model.
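For illustration only, the per-layer parsing in the second step might look roughly like the following. The every-other-layer pattern and the config attribute are hypothetical; mirror the interleaving scheme of your specific model instead:

```python
def get_sliding_window(config, layer_idx: int):
    # Hypothetical interleaving scheme: odd layers use a sliding window,
    # even layers use full attention. Adapt to your model's actual pattern.
    interleaved = getattr(config, "interleaved_sliding_window", None)
    if interleaved is None:
        return None
    return interleaved if layer_idx % 2 == 1 else None

# Inside the attention module of layer `layer_idx`, the resolved value is
# passed to vllm.attention.Attention, e.g.:
#   self.attn = Attention(
#       num_heads, head_dim, scale,
#       num_kv_heads=num_kv_heads,
#       per_layer_sliding_window=get_sliding_window(config, layer_idx),
#       prefix=f"{prefix}.attn",
#   )
```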
@ -1,26 +0,0 @@
|
||||
(new-model)=
|
||||
|
||||
# Adding a New Model
|
||||
|
||||
This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
basic
|
||||
registration
|
||||
multimodal
|
||||
```
|
||||
|
||||
```{note}
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
|
||||
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
```
|
||||
|
||||
```{tip}
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
|
||||
or ask on our [developer slack](https://slack.vllm.ai).
|
||||
We will be happy to help you out!
|
||||
```
|
@ -1,56 +0,0 @@
|
||||
(new-model-registration)=
|
||||
|
||||
# Model Registration
|
||||
|
||||
vLLM relies on a model registry to determine how to run each model.
|
||||
A list of pre-registered architectures can be found [here](#supported-models).
|
||||
|
||||
If your model is not on this list, you must register it to vLLM.
|
||||
This page provides detailed instructions on how to do so.
|
||||
|
||||
## Built-in models

To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
This gives you the ability to modify the codebase and test your model.

After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
You should also include an example HuggingFace repository for this model in <gh-file:tests/models/registry.py> to run the unit tests.
Finally, update our [list of supported models](#supported-models) to promote your model!

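For orientation, a registry entry maps the architecture name (as reported in the HuggingFace config) to the module and class implementing it. At the time of writing the shape is roughly the following, but check <gh-file:vllm/model_executor/models/registry.py> for the exact structure:

```python
# Illustrative excerpt; the real file splits the mapping across several dicts.
# "<architecture name>": ("<module under vllm/model_executor/models>", "<class name>")
_VLLM_MODELS = {
    # ... existing entries ...
    "MyModelForCausalLM": ("my_model", "MyModelForCausalLM"),
}
```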
```{important}
|
||||
The list of models in each section should be maintained in alphabetical order.
|
||||
```
|
||||
|
||||
## Out-of-tree models
|
||||
|
||||
You can load an external model using a plugin without modifying the vLLM codebase.
|
||||
|
||||
```{seealso}
|
||||
[vLLM's Plugin System](#plugin-system)
|
||||
```
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
```{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#enabling-multimodal-inputs).
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
|
||||
```
|
@ -1,43 +0,0 @@
|
||||
# Vulnerability Management
|
||||
|
||||
## Reporting Vulnerabilities
|
||||
|
||||
As mentioned in the [security
|
||||
policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security
|
||||
vulnerabilities may be reported privately to the project via
|
||||
[GitHub](https://github.com/vllm-project/vllm/security/advisories/new).
|
||||
|
||||
## Vulnerability Management Team
|
||||
|
||||
Once a vulnerability has been reported to the project, the Vulnerability
|
||||
Management Team (VMT) is responsible for managing the vulnerability. The VMT is
|
||||
responsible for:
|
||||
|
||||
- Triaging the vulnerability.
|
||||
- Coordinating with reporters and project maintainers on vulnerability analysis
|
||||
and resolution.
|
||||
- Drafting of security advisories for confirmed vulnerabilities, as appropriate.
|
||||
- Coordination with project maintainers on a coordinated release of the fix and
|
||||
security advisory.
|
||||
|
||||
### Security Advisories
|
||||
|
||||
Advisories are published via GitHub through the same system used to report
|
||||
vulnerabilities. More information on the process can be found in the [GitHub
|
||||
documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories).
|
||||
|
||||
### Team Members
|
||||
|
||||
We prefer to keep all vulnerability-related communication on the security report
|
||||
on GitHub. However, if you need to contact the VMT directly for an urgent issue,
|
||||
you may contact the following individuals:
|
||||
|
||||
- Simon Mo - simon.mo@hey.com
|
||||
- Russell Bryant - rbryant@redhat.com
|
||||
|
||||
## Slack Discussion
|
||||
|
||||
You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
|
||||
to discuss security-related topics. However, please do not disclose any
|
||||
vulnerabilities in this channel. If you need to report a vulnerability, please
|
||||
use the GitHub security advisory system or contact a VMT member privately.
|
@ -1,13 +0,0 @@
|
||||
# Using other frameworks
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
bentoml
|
||||
cerebrium
|
||||
dstack
|
||||
helm
|
||||
lws
|
||||
skypilot
|
||||
triton
|
||||
```
|
@ -1,9 +0,0 @@
|
||||
# External Integrations
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
kserve
|
||||
kubeai
|
||||
llamastack
|
||||
```
|
@ -57,7 +57,7 @@ More API details can be found in the {doc}`Offline Inference
|
||||
|
||||
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||
|
||||
### OpenAI-Compatible API Server
|
||||
### OpenAI-compatible API server
|
||||
|
||||
The second primary interface to vLLM is via its OpenAI-compatible API server.
|
||||
This server can be started using the `vllm serve` command.
|
||||
@ -77,7 +77,8 @@ python -m vllm.entrypoints.openai.api_server --model <model>
|
||||
|
||||
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||
|
||||
More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
|
||||
More details on the API server can be found in the {doc}`OpenAI Compatible
|
||||
Server </serving/openai_compatible_server>` document.
|
||||
|
||||
## LLM Engine
|
||||
|
||||
|
@ -1,5 +1,3 @@
|
||||
(design-paged-attention)=
|
||||
|
||||
# vLLM Paged Attention
|
||||
|
||||
- Currently, vLLM utilizes its own implementation of a multi-head query
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
## Debugging
|
||||
|
||||
Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
|
||||
Please see the [Debugging Tips](#debugging-python-multiprocessing)
|
||||
page for information on known issues and how to solve them.
|
||||
|
||||
## Introduction
|
||||
|
@ -1,7 +1,6 @@
|
||||
# Offline Inference
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
llm
|
||||
|
@ -1,19 +0,0 @@
|
||||
(quantization-index)=
|
||||
|
||||
# Quantization
|
||||
|
||||
Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
|
||||
|
||||
```{toctree}
|
||||
:caption: Contents
|
||||
:maxdepth: 1
|
||||
|
||||
supported_hardware
|
||||
auto_awq
|
||||
bnb
|
||||
gguf
|
||||
int8
|
||||
fp8
|
||||
fp8_e5m2_kvcache
|
||||
fp8_e4m3_kvcache
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
(installation-rocm)=
|
||||
|
||||
# Installation for ROCm
|
||||
# Installation with ROCm
|
||||
|
||||
vLLM supports AMD GPUs with ROCm 6.2.
|
||||
|
||||
@ -148,7 +148,7 @@ $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
|
||||
$ python3 setup.py develop
|
||||
```
|
||||
|
||||
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
|
||||
This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation.
|
||||
|
||||
```{tip}
|
||||
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
|
@ -2,7 +2,7 @@
|
||||
|
||||
# Installation for ARM CPUs
|
||||
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
|
||||
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
|
||||
|
||||
- CPU backend inference capabilities
|
||||
- Relevant runtime environment variables
|
@ -1,6 +1,6 @@
|
||||
(installation-x86)=
|
||||
(installation-cpu)=
|
||||
|
||||
# Installation for x86 CPUs
|
||||
# Installation with CPU
|
||||
|
||||
vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
|
||||
|
||||
@ -151,4 +151,4 @@ $ python examples/offline_inference.py
|
||||
$ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
|
||||
```
|
||||
|
||||
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
|
||||
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md).
|
@ -1,8 +1,8 @@
|
||||
(troubleshooting)=
|
||||
(debugging)=
|
||||
|
||||
# Troubleshooting
|
||||
# Debugging Tips
|
||||
|
||||
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
|
||||
```{note}
|
||||
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
|
||||
@ -47,7 +47,6 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
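For example, a quick way to reproduce a crash with CUDA graphs disabled from the Python API (the model name is a placeholder):

```python
from vllm import LLM

# enforce_eager=True disables CUDA graph capture/replay, so the failing
# CUDA operation surfaces directly in the stack trace.
llm = LLM(model="<your-model>", enforce_eager=True)
```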
|
||||
(troubleshooting-incorrect-hardware-driver)=
|
||||
## Incorrect hardware/driver
|
||||
|
||||
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
|
||||
@ -140,7 +139,7 @@ A multi-node environment is more complicated than a single-node one. If you see
|
||||
Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
|
||||
```
|
||||
|
||||
(troubleshooting-python-multiprocessing)=
|
||||
(debugging-python-multiprocessing)=
|
||||
## Python multiprocessing
|
||||
|
||||
### `RuntimeError` Exception
|
||||
@ -151,7 +150,7 @@ If you have seen a warning in your logs like this:
|
||||
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
|
||||
initialized. We must use the `spawn` multiprocessing start method. Setting
|
||||
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
|
||||
https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing
|
||||
https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
|
||||
for more information.
|
||||
```
|
||||
|
@ -1,6 +1,4 @@
|
||||
(installation-gaudi)=
|
||||
|
||||
# Installation for Intel® Gaudi®
|
||||
# Installation with Intel® Gaudi® AI Accelerators
|
||||
|
||||
This README provides instructions on running vLLM with Intel Gaudi devices.
|
||||
|
||||
@ -82,7 +80,7 @@ $ python setup.py develop
|
||||
|
||||
## Supported Features
|
||||
|
||||
- [Offline inference](#offline-inference)
|
||||
- [Offline batched inference](#offline-batched-inference)
|
||||
- Online inference via [OpenAI-Compatible Server](#openai-compatible-server)
|
||||
- HPU autodetection - no need to manually select device within vLLM
|
||||
- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
|
@ -1,8 +1,8 @@
|
||||
(installation-cuda)=
|
||||
(installation)=
|
||||
|
||||
# Installation for CUDA
|
||||
# Installation
|
||||
|
||||
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) binaries.
|
||||
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
|
||||
|
||||
## Requirements
|
||||
|
||||
@ -12,43 +12,24 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.4) bin
|
||||
|
||||
## Install released versions
|
||||
|
||||
### Create a new Python environment
|
||||
|
||||
You can create a new Python environment using `conda`:
|
||||
You can install vLLM using pip:
|
||||
|
||||
```console
|
||||
$ # (Recommended) Create a new conda environment.
|
||||
$ conda create -n myenv python=3.12 -y
|
||||
$ conda activate myenv
|
||||
|
||||
$ # Install vLLM with CUDA 12.1.
|
||||
$ pip install vllm
|
||||
```
|
||||
|
||||
```{note}
|
||||
[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
|
||||
Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
|
||||
```
|
||||
|
||||
Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
|
||||
|
||||
```console
|
||||
$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
|
||||
$ uv venv myenv --python 3.12 --seed
|
||||
$ source myenv/bin/activate
|
||||
```
|
||||
|
||||
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
|
||||
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
|
||||
|
||||
### Install vLLM
|
||||
|
||||
You can install vLLM using either `pip` or `uv pip`:
|
||||
|
||||
```console
|
||||
$ # Install vLLM with CUDA 12.4.
|
||||
$ pip install vllm # If you are using pip.
|
||||
$ uv pip install vllm # If you are using uv.
|
||||
```
|
||||
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
|
||||
````{note}
|
||||
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
|
||||
We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions:
|
||||
|
||||
```console
|
||||
$ # Install vLLM with CUDA 11.8.
|
||||
@ -57,47 +38,29 @@ $ export PYTHON_VERSION=310
|
||||
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
|
||||
|
||||
Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
|
||||
````
|
||||
|
||||
(install-the-latest-code)=
|
||||
|
||||
## Install the latest code
|
||||
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
|
||||
### Install the latest code using `pip`
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command:
|
||||
|
||||
```console
|
||||
$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
|
||||
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
```
|
||||
|
||||
`--pre` is required for `pip` to consider pre-released versions.
|
||||
|
||||
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
|
||||
If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
|
||||
|
||||
```console
|
||||
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
|
||||
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
```
|
||||
|
||||
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
|
||||
|
||||
### Install the latest code using `uv`
|
||||
|
||||
Another way to install the latest code is to use `uv`:
|
||||
|
||||
```console
|
||||
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly
|
||||
```
|
||||
|
||||
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
|
||||
|
||||
```console
|
||||
$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
|
||||
$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
|
||||
```
|
||||
|
||||
The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
|
||||
|
||||
### Install the latest code using `docker`
|
||||
Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
|
||||
|
||||
Another way to access the latest code is to use the docker images:
|
||||
|
||||
@ -126,7 +89,7 @@ $ cd vllm
|
||||
$ VLLM_USE_PRECOMPILED=1 pip install --editable .
|
||||
```
|
||||
|
||||
This will download the latest nightly wheel from https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation.
|
||||
This will download the latest nightly wheel and use the compiled libraries from there in the install.
|
||||
|
||||
The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files):
|
||||
|
@ -1,19 +0,0 @@
|
||||
(installation-index)=
|
||||
|
||||
# Installation
|
||||
|
||||
vLLM supports the following hardware platforms:
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
gpu-cuda
|
||||
gpu-rocm
|
||||
cpu-x86
|
||||
cpu-arm
|
||||
hpu-gaudi
|
||||
tpu
|
||||
xpu
|
||||
openvino
|
||||
neuron
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
(installation-neuron)=
|
||||
|
||||
# Installation for Neuron
|
||||
# Installation with Neuron
|
||||
|
||||
vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
|
||||
Paged Attention and Chunked Prefill are currently in development and will be available soon.
|
@ -1,8 +1,8 @@
|
||||
(installation-openvino)=
|
||||
|
||||
# Installation for OpenVINO
|
||||
# Installation with OpenVINO
|
||||
|
||||
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
|
||||
- Prefix caching (`--enable-prefix-caching`)
|
||||
- Chunked prefill (`--enable-chunked-prefill`)
|
@ -2,20 +2,20 @@
|
||||
|
||||
# Quickstart
|
||||
|
||||
This guide will help you quickly get started with vLLM to perform:
|
||||
This guide will help you quickly get started with vLLM to:
|
||||
|
||||
- [Offline batched inference](#quickstart-offline)
|
||||
- [Online inference using OpenAI-compatible server](#quickstart-online)
|
||||
- [Run offline batched inference](#offline-batched-inference)
|
||||
- [Run OpenAI-compatible inference](#openai-compatible-server)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- OS: Linux
|
||||
- Python: 3.9 -- 3.12
|
||||
- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
|
||||
|
||||
## Installation
|
||||
|
||||
If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly.
|
||||
It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
|
||||
You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
|
||||
|
||||
```console
|
||||
$ conda create -n myenv python=3.10 -y
|
||||
@ -23,11 +23,9 @@ $ conda activate myenv
|
||||
$ pip install vllm
|
||||
```
|
||||
|
||||
```{note}
|
||||
For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
|
||||
```
|
||||
Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM.
|
||||
|
||||
(quickstart-offline)=
|
||||
(offline-batched-inference)=
|
||||
|
||||
## Offline Batched Inference
|
||||
|
||||
@ -75,7 +73,7 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
(quickstart-online)=
|
||||
(openai-compatible-server)=
|
||||
|
||||
## OpenAI-Compatible Server
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
(installation-tpu)=
|
||||
|
||||
# Installation for TPUs
|
||||
# Installation with TPU
|
||||
|
||||
Tensor Processing Units (TPUs) are Google's custom-developed application-specific
|
||||
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
|
@ -1,6 +1,6 @@
|
||||
(installation-xpu)=
|
||||
|
||||
# Installation for XPUs
|
||||
# Installation with XPU
|
||||
|
||||
vLLM initially supports basic model inferencing and serving on Intel GPU platform.
|
||||
|
@ -50,7 +50,7 @@ For more information, check out the following:
|
||||
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
|
||||
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
|
||||
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
|
||||
- [vLLM Meetups](#meetups)
|
||||
- {ref}`vLLM Meetups <meetups>`.
|
||||
|
||||
## Documentation
|
||||
|
||||
@ -58,68 +58,91 @@ For more information, check out the following:
|
||||
:caption: Getting Started
|
||||
:maxdepth: 1
|
||||
|
||||
getting_started/installation/index
|
||||
getting_started/installation
|
||||
getting_started/amd-installation
|
||||
getting_started/openvino-installation
|
||||
getting_started/cpu-installation
|
||||
getting_started/gaudi-installation
|
||||
getting_started/arm-installation
|
||||
getting_started/neuron-installation
|
||||
getting_started/tpu-installation
|
||||
getting_started/xpu-installation
|
||||
getting_started/quickstart
|
||||
getting_started/debugging
|
||||
getting_started/examples/examples_index
|
||||
getting_started/troubleshooting
|
||||
getting_started/faq
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Serving
|
||||
:maxdepth: 1
|
||||
|
||||
serving/openai_compatible_server
|
||||
serving/deploying_with_docker
|
||||
serving/deploying_with_k8s
|
||||
serving/deploying_with_helm
|
||||
serving/deploying_with_nginx
|
||||
serving/distributed_serving
|
||||
serving/metrics
|
||||
serving/integrations
|
||||
serving/tensorizer
|
||||
serving/runai_model_streamer
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Models
|
||||
:maxdepth: 1
|
||||
|
||||
models/supported_models
|
||||
models/generative_models
|
||||
models/pooling_models
|
||||
models/supported_models
|
||||
models/extensions/index
|
||||
models/adding_model
|
||||
models/enabling_multimodal_inputs
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Features
|
||||
:caption: Usage
|
||||
:maxdepth: 1
|
||||
|
||||
features/quantization/index
|
||||
features/lora
|
||||
features/tool_calling
|
||||
features/structured_outputs
|
||||
features/automatic_prefix_caching
|
||||
features/disagg_prefill
|
||||
features/spec_decode
|
||||
features/compatibility_matrix
|
||||
usage/lora
|
||||
usage/multimodal_inputs
|
||||
usage/tool_calling
|
||||
usage/structured_outputs
|
||||
usage/spec_decode
|
||||
usage/compatibility_matrix
|
||||
usage/performance
|
||||
usage/faq
|
||||
usage/engine_args
|
||||
usage/env_vars
|
||||
usage/usage_stats
|
||||
usage/disagg_prefill
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Inference and Serving
|
||||
:caption: Quantization
|
||||
:maxdepth: 1
|
||||
|
||||
serving/offline_inference
|
||||
serving/openai_compatible_server
|
||||
serving/multimodal_inputs
|
||||
serving/distributed_serving
|
||||
serving/metrics
|
||||
serving/engine_args
|
||||
serving/env_vars
|
||||
serving/usage_stats
|
||||
serving/integrations/index
|
||||
quantization/supported_hardware
|
||||
quantization/auto_awq
|
||||
quantization/bnb
|
||||
quantization/gguf
|
||||
quantization/int8
|
||||
quantization/fp8
|
||||
quantization/fp8_e5m2_kvcache
|
||||
quantization/fp8_e4m3_kvcache
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Deployment
|
||||
:caption: Automatic Prefix Caching
|
||||
:maxdepth: 1
|
||||
|
||||
deployment/docker
|
||||
deployment/k8s
|
||||
deployment/nginx
|
||||
deployment/frameworks/index
|
||||
deployment/integrations/index
|
||||
automatic_prefix_caching/apc
|
||||
automatic_prefix_caching/details
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
:caption: Performance
|
||||
:maxdepth: 1
|
||||
|
||||
performance/optimization
|
||||
performance/benchmarks
|
||||
```
|
||||
|
||||
@ -133,8 +156,10 @@ community/meetups
|
||||
community/sponsors
|
||||
```
|
||||
|
||||
% API Documentation: API reference aimed at vllm library usage
|
||||
|
||||
```{toctree}
|
||||
:caption: API Reference
|
||||
:caption: API Documentation
|
||||
:maxdepth: 2
|
||||
|
||||
dev/sampling_params
|
||||
@ -143,33 +168,30 @@ dev/offline_inference/offline_index
|
||||
dev/engine/engine_index
|
||||
```
|
||||
|
||||
% Design Documents: Details about vLLM internals
|
||||
% Design: docs about vLLM internals
|
||||
|
||||
```{toctree}
|
||||
:caption: Design Documents
|
||||
:caption: Design
|
||||
:maxdepth: 2
|
||||
|
||||
design/arch_overview
|
||||
design/huggingface_integration
|
||||
design/plugin_system
|
||||
design/kernel/paged_attention
|
||||
design/input_processing/model_inputs_index
|
||||
design/kernel/paged_attention
|
||||
design/multimodal/multimodal_index
|
||||
design/automatic_prefix_caching
|
||||
design/multiprocessing
|
||||
```
|
||||
|
||||
% Developer Guide: How to contribute to the vLLM project
|
||||
% For Developers: contributing to the vLLM project
|
||||
|
||||
```{toctree}
|
||||
:caption: Developer Guide
|
||||
:caption: For Developers
|
||||
:maxdepth: 2
|
||||
|
||||
contributing/overview
|
||||
contributing/profiling/profiling_index
|
||||
contributing/dockerfile/dockerfile
|
||||
contributing/model/index
|
||||
contributing/vulnerability_management
|
||||
```
|
||||
|
||||
# Indices and tables
|
||||
|
155
docs/source/models/adding_model.md
Normal file
155
docs/source/models/adding_model.md
Normal file
@ -0,0 +1,155 @@
|
||||
(adding-a-new-model)=
|
||||
|
||||
# Adding a New Model
|
||||
|
||||
This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM.
|
||||
|
||||
```{note}
|
||||
The complexity of adding a new model depends heavily on the model's architecture.
|
||||
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
|
||||
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
|
||||
```
|
||||
|
||||
```{note}
|
||||
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
|
||||
please follow [this guide](#enabling-multimodal-inputs) after implementing the model here.
|
||||
```
|
||||
|
||||
```{tip}
|
||||
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository.
|
||||
We will be happy to help you out!
|
||||
```
|
||||
|
||||
## 0. Fork the vLLM repository
|
||||
|
||||
Start by forking our [GitHub] repository and then [build it from source](#build-from-source).
|
||||
This gives you the ability to modify the codebase and test your model.
|
||||
|
||||
```{tip}
|
||||
If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.
|
||||
```
|
||||
|
||||
## 1. Bring your model code
|
||||
|
||||
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
```{warning}
|
||||
When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
|
||||
```
|
||||
|
||||
## 2. Make your code compatible with vLLM
|
||||
|
||||
To ensure compatibility with vLLM, your model must meet the following requirements:
|
||||
|
||||
### Initialization Code
|
||||
|
||||
All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for:
|
||||
|
||||
- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts.
|
||||
- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode.
|
||||
|
||||
The initialization code should look like this:
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||
|
||||
class MyDecoderLayer(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")
|
||||
|
||||
class MyModel(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList(
|
||||
[MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
|
||||
)
|
||||
|
||||
class MyModelForCausalLM(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
|
||||
```
|
||||
|
||||
### Computation Code
|
||||
|
||||
Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
|
||||
|
||||
```python
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
```
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||
|
||||
If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
|
||||
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
|
||||
For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`.
|
||||
When it comes to the linear layers, we provide the following options to parallelize them:
|
||||
|
||||
- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
|
||||
- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
|
||||
- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
|
||||
- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
|
||||
- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
|
||||
|
||||
Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
|
||||
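For illustration, below is a minimal sketch of what this step could look like for the {code}`MyAttention` module from the earlier example. It assumes the parallel layers live under {code}`vllm.model_executor.layers`; constructor arguments (and the quantization / {code}`linear_method` plumbing) differ between vLLM versions, so treat the details as illustrative rather than authoritative.

```python
from torch import nn
from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)


class MyParallelAttention(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, num_kv_heads: int,
                 head_size: int, prefix: str = ""):
        super().__init__()
        # Fused column-parallel projection for Q, K, and V; the key/value
        # heads are replicated when there are fewer of them than the
        # tensor-parallel world size.
        self.qkv_proj = QKVParallelLinear(hidden_size, head_size,
                                          num_heads, num_kv_heads,
                                          bias=False)
        # Row-parallel output projection; an all-reduce combines the
        # partial results after the matrix multiplication.
        self.o_proj = RowParallelLinear(num_heads * head_size, hidden_size,
                                        bias=False)


class MyParallelModel(nn.Module):
    def __init__(self, vocab_size: int, hidden_size: int):
        super().__init__()
        # Tensor-parallel replacements for the embedding and the LM head.
        self.embed_tokens = VocabParallelEmbedding(vocab_size, hidden_size)
        self.lm_head = ParallelLMHead(vocab_size, hidden_size)
```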
|
||||
## 4. Implement the weight loading logic
|
||||
|
||||
You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class.
|
||||
This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
|
||||
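A minimal sketch of this step is shown below. It assumes the common vLLM pattern in which parameters of parallel layers carry a {code}`weight_loader` attribute and that {code}`default_weight_loader` is importable from {code}`vllm.model_executor.model_loader.weight_utils`; both details may vary across versions, and the extra name remapping and shard ids needed for merged layers are omitted for brevity.

```python
from typing import Iterable, Tuple

import torch
from torch import nn
from vllm.model_executor.model_loader.weight_utils import default_weight_loader


class MyModelForCausalLM(nn.Module):
    ...

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            # Skip checkpoint entries that have no counterpart in this
            # implementation (e.g. rotary embedding caches).
            if name not in params_dict:
                continue
            param = params_dict[name]
            # Parallel layers attach a weight_loader that shards or merges
            # the checkpoint tensor; fall back to a plain copy otherwise.
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
```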
|
||||
## 5. Register your model
|
||||
|
||||
Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py>.
|
||||
|
||||
## 6. Out-of-Tree Model Integration
|
||||
|
||||
You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see [plugin-system](#plugin-system).
|
||||
|
||||
To register the model, use the following code:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
from your_code import YourModelForCausalLM
|
||||
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
|
||||
```
|
||||
|
||||
If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
|
||||
|
||||
```python
|
||||
from vllm import ModelRegistry
|
||||
|
||||
ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
|
||||
```
|
||||
|
||||
```{important}
|
||||
If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
Read more about that [here](#enabling-multimodal-inputs).
|
||||
```
|
||||
|
||||
```{note}
|
||||
Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
|
||||
```
|
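For completeness, a plugin package for the out-of-tree path might be wired up as follows. This is a hypothetical layout: the package and function names are placeholders, and the {code}`vllm.general_plugins` entry-point group is assumed to match what the plugin system documentation describes.

```python
# setup.py of a hypothetical out-of-tree model package.
from setuptools import setup

setup(
    name="vllm_my_model",            # placeholder package name
    packages=["vllm_my_model"],
    entry_points={
        # Assumed entry-point group scanned by vLLM's plugin loader.
        "vllm.general_plugins": [
            "register_my_model = vllm_my_model:register",
        ],
    },
)
```

Here, {code}`vllm_my_model/__init__.py` would define a {code}`register()` function containing the {code}`ModelRegistry.register_model(...)` call shown above.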
@ -2,11 +2,15 @@
|
||||
|
||||
# Enabling Multimodal Inputs
|
||||
|
||||
This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs).
|
||||
|
||||
```{seealso}
|
||||
[Adding a New Model](adding-a-new-model)
|
||||
```
|
||||
|
||||
## 1. Update the base vLLM model
|
||||
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
|
||||
It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model).
|
||||
Further update the model as follows:
|
||||
|
||||
- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
@ -1,8 +0,0 @@
|
||||
# Built-in Extensions
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
runai_model_streamer
|
||||
tensorizer
|
||||
```
|
@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
|
||||
|
||||
## Online Inference
|
||||
|
||||
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
|
||||
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
|
||||
- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.
|
||||
|
@ -106,7 +106,7 @@ A code example can be found here: <gh-file:examples/offline_inference_scoring.py
|
||||
|
||||
## Online Inference
|
||||
|
||||
Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
|
||||
Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
|
||||
|
||||
- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
|
||||
|
@ -1,9 +1,9 @@
|
||||
(supported-models)=
|
||||
|
||||
# List of Supported Models
|
||||
# Supported Models
|
||||
|
||||
vLLM supports generative and pooling models across various tasks.
|
||||
If a model supports more than one task, you can set the task via the `--task` argument.
|
||||
If a model supports more than one task, you can set the task via the {code}`--task` argument.
|
||||
|
||||
For each task, we list the model architectures that have been implemented in vLLM.
|
||||
Alongside each architecture, we include some popular models that use it.
|
||||
@ -14,8 +14,8 @@ Alongside each architecture, we include some popular models that use it.
|
||||
|
||||
By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
|
||||
|
||||
To determine whether a given model is supported, you can check the `config.json` file inside the HF repository.
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory.
|
||||
To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository.
|
||||
If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory.
|
||||
|
||||
````{tip}
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
@ -37,7 +37,7 @@ print(output)
|
||||
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
|
||||
````
|
||||
|
||||
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
|
||||
Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM.
|
||||
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
|
||||
|
||||
### ModelScope
|
||||
@ -48,7 +48,7 @@ To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFac
|
||||
$ export VLLM_USE_MODELSCOPE=True
|
||||
```
|
||||
|
||||
And use with `trust_remote_code=True`.
|
||||
And use with {code}`trust_remote_code=True`.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -420,19 +420,16 @@ you should explicitly specify the task type to ensure that the model is used in
|
||||
```
|
||||
|
||||
```{note}
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
```
|
||||
|
||||
```{note}
|
||||
Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
|
||||
You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
|
||||
Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
|
||||
You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
|
||||
|
||||
On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
|
||||
On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
|
||||
despite being described otherwise on its model card.
|
||||
|
||||
Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
|
||||
loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
|
||||
```
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
@ -471,8 +468,8 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
|
||||
|
||||
```{important}
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
```
|
||||
|
||||
#### Classification (`--task classify`)
|
||||
@ -540,13 +537,13 @@ The following modalities are supported depending on the model:
|
||||
- **V**ideo
|
||||
- **A**udio
|
||||
|
||||
Any combination of modalities joined by `+` are supported.
|
||||
Any combination of modalities joined by {code}`+` is supported.
|
||||
|
||||
- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
|
||||
- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs.
|
||||
|
||||
On the other hand, modalities separated by `/` are mutually exclusive.
|
||||
On the other hand, modalities separated by {code}`/` are mutually exclusive.
|
||||
|
||||
- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
|
||||
- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
|
||||
|
||||
See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
|
||||
|
||||
@ -643,14 +640,14 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
- `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
|
||||
-
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
-
|
||||
* - `LlavaOnevisionForConditionalGeneration`
|
||||
- LLaVA-Onevision
|
||||
- T + I<sup>+</sup> + V<sup>+</sup>
|
||||
- `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
|
||||
-
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
-
|
||||
* - `MiniCPMV`
|
||||
- MiniCPM-V
|
||||
- T + I<sup>E+</sup>
|
||||
@ -713,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
- `Qwen/Qwen2-Audio-7B-Instruct`
|
||||
-
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
-
|
||||
* - `Qwen2VLForConditionalGeneration`
|
||||
- Qwen2-VL
|
||||
- T + I<sup>E+</sup> + V<sup>E+</sup>
|
||||
@ -727,15 +724,15 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
- `fixie-ai/ultravox-v0_3`
|
||||
-
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
-
|
||||
```
|
||||
|
||||
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
|
||||
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
||||
|
||||
````{important}
|
||||
To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference)
|
||||
or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
|
||||
To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference)
|
||||
or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt:
|
||||
|
||||
```python
|
||||
llm = LLM(
|
||||
@ -754,11 +751,11 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal
|
||||
```
|
||||
|
||||
```{note}
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
```
|
||||
|
||||
```{note}
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||
```
|
||||
|
||||
@ -773,7 +770,7 @@ you should explicitly specify the task type to ensure that the model is used in
|
||||
|
||||
#### Text Embedding (`--task embed`)
|
||||
|
||||
Any text generation model can be converted into an embedding model by passing `--task embed`.
|
||||
Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
|
||||
|
||||
```{note}
|
||||
To get the best results, you should use pooling models that are specifically trained as such.
|
||||
@ -821,7 +818,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
|
||||
2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
|
||||
|
||||
```{tip}
|
||||
When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
|
||||
When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
|
||||
```
|
||||
|
||||
3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
|
||||
|
@ -37,10 +37,3 @@ model_id = "huggyllama/llama-7b"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
```
|
||||
## OpenAI Compatible Server
|
||||
|
||||
Append the following to your 4-bit model arguments:
|
||||
|
||||
```
|
||||
--quantization bitsandbytes --load-format bitsandbytes
|
||||
```
|
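For example, combining these flags with the model from the offline snippet above (a hypothetical invocation):

```console
$ vllm serve huggyllama/llama-7b \
$ --quantization bitsandbytes \
$ --load-format bitsandbytes
```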
@ -1,6 +1,6 @@
|
||||
(quantization-supported-hardware)=
|
||||
(supported-hardware-for-quantization)=
|
||||
|
||||
# Supported Hardware
|
||||
# Supported Hardware for Quantization Kernels
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations
|
||||
- ✗
|
||||
```
|
||||
|
||||
## Notes:
|
||||
|
||||
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
|
||||
- "✅︎" indicates that the quantization method is supported on the specified hardware.
|
||||
- "✗" indicates that the quantization method is not supported on the specified hardware.
|
||||
|
||||
```{note}
|
||||
This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
|
||||
|
||||
For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
(deployment-bentoml)=
|
||||
(deploying-with-bentoml)=
|
||||
|
||||
# BentoML
|
||||
# Deploying with BentoML
|
||||
|
||||
[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deployment-cerebrium)=
|
||||
(deploying-with-cerebrium)=
|
||||
|
||||
# Cerebrium
|
||||
# Deploying with Cerebrium
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
@ -1,6 +1,6 @@
|
||||
(deployment-docker)=
|
||||
(deploying-with-docker)=
|
||||
|
||||
# Using Docker
|
||||
# Deploying with Docker
|
||||
|
||||
## Use vLLM's Official Docker Image
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deployment-dstack)=
|
||||
(deploying-with-dstack)=
|
||||
|
||||
# dstack
|
||||
# Deploying with dstack
|
||||
|
||||
```{raw} html
|
||||
<p align="center">
|
@ -1,6 +1,6 @@
|
||||
(deployment-helm)=
|
||||
(deploying-with-helm)=
|
||||
|
||||
# Helm
|
||||
# Deploying with Helm
|
||||
|
||||
A Helm chart to deploy vLLM for Kubernetes
|
||||
|
||||
@ -38,7 +38,7 @@ chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Architecture
|
||||
|
||||
```{image} /assets/deployment/architecture_helm_deployment.png
|
||||
```{image} architecture_helm_deployment.png
|
||||
```
|
||||
|
||||
## Values
|
@ -1,6 +1,6 @@
|
||||
(deployment-k8s)=
|
||||
(deploying-with-k8s)=
|
||||
|
||||
# Using Kubernetes
|
||||
# Deploying with Kubernetes
|
||||
|
||||
Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing.
|
||||
|
||||
@ -43,7 +43,7 @@ metadata:
|
||||
name: hf-token-secret
|
||||
namespace: default
|
||||
type: Opaque
|
||||
stringData:
|
||||
data:
|
||||
token: "REPLACE_WITH_TOKEN"
|
||||
```
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deployment-kserve)=
|
||||
(deploying-with-kserve)=
|
||||
|
||||
# KServe
|
||||
# Deploying with KServe
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deployment-kubeai)=
|
||||
(deploying-with-kubeai)=
|
||||
|
||||
# KubeAI
|
||||
# Deploying with KubeAI
|
||||
|
||||
[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
|
||||
|
@ -1,6 +1,6 @@
|
||||
(deployment-lws)=
|
||||
(deploying-with-lws)=
|
||||
|
||||
# LWS
|
||||
# Deploying with LWS
|
||||
|
||||
LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
|
||||
A major use case is for multi-host/multi-node distributed inference.
|
@ -1,6 +1,6 @@
|
||||
(nginxloadbalancer)=
|
||||
|
||||
# Using Nginx
|
||||
# Deploying with Nginx Loadbalancer
|
||||
|
||||
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
|
||||
|
@ -1,5 +1,5 @@
|
||||
(deployment-triton)=
|
||||
(deploying-with-triton)=
|
||||
|
||||
# NVIDIA Triton
|
||||
# Deploying with NVIDIA Triton
|
||||
|
||||
The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
|
@ -18,13 +18,13 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
|
||||
There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
|
||||
```
|
||||
|
||||
## Running vLLM on a single node
|
||||
## Details for Distributed Inference and Serving
|
||||
|
||||
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or Python's native multiprocessing. Multiprocessing can be used when deploying on a single node; multi-node inferencing currently requires Ray.
|
||||
|
||||
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
|
||||
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`; otherwise, Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or the {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. Ray does not need to be installed for the multiprocessing case.
|
||||
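For example, the backend can be pinned explicitly when constructing the {code}`LLM` (a sketch reusing the model from the snippet below):

```python
from vllm import LLM

# Force the multiprocessing backend instead of letting vLLM choose;
# pass "ray" instead of "mp" to require Ray.
llm = LLM("facebook/opt-13b",
          tensor_parallel_size=2,
          distributed_executor_backend="mp")
```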
|
||||
To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
|
||||
To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -32,14 +32,14 @@ llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
|
||||
output = llm.generate("San Francisco is a")
|
||||
```
|
||||
|
||||
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
|
||||
To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
|
||||
|
||||
```console
|
||||
$ vllm serve facebook/opt-13b \
|
||||
$ --tensor-parallel-size 4
|
||||
```
|
||||
|
||||
You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
|
||||
You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
|
||||
|
||||
```console
|
||||
$ vllm serve gpt2 \
|
||||
@ -47,7 +47,7 @@ $ --tensor-parallel-size 4 \
|
||||
$ --pipeline-parallel-size 2
|
||||
```
|
||||
|
||||
## Running vLLM on multiple nodes
|
||||
## Multi-Node Inference and Serving
|
||||
|
||||
If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use Docker images to ensure the same environment, and to hide the heterogeneity of the host machines by mapping them into the same Docker configuration.
|
||||
|
||||
@ -95,7 +95,7 @@ $ --tensor-parallel-size 16
|
||||
To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
|
||||
|
||||
```{warning}
|
||||
After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
|
||||
After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
|
||||
```
|
||||
|
||||
```{warning}
|
||||
|
docs/source/serving/integrations.md (new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
# Integrations
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
run_on_sky
|
||||
deploying_with_kserve
|
||||
deploying_with_kubeai
|
||||
deploying_with_triton
|
||||
deploying_with_bentoml
|
||||
deploying_with_cerebrium
|
||||
deploying_with_lws
|
||||
deploying_with_dstack
|
||||
serving_with_langchain
|
||||
serving_with_llamaindex
|
||||
serving_with_llamastack
|
||||
```
|
@ -1,8 +0,0 @@
|
||||
# External Integrations
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
langchain
|
||||
llamaindex
|
||||
```
|
@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
|
||||
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
|
||||
OpenAI compatible API server.
|
||||
|
||||
You can start the server using Python, or using [Docker](#deployment-docker):
|
||||
You can start the server using Python, or using [Docker](deploying_with_docker.md):
|
||||
|
||||
```console
|
||||
$ vllm serve unsloth/Llama-3.2-1B-Instruct
|
||||
|
@ -1,79 +0,0 @@
|
||||
(offline-inference)=
|
||||
|
||||
# Offline Inference
|
||||
|
||||
You can run vLLM in your own code on a list of prompts.
|
||||
|
||||
The offline API is based on the {class}`~vllm.LLM` class.
|
||||
To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
|
||||
|
||||
For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
|
||||
and runs it in vLLM using the default configuration.
|
||||
|
||||
```python
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
```
|
||||
|
||||
After initializing the `LLM` instance, you can perform model inference using various APIs.
|
||||
The available APIs depend on the type of model that is being run:
|
||||
|
||||
- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text.
|
||||
- [Pooling models](#pooling-models) output their hidden states directly.
|
||||
|
||||
Please refer to the above pages for more details about each API.
|
||||
|
||||
```{seealso}
|
||||
[API Reference](/dev/offline_inference/offline_index)
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
This section lists the most common options for running the vLLM engine.
|
||||
For a full list, refer to the [Engine Arguments](#engine-args) page.
|
||||
|
||||
### Reducing memory usage
|
||||
|
||||
Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
|
||||
|
||||
#### Tensor Parallelism (TP)
|
||||
|
||||
Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
|
||||
|
||||
The following code splits the model across 2 GPUs.
|
||||
|
||||
```python
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
tensor_parallel_size=2)
|
||||
```
|
||||
|
||||
```{important}
|
||||
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
|
||||
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
|
||||
|
||||
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
```
|
||||
|
||||
#### Quantization
|
||||
|
||||
Quantized models take less memory at the cost of lower precision.
|
||||
|
||||
Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Neural Magic](https://huggingface.co/neuralmagic))
|
||||
and used directly without extra configuration.
|
||||
|
||||
Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details.
|
||||
|
||||
#### Context length and batch size
|
||||
|
||||
You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
|
||||
and the maximum batch size (`max_num_seqs` option).
|
||||
|
||||
```python
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2)
|
||||
```
|
||||
|
||||
### Performance optimization and tuning
|
||||
|
||||
You can potentially improve the performance of vLLM by finetuning various options.
|
||||
Please refer to [this guide](#optimization-and-tuning) for more details.
|
@ -1,10 +1,8 @@
|
||||
(openai-compatible-server)=
|
||||
# OpenAI Compatible Server
|
||||
|
||||
# OpenAI-Compatible Server
|
||||
vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more!
|
||||
|
||||
vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more!
|
||||
|
||||
You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker):
|
||||
You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md):
|
||||
```bash
|
||||
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
|
||||
```
|
||||
@ -219,7 +217,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
|
||||
We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
|
||||
[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
|
||||
see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
|
||||
see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information.
|
||||
- *Note: `image_url.detail` parameter is not supported.*
|
||||
|
||||
Code example: <gh-file:examples/openai_chat_completion_client.py>
|
||||
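As a minimal illustration (assuming the server was started with the `--api-key token-abc123` example above and listens on the default port 8000), a text-only chat request with the official client might look like:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # assumes the default vLLM port
    api_key="token-abc123",
)
completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```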
|
Some files were not shown because too many files have changed in this diff.