Compare commits


1 Commit

Commit 728c365e4d: Use uv to install python in Dockerfile (2025-10-02 11:05:47 -04:00)
Signed-off-by: mgoin <mgoin64@gmail.com>
200 changed files with 1809 additions and 3951 deletions
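Most of the hunks below toggle between the two equivalent ways of launching vLLM's OpenAI-compatible server: the `vllm serve` CLI and the module entrypoint `python3 -m vllm.entrypoints.openai.api_server`. A minimal side-by-side sketch of the two forms; the model name and flag values are illustrative and not taken from any single hunk:

```bash
# CLI form: the model is a positional argument
vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000 \
    -tp 1

# Module-entrypoint form: the model is passed with --model
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --port 8000 \
    -tp 1
```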

View File

@ -181,14 +181,18 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
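This branch keys off two `jq` calls. A small self-contained illustration of their behavior; the JSON document here is made up for the example:

```bash
common_params='{"fp8": true, "neuralmagic_quantized_model": "nm/some-fp8-model"}'

# -e sets jq's exit status from the result, so the test works directly in `if`.
echo "$common_params" | jq -e 'has("fp8")' >/dev/null && echo "fp8 requested"

# -r prints the raw string value without surrounding quotes.
echo "$common_params" | jq -r '.neuralmagic_quantized_model'   # nm/some-fp8-model
```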

View File

@ -365,7 +365,8 @@ run_serving_tests() {
continue
fi
server_command="$server_envs vllm serve \
server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server

View File

@ -1,191 +0,0 @@
#!/bin/bash
# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi
# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
# The trap registered above will be overwritten later. By this point its cleanup-on-failure duty
# has been fulfilled, so the temporary directory is removed manually here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}
# Loads the test run configuration file (fetched above) into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}
# Get the test run configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install pytest and modelscope first so that this docker build cache layer stays valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope
WORKDIR /workspace/vllm
# Install vLLM dependencies in advance so that this docker cache layer stays valid as long as common.txt is unchanged.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt
COPY . .
# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton
# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
# Install vllm-ascend dependencies in advance so that this docker cache layer stays valid as long as requirements.txt is unchanged.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True
WORKDIR /workspace/vllm-ascend
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output the device args so the caller can capture them.
printf '%s' "$devices"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'
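As a reference for the deleted script above, here is a sketch of what its agent-name parsing yields for the example name given in its own comments (`atlas-a2-001-1-2cards`); everything below is derived from the logic shown in the script, not from additional sources:

```bash
# BUILDKITE_AGENT_NAME format: {hostname}-{agent_idx}-{npu_card_num}cards
BUILDKITE_AGENT_NAME="atlas-a2-001-1-2cards"

# agent_idx is the second-to-last dash-separated field -> "1"
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "${agent_idx}"   # 1

# parse_and_gen_devices extracts index=1 and cards_num=2, then maps them to
# device indices (index-1)*cards_num + i for i in 0..cards_num-1, i.e. 0 and 1:
#   --device /dev/davinci0 --device /dev/davinci1
```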

View File

@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
vllm serve meta-llama/Llama-2-7b-chat-hf &
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

.github/mergify.yml (vendored, 30 changed lines)
View File

@ -2,7 +2,6 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@ -15,7 +14,6 @@ pull_request_rules:
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@ -32,7 +30,6 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@ -49,7 +46,6 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
- label != stale
- files~=^vllm/entrypoints/
actions:
label:
@ -59,7 +55,6 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
- label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@ -75,7 +70,6 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@ -89,7 +83,6 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
- label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@ -101,7 +94,6 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
- label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@ -115,7 +107,6 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
- label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@ -130,7 +121,6 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
@ -152,7 +142,6 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
- label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@ -173,7 +162,6 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@ -193,7 +181,6 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@ -209,7 +196,6 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
- label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@ -222,7 +208,6 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- label != stale
- or:
- files~=tpu.py
- files~=_tpu
@ -238,7 +223,6 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@ -253,7 +237,6 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
@ -272,9 +255,8 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- label != stale
- conflict
- -closed
- conflict
- -closed
actions:
label:
add:
@ -288,8 +270,6 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
@ -301,7 +281,6 @@ pull_request_rules:
- name: assign reviewer for modelopt changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
@ -316,8 +295,8 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
- -conflict
- -closed
actions:
label:
remove:
@ -326,7 +305,6 @@ pull_request_rules:
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*

View File

@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()

View File

@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(

View File

@ -55,7 +55,9 @@ benchmark() {
output_len=$2
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@ -63,7 +65,9 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

View File

@ -38,12 +38,16 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@ -58,14 +62,18 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
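The script also defines a `wait_for_server` helper, visible only as context in the hunk header above. A minimal sketch of such a readiness check, assuming the OpenAI-compatible server exposes the standard `/health` endpoint on the given port (the timeout value is illustrative):

```bash
wait_for_server() {
  local port=$1
  # Poll /health until the server answers or the timeout expires.
  timeout 1200 bash -c "
    until curl -sf http://localhost:${port}/health > /dev/null; do
      sleep 1
    done"
}
```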

View File

@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
num_iters: int,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
function_under_test = lambda: ops.reshape_and_cache(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
num_iters=args.iters,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, lat * 1e6]) # convert to microseconds
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 128)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=200)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)

View File

@ -9,7 +9,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
@ -63,7 +63,7 @@ def benchmark_shape(m: int,
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(A_vllm,
return w8a8_block_fp8_matmul(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -17,6 +17,8 @@
#include <algorithm>
#include <cassert>
#include <cfloat> // FLT_MIN
#include <map>
#include <vector>
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
namespace vllm {
// Used to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) {
// Padding token that should be ignored.
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int h_block_count = head_size / x; // head_size//x
const int h_block_idx = threadIdx.x;
if (h_block_idx >= num_heads * h_block_count) {
return;
}
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = h_block_idx / h_block_count;
const int h_block = h_block_idx % h_block_count;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;
const scalar_t* __restrict__ key_src =
key + token_idx * key_stride + head_idx * head_size + h_block * x;
const int64_t src_value_start =
token_idx * value_stride + head_idx * head_size + h_block * x;
cache_t* __restrict__ key_dst =
key_cache + block_idx * num_heads * h_block_count * block_size * x +
head_idx * h_block_count * block_size * x + h_block * block_size * x +
block_offset * x;
const int64_t tgt_value_start =
block_idx * num_heads * h_block_count * x * block_size +
head_idx * h_block_count * x * block_size + h_block * x * block_size +
block_offset;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
const scalar_t* __restrict__ value_src = value + src_value_start;
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
#pragma unroll
for (int i = 0; i < x; i++) {
v_op(value_dst[i * block_size], value_src[i]);
const int64_t tgt_key_idx =
block_idx * num_heads * (head_size / x) * block_size * x +
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
block_offset * x + x_offset;
const int64_t tgt_value_idx =
block_idx * num_heads * head_size * block_size +
head_idx * head_size * block_size + head_offset * block_size +
block_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_idx] = tgt_key;
value_cache[tgt_value_idx] = tgt_value;
} else {
key_cache[tgt_key_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}
// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@ -428,80 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
const int64_t dst_idx_start =
block_idx * block_stride + block_offset * entry_stride;
// For the NoPE part, each tile of 128 elements is handled by half of one warp
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
// So in total, we use 3 warps (96 threads) per block.
// Create 4 tile scales in shared memory
__shared__ float smem[20];
float* shard_abs_max = smem;
float* tile_scales = smem + 16;
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
// Cast kv_cache to 16_bit for RoPE values
scalar_t* kv_cache_16bit =
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
// The last warp handles the RoPE part
if (threadIdx.x >= 64) {
// Each thread handles two elements of RoPE
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
// Vectorized load of two 16-bit values, performed as one 32-bit load
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
// The last 64 threads handle the RoPE part
if (threadIdx.x >= kv_lora_rank) {
const int8_t pe_idx = threadIdx.x - kv_lora_rank;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
// RoPE values start after the packed 8-bit NoPE values and the
// 32-bit scales
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
// Vectorized store of two 16-bit values, performed as one 32-bit store
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
kv_cache_16bit[dst_idx] = k_pe[src_idx];
return;
}
// The first two warps handle the NoPE part
const int8_t warp_idx = threadIdx.x >> 5;
const int8_t lane_idx = threadIdx.x & 31;
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
// Determine the scale for each chunk of NoPE
const int16_t tile_idx = threadIdx.x >> 7;
const int16_t warp_idx = (threadIdx.x & 127) >> 5;
const int16_t lane_idx = threadIdx.x & 31;
// Each thread handles 8 elements of NoPE
// Load the NoPE elements for this thread into registers
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
// Vectorized load of eight 16-bit values, performed as an int4 load
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
// Load the NoPE element for this thread into registers
const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
const scalar_t src_val = kv_c[src_idx];
// Max absolute value of this thread's elements
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
// Warp-level reduction to find the max absolute value in each half-warp
// Warp-level reduction to find the max absolute value in the warp
float max_abs = fabsf(src_val);
#pragma unroll
for (int offset = 8; offset > 0; offset /= 2) {
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
for (int offset = 16; offset > 0; offset /= 2) {
#ifdef USE_ROCM
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
#else
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
#endif
}
// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
// The first lane of each warp in each tile writes the max_abs of this part
// of the tile to shared memory
if (lane_idx == 0) {
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
}
__syncthreads();
// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
// The first lane of the first warp in each tile computes the scale for the
// tile and writes it to shared memory and to kv_cache
if (warp_idx == 0 && lane_idx == 0) {
float4 shard_abs_max_vec =
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
448.f;
// Avoid division by zero in `scaled_convert`
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
kv_cache_32bit[dst_idx] = tile_scale;
kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
}
// Now all threads in the block scale and write their elements
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
__syncthreads();
uint8_t result[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
vals[i], tile_scale);
}
// Store as aligned 64-bit writes
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
*reinterpret_cast<const uint64_t*>(result);
// Now all threads in the block scale and write their element
const float scale_val = tile_scales[tile_idx];
const int64_t dst_idx = dst_idx_start + threadIdx.x;
kv_cache[dst_idx] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
src_val, scale_val);
}
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -606,10 +606,9 @@ void reshape_and_cache(
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int head_div_x = head_size / x;
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_div_x, 512));
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -742,12 +741,13 @@ void concat_and_cache_mla(
if (kv_cache_dtype == "fp8_ds_mla") {
dim3 grid(num_tokens);
// For the NoPE part, each tile of 128 elements is handled by half of one
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32
// threads). So in total, we use 3 warps (96 threads) per block.
dim3 block(96);
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
dim3 block(576);
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_DS_MLA);
} else {

View File

@ -254,7 +254,7 @@ void cutlass_moe_mm(
bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
if (version_num >= 100 && version_num < 110) {
if (version_num >= 100) {
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);
@ -262,7 +262,7 @@ void cutlass_moe_mm(
}
#endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
if (version_num >= 90 && version_num < 100) {
if (version_num >= 90) {
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);

View File

@ -14,8 +14,6 @@
* limitations under the License.
*/
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
"12.8 or above.");
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
}

View File

@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias

View File

@ -13,13 +13,8 @@ ARG PYTHON_VERSION=3.12
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
@ -80,20 +75,15 @@ ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
# Install system dependencies and uv, then create Python virtual environment
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl sudo
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -101,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Activate virtual environment and add uv to PATH
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -132,7 +122,7 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch
@ -162,7 +152,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . .
@ -259,7 +249,7 @@ COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################
@ -286,32 +276,12 @@ ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@ -319,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -555,5 +525,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
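The Dockerfile hunks above switch between provisioning Python through a uv-managed virtual environment and installing it with the standalone uv binaries. As a hedged summary, the uv-based flow shown in these hunks boils down to the following shell steps; the Python version is the Dockerfile's `PYTHON_VERSION` build arg (3.12 by default), and the requirements file is one of those copied in the hunks:

```bash
# Get the uv binaries; the Dockerfile instead copies them from the
# ghcr.io/astral-sh/uv:0.8.22 image.
curl -LsSf https://astral.sh/uv/install.sh | sh

# Install a uv-managed CPython and make it the default interpreter.
uv python install 3.12 --default

# Install requirements into that interpreter; the Dockerfile variant also
# passes a CUDA-specific --extra-index-url derived from CUDA_VERSION.
uv pip install --system -r requirements/cuda.txt
```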

View File

@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -314,4 +314,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -309,4 +309,4 @@ USER 2000
WORKDIR /home/vllm
# Set the default entrypoint
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

Binary image file changed (not shown): 119 KiB before, 127 KiB after.

View File

@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
Start vLLM:
```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
@ -687,7 +688,8 @@ vllm bench serve \
Start vLLM:
```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos

View File

@ -258,21 +258,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = \
self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```
@ -442,20 +438,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```

View File

@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
vllm serve meta-llama/Meta-Llama-3-70B
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
```
vllm bench command:

View File

@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
1. Start the vLLM server with the supported chat completion model, e.g.
```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.2
python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2
```
1. Call it with AutoGen:

View File

@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
For example:
```console
vllm serve <model> --host 0.0.0.0 --port 8000
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
```
3. Start the Open WebUI Docker container:

View File

@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
ports: 8081 # Expose to internet traffic.
envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log

View File

@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
python -m vllm.entrypoints.openai.api_server --model <model>
```
!!! warning
`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

View File

@ -8,9 +8,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
!!! tip
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, preventing it from reaching arbitrary endpoints that may be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
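Combining the two mitigations from this tip in a single launch might look like the following; the model placeholder follows the convention used elsewhere on this page:

```bash
VLLM_MEDIA_URL_ALLOW_REDIRECTS=0 vllm serve <model> \
    --allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com
```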
## Offline Inference

View File

@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
```bash
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \
--port 8000
```
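Once the server is up with `VLLM_SERVER_DEV_MODE=1`, the sleep-mode development endpoints can be exercised with plain HTTP calls. A hedged sketch, assuming the endpoints are `POST /sleep` and `POST /wake_up` as in recent vLLM releases; verify the exact paths against your version:

```bash
# Put the engine to sleep (level 1 offloads weights to CPU memory).
curl -X POST 'http://localhost:8000/sleep?level=1'

# Wake the engine up again before sending further inference requests.
curl -X POST 'http://localhost:8000/wake_up'
```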

View File

@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin
To perform the same with an online mode launch the server:
```bash
vllm serve facebook/opt-6.7b \
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8000 \
--model facebook/opt-6.7b \
--seed 42 \
-tp 1 \
--gpu_memory_utilization 0.8 \

View File

@ -67,7 +67,8 @@ docker run -it \
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. **Pipeline parallel** is supported on a single node with `mp` as the backend. A reference invocation looks like the following:
```bash
vllm serve facebook/opt-13b \
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
--dtype=bfloat16 \
--max_model_len=1024 \
--distributed-executor-backend=mp \

View File

@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
Currently, the Transformers backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
@ -31,7 +31,6 @@ If the Transformers model implementation follows all the steps in [writing a cus
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
- Data parallel
- Pipeline parallel
- Tensor parallel
@ -677,7 +676,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |

View File

@ -66,9 +66,6 @@ Restrict domains that vLLM can access for media URLs by setting
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
redirects from being followed to bypass domain restrictions.
## Security and Firewalls: Protecting Exposed vLLM Systems
While vLLM is designed to allow unsafe network services to be isolated to

View File

@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,

View File

@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
model_name = "internlm/Intern-S1"
engine_args = EngineArgs(
model=model_name,

View File

@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint
exec vllm serve "${ARGS[@]}"
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"

View File

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.5.1 # Required for disaggregated prefill

View File

@ -11,8 +11,8 @@ import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from tests.v1.attention.utils import _Backend
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
PassConfig)

View File

@ -8,11 +8,11 @@ import torch._dynamo
from tests.compile.backend import LazyInitPass, TestBackend
from tests.models.utils import check_outputs_equal
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata)
from vllm import LLM, SamplingParams
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass

View File

@ -756,7 +756,7 @@ class VllmRunner:
def get_inputs(
self,
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
prompts: Union[list[str], list[torch.Tensor], list[int]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,

View File

@ -86,16 +86,3 @@ def test_max_model_len():
# It can be less if generation finishes due to other reasons (e.g., EOS)
# before reaching the absolute model length limit.
assert num_total_tokens <= max_model_len
def test_log_stats():
llm = LLM(
model=MODEL_NAME,
disable_log_stats=False,
gpu_memory_utilization=0.10,
enforce_eager=True, # reduce test time
)
outputs = llm.generate(PROMPTS, sampling_params=None)
# disable_log_stats is False, every output should have metrics
assert all(output.metrics is not None for output in outputs)

View File

@ -122,9 +122,6 @@ def mock_serving_setup():
models,
request_logger=None)
serving_completion._process_inputs = AsyncMock(return_value=(MagicMock(
name="engine_request"), {}))
return mock_engine, serving_completion

View File

@ -698,22 +698,6 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
from vllm.entrypoints.harmony_utils import get_system_message
# Test with custom tools enabled - commentary channel should be available
sys_msg = get_system_message(with_custom_tools=True)
valid_channels = sys_msg.content[0].channel_config.valid_channels
assert "commentary" in valid_channels
# Test with custom tools disabled - commentary channel should be removed
sys_msg = get_system_message(with_custom_tools=False)
valid_channels = sys_msg.content[0].channel_config.valid_channels
assert "commentary" not in valid_channels
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):


@ -7,7 +7,7 @@ import asyncio
from contextlib import suppress
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional
from unittest.mock import AsyncMock, MagicMock
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
@ -230,7 +230,6 @@ class MockHFConfig:
@dataclass
class MockModelConfig:
task = "generate"
runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
@ -245,33 +244,11 @@ class MockModelConfig:
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
def _build_serving_chat(engine: AsyncLLM,
model_config: MockModelConfig) -> OpenAIServingChat:
models = OpenAIServingModels(engine_client=engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=model_config)
serving_chat = OpenAIServingChat(engine,
model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
async def _fake_process_inputs(request_id, engine_prompt, sampling_params,
*, lora_request, trace_headers, priority):
return dict(engine_prompt), {}
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
return serving_chat
@dataclass
class MockEngine:
@ -305,7 +282,16 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(),
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
messages = [{"role": "user", "content": "what is 1+1?"}]
async def return_model_name(*args):
@ -332,7 +318,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(),
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
req = ChatCompletionRequest(
model=MODEL_NAME,
@ -366,7 +361,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test Case 1: No max_tokens specified in request
req = ChatCompletionRequest(
@ -411,7 +415,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test case 1: No max_tokens specified, defaults to context_window
req = ChatCompletionRequest(
@ -458,7 +471,16 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.errored = False
# Initialize the serving chat
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
req = ChatCompletionRequest(
model=MODEL_NAME,
@ -503,7 +525,17 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test cache_salt
req = ChatCompletionRequest(
@ -517,12 +549,10 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
# By default, cache_salt in the engine prompt is not set
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
assert "cache_salt" not in engine_prompt
assert "cache_salt" not in mock_engine.generate.call_args.args[0]
# Test with certain cache_salt
req.cache_salt = "test_salt"
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
assert engine_prompt.get("cache_salt") == "test_salt"
assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"


@ -1,129 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import AsyncExitStack
from unittest.mock import AsyncMock, MagicMock
import pytest
import pytest_asyncio
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.protocol import ResponsesRequest
from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
from vllm.entrypoints.tool_server import ToolServer
class MockConversationContext(ConversationContext):
"""Mock conversation context for testing"""
def __init__(self):
self.init_tool_sessions_called = False
self.init_tool_sessions_args = None
self.init_tool_sessions_kwargs = None
def append_output(self, output) -> None:
pass
async def call_tool(self):
return []
def need_builtin_tool_call(self) -> bool:
return False
def render_for_completion(self):
return []
async def init_tool_sessions(self, tool_server, exit_stack, request_id,
mcp_tools):
self.init_tool_sessions_called = True
self.init_tool_sessions_args = (tool_server, exit_stack, request_id,
mcp_tools)
async def cleanup_session(self) -> None:
pass
@pytest.fixture
def mock_serving_responses():
"""Create a mock OpenAIServingResponses instance"""
serving_responses = MagicMock(spec=OpenAIServingResponses)
serving_responses.tool_server = MagicMock(spec=ToolServer)
return serving_responses
@pytest.fixture
def mock_context():
"""Create a mock conversation context"""
return MockConversationContext()
@pytest.fixture
def mock_exit_stack():
"""Create a mock async exit stack"""
return MagicMock(spec=AsyncExitStack)
class TestInitializeToolSessions:
"""Test class for _initialize_tool_sessions method"""
@pytest_asyncio.fixture
async def serving_responses_instance(self):
"""Create a real OpenAIServingResponses instance for testing"""
# Create minimal mocks for required dependencies
engine_client = MagicMock()
engine_client.get_model_config = AsyncMock()
model_config = MagicMock()
model_config.hf_config.model_type = "test"
model_config.get_diff_sampling_param.return_value = {}
models = MagicMock()
tool_server = MagicMock(spec=ToolServer)
# Create the actual instance
instance = OpenAIServingResponses(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
tool_server=tool_server,
)
return instance
@pytest.mark.asyncio
async def test_initialize_tool_sessions(self, serving_responses_instance,
mock_context, mock_exit_stack):
"""Test that method works correctly with only MCP tools"""
request = ResponsesRequest(input="test input", tools=[])
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack)
assert mock_context.init_tool_sessions_called is False
# Create only MCP tools
tools = [
{
"type": "web_search_preview"
},
{
"type": "code_interpreter",
"container": {
"type": "auto"
}
},
]
request = ResponsesRequest(input="test input", tools=tools)
# Call the method
await serving_responses_instance._initialize_tool_sessions(
request, mock_context, mock_exit_stack)
# Verify that init_tool_sessions was called
assert mock_context.init_tool_sessions_called


@ -10,9 +10,8 @@ from unittest.mock import patch
import pytest
import torch
from vllm.attention.backends.registry import _Backend
from vllm.attention.layer import MultiHeadAttention
from vllm.attention.selector import _cached_get_attn_backend
from vllm.attention.selector import _Backend, _cached_get_attn_backend
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform


@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul)
from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import (fp8_gemm_nt,
@ -91,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
out_dtype)
out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
rel_diff = (torch.mean(
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /


@ -20,11 +20,9 @@ from vllm.platforms import current_platform
(8, 513, 64), # Non-divisible (native only)
])
@pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
group_size: int, seed: int,
use_ue8m0: bool) -> None:
group_size: int, seed: int) -> None:
"""Test QuantFP8 group quantization with various configurations.
Tests both CUDA and native implementations, column-major scales,
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
group_shape = GroupShape(1, group_size)
quant_op = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=False,
use_ue8m0=use_ue8m0)
column_major_scales=False)
# 1. Test native implementation (always available)
x_quant_native, scales_native = quant_op.forward_native(x.clone())
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
# 2. Test column-major scales configuration
quant_op_col = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=True,
use_ue8m0=use_ue8m0)
column_major_scales=True)
_, scales_col = quant_op_col.forward_native(x.clone())
assert scales_col.shape == (batch_size, expected_num_groups)
assert scales_col.stride(0) == 1
assert scales_col.stride(1) == batch_size
# Test column-major scales consistency
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
assert scales_col.shape == (expected_num_groups, batch_size)
# 3. Test CUDA implementation (only for divisible dimensions)
if is_divisible:
@ -77,23 +68,21 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
@pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
def test_quantfp8_group_multidimensional(seed: int) -> None:
current_platform.seed_everything(seed)
group_size = 64
# Test with 3D input
batch1, batch2, hidden_dim = 4, 8, 1024
batch1, batch2, hidden_dim = 4, 8, 512
x_3d = torch.randn(
(batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
group_shape = GroupShape(1, group_size)
quant_op = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=False,
use_ue8m0=use_ue8m0)
column_major_scales=False)
x_quant, scales = quant_op.forward_native(x_3d.clone())
assert x_quant.shape == x_3d.shape
@ -102,10 +91,9 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
# Test column_major_scales with multi-dim
quant_op_col = QuantFP8(static=False,
group_shape=group_shape,
column_major_scales=True,
use_ue8m0=use_ue8m0)
column_major_scales=True)
_, scales_col = quant_op_col.forward_native(x_3d.clone())
assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
# Test with 4D input
batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
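Note: the assertions in this file's tests revolve around how per-group FP8 scales are laid out. The following is a minimal sketch of the two layouts being compared, illustrative only and not the vLLM kernel; FP8_MAX = 448.0 is assumed here to be the e4m3 maximum.

import torch

FP8_MAX = 448.0  # assumed e4m3 max, used only for this illustration
batch_size, hidden_dim, group_size = 4, 256, 64
num_groups = hidden_dim // group_size

x = torch.randn(batch_size, hidden_dim, dtype=torch.float32)
# Dynamic per-group scales: amax over each group along the hidden dimension.
scales = x.abs().reshape(batch_size, num_groups, group_size).amax(dim=-1) / FP8_MAX
assert scales.shape == (batch_size, num_groups)   # row-major layout

# A column-major layout stores the same values with the leading dims swapped,
# which is what the differing shape/stride assertions above distinguish.
scales_col = scales.t().contiguous()
assert scales_col.shape == (num_groups, batch_size)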


@ -15,10 +15,10 @@ from torch._prims_common import TensorLikeType
from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.attention.backends.registry import _Backend
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms.interface import _Backend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)


@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
from vllm.model_executor.layers.layernorm import (RMSNorm,
dispatch_rocm_rmsnorm_func,
fused_add_rms_norm, rms_norm)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
@ -109,6 +111,34 @@ def test_enabled_ops_invalid(env: str):
RMSNorm(1024).enabled()
@pytest.mark.skipif(
not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
@pytest.mark.parametrize("use_cutlass", [True, False])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
use_rocm_aiter_gemm_w8a8_blockscale: str,
monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
use_rocm_aiter_gemm_w8a8_blockscale)
use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
int(use_rocm_aiter_gemm_w8a8_blockscale)))
block_scale_func = dispatch_w8a8_blockscale_func(
use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
if use_cutlass:
assert block_scale_func == cutlass_scaled_mm
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
use_rocm_aiter_gemm_w8a8_blockscale):
assert block_scale_func == (
torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
else:
assert block_scale_func == w8a8_block_fp8_matmul
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
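Note: the assertions in test_w8a8_blockscale_dispatch above pin down the expected kernel selection order. A purely schematic restatement of that dispatch logic, returning names rather than the real callables from vLLM's fp8_utils:

def pick_w8a8_blockscale_kernel(use_cutlass: bool,
                                use_aiter_and_is_supported: bool) -> str:
    # Mirrors the test's expectations: CUTLASS first, then the ROCm AITER
    # block-scale GEMM, otherwise the Triton/native block-fp8 matmul.
    if use_cutlass:
        return "cutlass_scaled_mm"
    if use_aiter_and_is_supported:
        return "rocm_aiter_gemm_w8a8_blockscale"
    return "w8a8_block_fp8_matmul"

assert pick_w8a8_blockscale_kernel(True, False) == "cutlass_scaled_mm"
assert pick_w8a8_blockscale_kernel(False, True) == "rocm_aiter_gemm_w8a8_blockscale"
assert pick_w8a8_blockscale_kernel(False, False) == "w8a8_block_fp8_matmul"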


@ -240,12 +240,12 @@ def test_distributed_correctness(
num_logprobs: int,
) -> None:
with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)


@ -12,8 +12,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
@ -114,26 +112,12 @@ def _test_processing_correctness(
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
# Keep integer limits for local data generation
limit_mm_per_prompt_ints = {
limit_mm_per_prompt = {
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
# Assign normalized DummyOptions to the model config
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt_ints.items()
}
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
@ -166,7 +150,7 @@ def _test_processing_correctness(
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))]
for k, limit in limit_mm_per_prompt_ints.items()
for k, limit in limit_mm_per_prompt.items()
}
mm_counts = {k: len(vs) for k, vs in mm_data.items()}


@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int):
model_config_kwargs = {
"max_model_len": max_model_len,
}
mm_counts = {"image": 1}
ctx = build_model_context(
model_id,
model_config_kwargs=model_config_kwargs,
limit_mm_per_prompt=mm_counts,
limit_mm_per_prompt={"image": 1},
)
mm_config = ctx.get_mm_config()
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
decoder_dummy_data = profiler.get_decoder_dummy_data(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
hf_config = ctx.get_hf_config(Llama4Config)
@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int):
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
max_model_len,
mm_counts=mm_counts,
mm_counts=mm_config.limit_per_prompt,
)
assert total_tokens == profiled_tokens["image"]


@ -15,8 +15,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
@ -238,20 +236,7 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
modality: 3 if limit is None else limit
for modality, limit in supported_mm_limits.items()
}
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
if modality == "video":
return VideoDummyOptions(count=count)
if modality == "image":
return ImageDummyOptions(count=count)
if modality == "audio":
return AudioDummyOptions(count=count)
return BaseDummyOptions(count=count)
model_config.get_multimodal_config().limit_per_prompt = {
modality: _to_dummy_options(modality, count)
for modality, count in limit_mm_per_prompt.items()
}
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
processor = factories.build_processor(ctx, cache=None)
with initialize_dummy_model(model_cls, model_config) as model:


@ -37,5 +37,4 @@ def test_multimodal_processor(model_id):
hf_processor_mm_kwargs={},
)
assert (str_processed_inputs["prompt_token_ids"]
== ids_processed_inputs["prompt_token_ids"])
assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]


@ -661,10 +661,6 @@ _TRANSFORMERS_BACKEND_MODELS = {
"TransformersForSequenceClassification": _HfExamplesInfo("papluca/xlm-roberta-base-language-detection", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"TransformersMoEForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEForMultimodalLM": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEEmbeddingModel": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
"TransformersMoEForSequenceClassification": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
}
_EXAMPLE_MODELS = {


@ -66,7 +66,6 @@ def check_implementation(
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
]) # trust_remote_code=True by default
def test_models(
hf_runner: type[HfRunner],
@ -75,14 +74,6 @@ def test_models(
model: str,
model_impl: str,
) -> None:
import transformers
from packaging.version import Version
installed = Version(transformers.__version__)
required = Version("4.57.0.dev0")
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
pytest.skip("MoE models with the Transformers backend require "
f"transformers>={required}, but got {installed}")
check_implementation(hf_runner,
vllm_runner,
example_prompts,


@ -430,26 +430,17 @@ def dummy_hf_overrides(
update_dict = {
"num_layers": num_layers,
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
# For Gemma-3n
"num_kv_shared_layers": 1,
}
class DummyConfig:
hf_text_config = text_config
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if ModelConfig.get_num_experts(DummyConfig) > 0:
update_dict.update({
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
# Otherwise there will not be any expert layers
"first_k_dense_replace": 0,
# To avoid OOM on DeepSeek-V3
"n_routed_experts": num_experts,
})
# Update num_hidden_layers for non-Longcat architectures
if model_arch != "LongcatFlashForCausalLM" \
and model_arch != "LongCatFlashMTPModel":


@ -3,7 +3,6 @@
import json
import os
from typing import Optional
import pytest
@ -21,10 +20,9 @@ os.environ["FLASHINFER_NVCC_THREADS"] = "16"
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
def can_initialize(model: str, extra_args: Optional[list[str]] = None):
def can_initialize(model: str, extra_args: list[str]):
# Server arguments
extra_args = extra_args if extra_args is not None else []
server_args = [
"--max-model-len",
"2048",
@ -67,7 +65,7 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
@pytest.mark.skip(reason="Works, but takes too long to run")
@ -75,21 +73,21 @@ def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
## DeepSeekV3 ##
@ -97,37 +95,21 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
can_initialize("deepseek-ai/DeepSeek-V3.1")
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
"Expected failure: assert self.block_quant is None"))
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("deepseek-ai/DeepSeek-V3.1")
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("deepseek-ai/DeepSeek-V3.1")
can_initialize("deepseek-ai/DeepSeek-V3.1", [])
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
## GPT-OSS ##
@ -135,16 +117,16 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
can_initialize("openai/gpt-oss-20b")
can_initialize("openai/gpt-oss-20b", [])


@ -18,9 +18,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@ -745,35 +742,3 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
perplexity = llm.generate_prompt_perplexity([prompt])[0]
print(perplexity)
assert perplexity <= exp_perplexity
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
with vllm_runner(model_path) as llm:
fp8_dtype = current_platform.fp8_dtype()
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
W8A8BlockFp8LinearOp)
assert qkv_proj.weight.dtype is fp8_dtype
assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight.shape) == 2
assert len(qkv_proj.weight_scale.shape) == 2
input_quant_op = \
qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
assert isinstance(input_quant_op, QuantFP8)
assert input_quant_op._forward_method == input_quant_op.forward_cuda
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output


@ -786,43 +786,13 @@ def test_model_specification(parser_with_config, cli_config_file,
parser_with_config.parse_args(['serve', '--config', cli_config_file])
# Test using --model option raises error
# with pytest.raises(
# ValueError,
# match=
# ("With `vllm serve`, you should provide the model as a positional "
# "argument or in a config file instead of via the `--model` option."),
# ):
# parser_with_config.parse_args(['serve', '--model', 'my-model'])
# Test using --model option back-compatibility
# (when back-compatibility ends, the above test should be uncommented
# and the below test should be removed)
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size',
'2',
'--model',
'my-model',
'--trust-remote-code',
'--port',
'8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
args = parser_with_config.parse_args([
'serve',
'--tensor-parallel-size=2',
'--model=my-model',
'--trust-remote-code',
'--port=8001',
])
assert args.model is None
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 8001
with pytest.raises(
ValueError,
match=
("With `vllm serve`, you should provide the model as a positional "
"argument or in a config file instead of via the `--model` option."),
):
parser_with_config.parse_args(['serve', '--model', 'my-model'])
# Test other config values are preserved
args = parser_with_config.parse_args([


@ -8,11 +8,11 @@ import pytest
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import ModelConfig
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer


@ -6,12 +6,12 @@ from typing import Optional, Union
import pytest
import torch
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm import _custom_ops as ops
from vllm.attention.backends.registry import _Backend
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec


@ -22,7 +22,6 @@ from vllm.utils import cdiv
from vllm.v1.attention.backends.mla.flashmla_sparse import (
FlashMLASparseBackend, FlashMLASparseDecodeAndContextMetadata,
FlashMLASparseImpl, FlashMLASparseMetadata)
from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks
SPARSE_BACKEND_BATCH_SPECS = {
name: BATCH_SPECS[name]
@ -425,24 +424,3 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
sdpa_reference,
rtol=0.5,
atol=0.5)
@pytest.mark.parametrize(
"seq_lens,max_buf,start,expected",
[
# Basic split: totals per chunk ≤ max_buf
(torch.tensor([2, 3, 4, 2]), 5, 0, [(0, 2), (2, 3), (3, 4)]),
# Non-zero start index
(torch.tensor([2, 3, 4, 2]), 5, 1, [(1, 2), (2, 3), (3, 4)]),
# Exact fits should split between items when adding the next would
# overflow
(torch.tensor([5, 5, 5]), 5, 0, [(0, 1), (1, 2), (2, 3)]),
# All requests fit in a single chunk
(torch.tensor([1, 1, 1]), 10, 0, [(0, 3)]),
# Large buffer with non-zero start
(torch.tensor([4, 4, 4]), 100, 1, [(1, 3)]),
],
)
def test_split_prefill_chunks(seq_lens, max_buf, start, expected):
out = split_prefill_chunks(seq_lens, max_buf, start)
assert out == expected
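Note: the parametrized cases above fully determine the expected chunking behaviour of split_prefill_chunks. As an illustration only (the real implementation in vllm.v1.attention.backends.mla.indexer may differ), a greedy packing consistent with those expectations looks like this:

import torch

def split_prefill_chunks(seq_lens: torch.Tensor, max_buf: int,
                         start: int) -> list[tuple[int, int]]:
    # Greedily pack consecutive requests [i, j) so the sum of their sequence
    # lengths stays within max_buf; a single request is never split, so an
    # oversized request still forms its own chunk.
    chunks: list[tuple[int, int]] = []
    i, n = start, seq_lens.numel()
    while i < n:
        total, j = 0, i
        while j < n and (j == i or total + int(seq_lens[j]) <= max_buf):
            total += int(seq_lens[j])
            j += 1
        chunks.append((i, j))
        i = j
    return chunks

assert split_prefill_chunks(torch.tensor([2, 3, 4, 2]), 5, 0) == \
    [(0, 2), (2, 3), (3, 4)]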


@ -8,11 +8,10 @@ from typing import Optional, Union
import pytest
import torch
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
LoadConfig, ModelConfig, ModelDType, ParallelConfig,
SchedulerConfig, VllmConfig)
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import resolve_obj_by_qualname
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec


@ -1,91 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
end-to-end tests for context length corner cases of vLLM v1 model runner
versus HuggingFace's transformers.
This test verifies the following behavior: allow a prefill that fills the
model's maximum context length and then request a single new token.
Test strategy
- Build a textual prompt that tokenizes to exactly ``max_model_len`` tokens.
- Run vLLM generation requesting a single new token (max_tokens=1).
- Run HF generation on the same prompt requesting a single token too.
- Assert both return the same number of generated tokens and the same ids.
"""
import pytest
import torch
from transformers import AutoModelForCausalLM
from tests.models.utils import check_outputs_equal
from tests.utils import create_new_process_for_each_test
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt
@create_new_process_for_each_test()
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [1])
def test_prefill_max_context_length(
model: str,
max_model_len: int,
max_tokens: int,
) -> None:
"""Compare vLLM and HuggingFace when the prompt already fills the
model's maximum context length and we request a single new token.
The test ensures vLLM does not raise the "Sampled token IDs exceed the
max model length" assertion and that both vLLM and HF produce the same
single token when given the same inputs.
"""
# Construct a prompt of size max_model_len
prompt_ids = [[43] * max_model_len]
# Generate max_tokens new tokens deterministically.
sampling_params = [
SamplingParams(max_tokens=max_tokens, temperature=0.0, ignore_eos=True)
]
# --- vLLM generation ---
llm = LLM(
model=model,
tokenizer=model,
max_num_seqs=1,
tensor_parallel_size=1,
)
vllm_token_prompts = [TokensPrompt(prompt_token_ids=prompt_ids[0])]
vllm_results = llm.generate(vllm_token_prompts, sampling_params)
vllm_output_ids = vllm_results[0].outputs[0].token_ids
# --- HuggingFace generation ---
with torch.no_grad():
hf_model = AutoModelForCausalLM.from_pretrained(model)
# HF expects a tensor of input ids shaped (batch, seq_len).
hf_input_tokens = torch.tensor(prompt_ids[0]).unsqueeze(0)
# Generate max_tokens new tokens deterministically.
hf_generated = hf_model.generate(
hf_input_tokens,
do_sample=False,
min_new_tokens=max_tokens,
max_new_tokens=max_tokens,
)
# HF returns the prompt + generated tokens. Slice off the prompt.
hf_output_ids = hf_generated.cpu().tolist()[0][len(prompt_ids[0]):]
# check that vLLM outputs (token ids) match HF outputs
# Note: for simplicity don't pass detokenized string
check_outputs_equal(
outputs_0_lst=[(hf_output_ids, "")],
outputs_1_lst=[(vllm_output_ids, "")],
name_0="hf",
name_1="vllm",
)


@ -123,10 +123,13 @@ def test_structured_output(
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu())
# Use a single LLM instance for several scenarios to
# speed up the test suite.
llm = LLM(model=model_name,
enforce_eager=True,
enforce_eager=enforce_eager,
max_model_len=1024,
structured_outputs_config=dict(backend=backend,
disable_any_whitespace=backend


@ -57,26 +57,6 @@ def clear_kv_transfer():
ensure_kv_transfer_shutdown()
def get_default_xfer_telemetry(xferDurationS: float = 1,
postDurationS: float = 1,
totalBytes: int = 1,
descCount: int = 1) -> dict:
class AttributeDict(dict):
__slots__ = ()
__getattr__ = dict.__getitem__
__setattr__ = dict.__setitem__ # type: ignore[assignment]
# We can't instantiate nixlXferTelemetry because it's read only and
# ray env does not have NIXL, so we must fake it
return AttributeDict(
xferDuration=xferDurationS * 1e6, # in us
postDuration=postDurationS * 1e6, # in us
totalBytes=totalBytes,
descCount=descCount,
)
class FakeNixlWrapper:
"""Mock implementation of NixlWrapper for testing.
@ -152,9 +132,6 @@ class FakeNixlWrapper:
def transfer(self, handle: int) -> str:
return "PROC"
def get_xfer_telemetry(self, handle: int) -> dict:
return get_default_xfer_telemetry()
############################################################
# Follow are for changing the behavior during testing.
############################################################
@ -192,11 +169,6 @@ nixl_agent = FakeNixlWrapper
with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
f.write(stub)
# Mock nixlXferTelemetry class
pkg_root2 = os.path.join(td, "nixl", "_bindings")
os.makedirs(pkg_root2, exist_ok=True)
with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
f.write("class nixlXferTelemetry: pass")
# touch parent package
open(os.path.join(td, "nixl", "__init__.py"), "w").close()
yield td
@ -603,7 +575,7 @@ def test_kv_connector_stats(dist_init):
# Verify stats values are recorded
assert not stats_after_transfer.is_empty()
assert stats_after_transfer.num_successful_transfers == 1
assert stats_after_transfer.data["num_successful_transfers"] == 1
# Verify stats are reset after retrieval
stats_after_reset = connector.get_kv_connector_stats()
@ -627,21 +599,16 @@ def test_kv_connector_stats_aggregation():
# Record different transfers on each worker
# Worker 1: 2 transfers
stats = get_default_xfer_telemetry()
worker1_stats.record_transfer(stats)
worker1_stats.record_transfer(stats)
worker1_stats.record_transfer()
worker1_stats.record_transfer()
# Worker 2: 1 transfer
worker2_stats.record_transfer(stats)
worker2_stats.record_transfer()
# Worker 3: 3 transfers
stats = get_default_xfer_telemetry(xferDurationS=2,
postDurationS=2,
totalBytes=2,
descCount=2)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer(stats)
worker3_stats.record_transfer()
worker3_stats.record_transfer()
worker3_stats.record_transfer()
# Create ModelRunnerOutput instances for each worker
worker_outputs = []
@ -669,12 +636,7 @@ def test_kv_connector_stats_aggregation():
aggregated_output.kv_connector_output.kv_connector_stats
assert isinstance(kv_connector_stats, NixlKVConnectorStats)
# Number of total transfers across all workers.
assert kv_connector_stats.num_successful_transfers == 6
# Logging proc, call reduce() to get CLI-friendly stats.
cli_stats = kv_connector_stats.reduce()
assert cli_stats["Avg xfer time (ms)"] == 1500.0
assert cli_stats["Avg post time (ms)"] == 1500.0
assert cli_stats["Avg number of descriptors"] == 1.5
assert kv_connector_stats.data["num_successful_transfers"] == 6
def test_multi_kv_connector_stats_aggregation():
@ -687,7 +649,6 @@ def test_multi_kv_connector_stats_aggregation():
from dataclasses import dataclass
# Mock a KVConnectorStats class for testing aggregation over connectors.
@dataclass
class FooKVConnectorStats(KVConnectorStats):
@ -715,7 +676,7 @@ def test_multi_kv_connector_stats_aggregation():
if nixl_count > 0:
nixl_stats = NixlKVConnectorStats()
for _ in range(nixl_count):
nixl_stats.record_transfer(get_default_xfer_telemetry())
nixl_stats.record_transfer()
data["NixlConnector"] = nixl_stats
if foo_count > 0:
foo_stats = FooKVConnectorStats()
@ -751,10 +712,8 @@ def test_multi_kv_connector_stats_aggregation():
assert isinstance(kv_connector_stats, MultiKVConnectorStats)
# Validate per-connector totals across workers
assert isinstance(kv_connector_stats["NixlConnector"],
NixlKVConnectorStats)
assert kv_connector_stats["NixlConnector"].num_successful_transfers == 5
assert isinstance(kv_connector_stats["FooConnector"], FooKVConnectorStats)
assert kv_connector_stats["NixlConnector"].data[
"num_successful_transfers"] == 5
assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
@ -796,8 +755,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
"working_dir": working_dir, # ship fake nixl package
"env_vars": {
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
# TODO: for ray to carry over, remove once we set
"NIXL_TELEMETRY_ENABLE": "1",
},
}
ray.init(runtime_env=runtime_env)


@ -8,10 +8,10 @@ import pytest
import torch
from tests.utils import get_attn_backend_list_based_on_platform
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)


@ -6,10 +6,10 @@ from unittest import mock
import pytest
import torch
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
create_common_attn_metadata,
create_standard_kv_cache_spec,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)


@ -6,10 +6,9 @@ from typing import Optional
import torch
from tests.v1.attention.utils import (create_standard_kv_cache_spec,
from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec,
create_vllm_config,
get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.v1.attention.backends.utils import CommonAttentionMetadata


@ -1,27 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend registry"""
import enum
class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
TRITON_ATTN = enum.auto()
XFORMERS = enum.auto()
ROCM_FLASH = enum.auto()
ROCM_AITER_MLA = enum.auto()
ROCM_AITER_FA = enum.auto() # used for ViT attn backend
TORCH_SDPA = enum.auto()
FLASHINFER = enum.auto()
FLASHINFER_MLA = enum.auto()
TRITON_MLA = enum.auto()
CUTLASS_MLA = enum.auto()
FLASHMLA = enum.auto()
FLASH_ATTN_MLA = enum.auto()
PALLAS = enum.auto()
IPEX = enum.auto()
NO_ATTENTION = enum.auto()
FLEX_ATTENTION = enum.auto()
TREE_ATTN = enum.auto()
ROCM_ATTN = enum.auto()


@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer."""
from typing import Callable, List, Optional
from typing import List, Optional
import torch
import torch.nn as nn
@ -10,7 +10,6 @@ import torch.nn.functional as F
import vllm.envs as envs
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
from vllm.config import CacheConfig, get_current_vllm_config
@ -27,7 +26,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape)
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import GiB_bytes, direct_register_custom_op
logger = init_logger(__name__)
@ -68,39 +67,9 @@ def check_upstream_fa_availability(dtype: torch.dtype):
) and current_platform.has_device_capability(80):
from transformers.utils import is_flash_attn_2_available
return is_flash_attn_2_available()
if current_platform.is_rocm():
from importlib.util import find_spec
return find_spec("flash_attn") is not None
return False
def maybe_get_vit_flash_attn_backend(
attn_backend: _Backend,
use_upstream_fa: bool) -> tuple[_Backend, Callable]:
if attn_backend != _Backend.FLASH_ATTN and \
attn_backend != _Backend.ROCM_AITER_FA and \
check_upstream_fa_availability(torch.get_default_dtype()):
attn_backend = _Backend.FLASH_ATTN
use_upstream_fa = True
if current_platform.is_rocm() and \
attn_backend == _Backend.FLASH_ATTN:
use_upstream_fa = True
if (attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}):
if attn_backend == _Backend.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
else:
if use_upstream_fa:
from flash_attn import flash_attn_varlen_func
else:
from vllm.vllm_flash_attn import flash_attn_varlen_func
else:
flash_attn_varlen_func = None
return attn_backend, flash_attn_varlen_func
class Attention(nn.Module, AttentionLayerBase):
"""Attention layer.
@ -440,9 +409,13 @@ class MultiHeadAttention(nn.Module):
# to upstream flash attention if available.
# If vllm native fa is selected, we use it directly.
use_upstream_fa = False
if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability(
dtype):
backend = _Backend.FLASH_ATTN
use_upstream_fa = True
if current_platform.is_xpu():
# currently, only torch_sdpa is supported on xpu
if current_platform.is_rocm() or current_platform.is_xpu():
# currently, only torch_sdpa is supported on rocm/xpu
self.attn_backend = _Backend.TORCH_SDPA
else:
@ -454,25 +427,17 @@ class MultiHeadAttention(nn.Module):
_Backend.FLASH_ATTN,
} else _Backend.TORCH_SDPA
self.attn_backend, self._flash_attn_varlen_func \
= maybe_get_vit_flash_attn_backend(
self.attn_backend,
use_upstream_fa,
)
if (self.attn_backend == _Backend.XFORMERS
and not check_xformers_availability()):
self.attn_backend = _Backend.TORCH_SDPA
self.is_flash_attn_backend = self.attn_backend in {
_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
}
# this condition is just to make sure that the
# use_upstream_fa in the log is correct
if current_platform.is_rocm() \
and self.attn_backend == _Backend.FLASH_ATTN:
use_upstream_fa = True
if self.attn_backend == _Backend.FLASH_ATTN:
if use_upstream_fa:
from flash_attn import flash_attn_varlen_func
self._flash_attn_varlen_func = flash_attn_varlen_func
else:
from vllm.vllm_flash_attn import flash_attn_varlen_func
self._flash_attn_varlen_func = flash_attn_varlen_func
logger.info_once(
f"MultiHeadAttention attn_backend: {self.attn_backend}, "
@ -500,7 +465,7 @@ class MultiHeadAttention(nn.Module):
key = torch.repeat_interleave(key, num_repeat, dim=2)
value = torch.repeat_interleave(value, num_repeat, dim=2)
if self.is_flash_attn_backend:
if self.attn_backend == _Backend.FLASH_ATTN:
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
step=q_len,
dtype=torch.int32,
@ -541,6 +506,14 @@ class MultiHeadAttention(nn.Module):
from torch_xla.experimental.custom_kernel import flash_attention
out = flash_attention(query, key, value, sm_scale=self.scale)
out = out.transpose(1, 2)
elif self.attn_backend == _Backend.ROCM_AITER_FA:
from aiter import flash_attn_varlen_func
# ROCm Flash Attention expects (batch, seq, heads, head_dim)
out = flash_attn_varlen_func(query,
key,
value,
softmax_scale=self.scale)
else:
# ViT attention hasn't supported this backend yet
raise NotImplementedError(


@ -11,9 +11,8 @@ import torch
import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import _Backend
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
logger = init_logger(__name__)


@ -235,22 +235,16 @@ class BenchmarkDataset(ABC):
if len(requests) < num_requests:
random.seed(self.random_seed)
needed = num_requests - len(requests)
additional = []
for i in range(needed):
req = deepcopy(random.choice(requests))
additional = deepcopy(
random.choices(requests, k=num_requests - len(requests))
)
for i in range(len(additional)):
req = additional[i]
req.request_id = request_id_prefix + str(len(requests) + i)
additional.append(req)
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
ids = [req.request_id for req in requests]
if len(ids) != len(set(ids)):
raise ValueError("Duplicate request_id found in the sampled "
"requests. Please ensure that each request_id "
"is unique.")
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
@ -1157,12 +1151,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
help="Do not oversample if the dataset has " \
"fewer samples than num-prompts.",
)
parser.add_argument(
"--skip-chat-template",
action="store_true",
help=
"Skip applying chat template to prompt for datasets that support it.",
)
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
@ -1173,6 +1161,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
help=
"Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help=
"Skip applying chat template to prompt, used only for custom dataset.",
)
spec_bench_group = parser.add_argument_group("spec bench dataset options")
spec_bench_group.add_argument(
@ -1441,7 +1435,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.skip_chat_template,
skip_chat_template=args.custom_skip_chat_template,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
@ -1582,7 +1576,6 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
output_len=args.hf_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
skip_chat_template=args.skip_chat_template,
**hf_kwargs
)
@ -1822,6 +1815,7 @@ class SpecBench(CustomDataset):
def sample(self, **kwargs) -> list:
# leverage CustomDataset sample
kwargs["skip_chat_template"] = False
return super().sample(**kwargs)
@ -2227,7 +2221,6 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs) -> list:
@ -2243,15 +2236,14 @@ class InstructCoderDataset(HuggingFaceDataset):
)
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
@ -2292,7 +2284,6 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
@ -2307,15 +2298,14 @@ class MTBenchDataset(HuggingFaceDataset):
prompt = item["turns"][0]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
@ -2359,7 +2349,6 @@ class BlazeditDataset(HuggingFaceDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
min_distance: float = 0.0,
@ -2383,7 +2372,7 @@ class BlazeditDataset(HuggingFaceDataset):
# template copied from
# https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
prompt = f"""Given a code file, please apply the change requests and generate the new file.
instruction = f"""Given a code file, please apply the change requests and generate the new file.
Original file:
```python
@ -2396,15 +2385,14 @@ Change request:
Please generate the new code file in the "New file" section below.""" # noqa: E501
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": prompt
}],
add_generation_prompt=True,
tokenize=False,
)
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": instruction
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)


@ -20,7 +20,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
MultiModalConfig)
from vllm.config.pooler import PoolerConfig
from vllm.config.scheduler import RunnerType
from vllm.config.utils import assert_hashable, config, getattr_iter
from vllm.config.utils import assert_hashable, config
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
@ -276,9 +276,7 @@ class ModelConfig:
multimodal_config: Optional[MultiModalConfig] = None
"""Configuration for multimodal model. If `None`, this will be inferred
from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int,
dict[str,
int]]]]] = None
limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
mm_processor_cache_gb: InitVar[Optional[float]] = None
@ -669,8 +667,6 @@ class ModelConfig:
def _get_transformers_backend_cls(self) -> str:
"""Determine which Transformers backend class will be used if
`model_impl` is set to `transformers` or `auto`."""
prefix = "Transformers"
prefix += "MoE" if self.get_num_experts() > 1 else ""
# Check if the architecture we're wrapping has defaults
runner = None
convert = None
@ -689,15 +685,15 @@ class ModelConfig:
# Resolve Transformers backend pooling classes
if runner == "pooling":
if convert == "embed":
return prefix + "EmbeddingModel"
return "TransformersEmbeddingModel"
if convert == "classify":
return prefix + "ForSequenceClassification"
return "TransformersForSequenceClassification"
# Resolve Transformers backend generate classes
if self.hf_config != self.hf_text_config:
# If 'hf_text_config' is the same as 'hf_config'. If not, it is
# probably a composite config, i.e. multimodal
return prefix + "ForMultimodalLM"
return prefix + "ForCausalLM"
return "TransformersForMultimodalLM"
return "TransformersForCausalLM"
def using_transformers_backend(self) -> bool:
"""Check if the model is using the Transformers backend class."""
@ -1029,7 +1025,17 @@ class ModelConfig:
self.enforce_eager = True
def _verify_with_expert_parallelism(self) -> None:
num_experts = self.get_num_experts()
num_expert_names = [
"moe_num_experts", # Dbrx
"num_experts", # Jamba
"n_routed_experts", # DeepSeek
"num_local_experts", # Mixtral
]
num_experts = 0
for name in num_expert_names:
num_experts = getattr(self.hf_text_config, name, 0)
if num_experts > 0:
break
if num_experts < 1:
raise ValueError(
"Number of experts in the model must be greater than 0 "
@ -1214,21 +1220,6 @@ class ModelConfig:
num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
return num_heads // parallel_config.tensor_parallel_size
def get_num_experts(self) -> int:
"""Returns the number of experts in the model."""
num_expert_names = [
"num_experts", # Jamba
"moe_num_experts", # Dbrx
"n_routed_experts", # DeepSeek
"num_local_experts", # Mixtral
]
num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
if isinstance(num_experts, list):
# Ernie VL's remote code uses list[int]...
# The values are always the same so we just take the first one.
return num_experts[0]
return num_experts
def get_layers_start_end_indices(
self, parallel_config: ParallelConfig) -> tuple[int, int]:
from vllm.distributed.utils import get_pp_indices
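Both sides of this hunk resolve the expert count by probing a list of architecture-specific attribute names on `hf_text_config` (one side through a `getattr_iter` helper, the other through an explicit loop). A hedged sketch of that probing pattern, using a stand-in config object rather than the real `hf_text_config`:

```python
from types import SimpleNamespace
from typing import Any

# Attribute names used by different architectures, as listed in the hunk.
NUM_EXPERT_NAMES = [
    "num_experts",        # Jamba
    "moe_num_experts",    # Dbrx
    "n_routed_experts",   # DeepSeek
    "num_local_experts",  # Mixtral
]

def get_num_experts(config: Any) -> int:
    """Return the first positive expert count found on ``config``, else 0."""
    for name in NUM_EXPERT_NAMES:
        value = getattr(config, name, 0)
        if isinstance(value, list):
            # Some remote configs store a per-layer list; the values are all
            # equal, so taking the first is enough (mirrors the hunk's comment).
            value = value[0] if value else 0
        if value and value > 0:
            return value
    return 0

# Stand-in for `hf_text_config`; only `n_routed_experts` is set here.
cfg = SimpleNamespace(n_routed_experts=64)
assert get_num_experts(cfg) == 64
assert get_num_experts(SimpleNamespace()) == 0
```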

View File

@ -4,45 +4,15 @@
import hashlib
from collections.abc import Mapping
from dataclasses import field
from typing import Any, Literal, Optional, Union
from typing import Any, Literal, Optional
from pydantic import ConfigDict, Field, field_validator
from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.config.utils import config
@dataclass
class BaseDummyOptions:
"""Base options for generating dummy data during profiling."""
count: int = Field(999, ge=0)
@dataclass(config=ConfigDict(extra="forbid"))
class VideoDummyOptions(BaseDummyOptions):
"""Options for generating dummy video data during profiling."""
num_frames: Optional[int] = Field(None, gt=0)
width: Optional[int] = Field(None, gt=0)
height: Optional[int] = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class ImageDummyOptions(BaseDummyOptions):
"""Options for generating dummy image data during profiling."""
width: Optional[int] = Field(None, gt=0)
height: Optional[int] = Field(None, gt=0)
@dataclass(config=ConfigDict(extra="forbid"))
class AudioDummyOptions(BaseDummyOptions):
"""Options for generating dummy audio data during profiling."""
length: Optional[int] = Field(None, gt=0)
MMEncoderTPMode = Literal["weights", "data"]
MMCacheType = Literal["shm", "lru"]
DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
AudioDummyOptions]
@config
@ -50,22 +20,12 @@ DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
class MultiModalConfig:
"""Controls the behavior of multimodal models."""
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
"""The maximum number of input items and options allowed per
prompt for each modality.
Defaults to 999 for each modality.
limit_per_prompt: dict[str, int] = field(default_factory=dict)
"""The maximum number of input items allowed per prompt for each modality.
Defaults to 1 (V0) or 999 (V1) for each modality.
Legacy format (count only):
{"image": 16, "video": 2}
Configurable format (with options):
{"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
"image": {"count": 5, "width": 512, "height": 512}}
Mixed format (combining both):
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
"height": 512}}
"""
For example, to allow up to 16 images and 2 videos per prompt:
`{"image": 16, "video": 2}`"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
@ -124,27 +84,6 @@ class MultiModalConfig:
from each video to be pruned.
"""
@field_validator("limit_per_prompt", mode="before")
@classmethod
def _validate_limit_per_prompt(
cls, value: dict[str, Union[int,
dict[str,
int]]]) -> dict[str, DummyOptions]:
for k, v in value.items():
# Handle legacy format where only count is specified
if isinstance(v, int):
v = {"count": v}
# Convert to the appropriate DummyOptions subclass
if k == "video":
value[k] = VideoDummyOptions(**v)
elif k == "image":
value[k] = ImageDummyOptions(**v)
elif k == "audio":
value[k] = AudioDummyOptions(**v)
else:
value[k] = BaseDummyOptions(**v)
return value
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
@ -167,22 +106,12 @@ class MultiModalConfig:
def get_limit_per_prompt(self, modality: str) -> int:
"""
Get the maximum number of input items allowed per prompt
for the given modality (backward compatible).
for the given modality.
"""
limit_data = self.limit_per_prompt.get(modality)
if limit_data is None:
# Unspecified modality is set to 999 by default
return 999
return limit_data.count
def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
"""
Get the configurable dummy data options for a modality.
Returns None if no options are configured for this modality.
"""
# All values are now DummyOptions after normalization
return self.limit_per_prompt.get(modality)
return self.limit_per_prompt.get(
modality,
999 if envs.VLLM_USE_V1 else 1,
)
def merge_mm_processor_kwargs(
self,
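One side of this file accepts both a plain count and a per-modality options dict for `limit_per_prompt`, normalizing everything into `*DummyOptions` objects inside `_validate_limit_per_prompt`. A self-contained sketch of that normalization, using plain dataclasses instead of the pydantic dataclasses in the diff:

```python
from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class BaseDummyOptions:
    count: int = 999

@dataclass
class ImageDummyOptions(BaseDummyOptions):
    width: Optional[int] = None
    height: Optional[int] = None

@dataclass
class VideoDummyOptions(BaseDummyOptions):
    num_frames: Optional[int] = None
    width: Optional[int] = None
    height: Optional[int] = None

_OPTION_CLS = {"image": ImageDummyOptions, "video": VideoDummyOptions}

def normalize_limits(value: dict[str, Union[int, dict]]) -> dict[str, BaseDummyOptions]:
    """Accept {"image": 16} and {"video": {"count": 1, "num_frames": 32}} alike."""
    out: dict[str, BaseDummyOptions] = {}
    for modality, v in value.items():
        if isinstance(v, int):          # legacy count-only form
            v = {"count": v}
        cls = _OPTION_CLS.get(modality, BaseDummyOptions)
        out[modality] = cls(**v)
    return out

limits = normalize_limits({"image": 16, "video": {"count": 1, "num_frames": 32}})
assert limits["image"].count == 16 and limits["video"].num_frames == 32
```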

View File

@ -516,23 +516,6 @@ class VllmConfig:
" by VLLM_DEBUG_DUMP_PATH to %s", env_path)
self.compilation_config.debug_dump_path = env_path
def has_blocked_weights():
if self.quant_config is not None:
if hasattr(self.quant_config, "weight_block_size"):
return self.quant_config.weight_block_size is not None
elif hasattr(self.quant_config, "has_blocked_weights"):
return self.quant_config.has_blocked_weights()
return False
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
# On H100 the CUDA kernel is faster than
# native implementation
# https://github.com/vllm-project/vllm/issues/25094
if has_blocked_weights():
custom_ops = self.compilation_config.custom_ops
if "none" not in custom_ops and "-quant_fp8" not in custom_ops:
custom_ops.append("+quant_fp8")
def update_sizes_for_sequence_parallelism(self,
possible_sizes: list) -> list:
# remove the sizes that are not a multiple of tp_size when
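The `has_blocked_weights` helper in this hunk decides whether to force the `+quant_fp8` custom op by duck-typing the quant config. A hedged sketch of that check with stand-in config classes (the real ones live in `vllm.config` and the quantization backends):

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class BlockQuantConfig:              # stand-in for e.g. an FP8 quant config
    weight_block_size: Optional[list[int]] = None

@dataclass
class CompilationConfig:             # stand-in; only the field used here
    custom_ops: list[str] = field(default_factory=list)

def has_blocked_weights(quant_config) -> bool:
    if quant_config is None:
        return False
    if hasattr(quant_config, "weight_block_size"):
        return quant_config.weight_block_size is not None
    if hasattr(quant_config, "has_blocked_weights"):
        return quant_config.has_blocked_weights()
    return False

compilation = CompilationConfig()
if has_blocked_weights(BlockQuantConfig(weight_block_size=[128, 128])):
    # Only append when custom ops are not disabled outright.
    if "none" not in compilation.custom_ops and "-quant_fp8" not in compilation.custom_ops:
        compilation.custom_ops.append("+quant_fp8")
assert compilation.custom_ops == ["+quant_fp8"]
```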

View File

@ -54,7 +54,6 @@ class HTTPConnection:
stream: bool = False,
timeout: Optional[float] = None,
extra_headers: Optional[Mapping[str, str]] = None,
allow_redirects: bool = True,
):
self._validate_http_url(url)
@ -64,8 +63,7 @@ class HTTPConnection:
return client.get(url,
headers=self._headers(**extra_headers),
stream=stream,
timeout=timeout,
allow_redirects=allow_redirects)
timeout=timeout)
async def get_async_response(
self,
@ -73,7 +71,6 @@ class HTTPConnection:
*,
timeout: Optional[float] = None,
extra_headers: Optional[Mapping[str, str]] = None,
allow_redirects: bool = True,
):
self._validate_http_url(url)
@ -82,17 +79,10 @@ class HTTPConnection:
return client.get(url,
headers=self._headers(**extra_headers),
timeout=timeout,
allow_redirects=allow_redirects)
timeout=timeout)
def get_bytes(self,
url: str,
*,
timeout: Optional[float] = None,
allow_redirects: bool = True) -> bytes:
with self.get_response(url,
timeout=timeout,
allow_redirects=allow_redirects) as r:
def get_bytes(self, url: str, *, timeout: Optional[float] = None) -> bytes:
with self.get_response(url, timeout=timeout) as r:
r.raise_for_status()
return r.content
@ -102,10 +92,8 @@ class HTTPConnection:
url: str,
*,
timeout: Optional[float] = None,
allow_redirects: bool = True,
) -> bytes:
async with await self.get_async_response(
url, timeout=timeout, allow_redirects=allow_redirects) as r:
async with await self.get_async_response(url, timeout=timeout) as r:
r.raise_for_status()
return await r.read()
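This hunk threads an `allow_redirects` flag through the synchronous and async `get_*` helpers. A minimal sketch of the synchronous path using the `requests` library; the environment-variable name mirrors the `VLLM_MEDIA_URL_ALLOW_REDIRECTS` toggle added in `envs.py` further down, and the URL is only illustrative:

```python
import os
import requests

# Default-on toggle, parsed the same way as in envs.py below.
ALLOW_REDIRECTS = bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1")))

def get_bytes(url: str, *, timeout: float = 5.0,
              allow_redirects: bool = ALLOW_REDIRECTS) -> bytes:
    """Fetch raw bytes, optionally refusing to follow HTTP redirects."""
    with requests.get(url, stream=True, timeout=timeout,
                      allow_redirects=allow_redirects) as r:
        r.raise_for_status()
        return r.content

# data = get_bytes("https://example.com/image.png")  # illustrative URL
```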

View File

@ -4,7 +4,6 @@ import contextlib
import copy
import logging
import math
import os
import queue
import threading
import time
@ -21,7 +20,6 @@ import torch
import zmq
from vllm import envs
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@ -34,7 +32,7 @@ from vllm.distributed.parallel_state import (
from vllm.distributed.utils import divide
from vllm.forward_context import ForwardContext
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.platforms import _Backend, current_platform
from vllm.utils import make_zmq_path, make_zmq_socket
from vllm.v1.attention.backends.utils import get_kv_cache_layout
from vllm.v1.core.sched.output import SchedulerOutput
@ -55,12 +53,10 @@ logger = init_logger(__name__)
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
try:
from nixl._api import nixl_agent as NixlWrapper
from nixl._bindings import nixlXferTelemetry
logger.info("NIXL is available")
except ImportError:
logger.warning("NIXL is not available")
NixlWrapper = None
nixlXferTelemetry = None
try:
from nixl._api import nixl_agent_config
@ -479,9 +475,6 @@ class NixlConnectorWorker:
self.nixl_backends = \
vllm_config.kv_transfer_config.get_from_extra_config(
"backends", ["UCX"])
# TODO temporary, once nixl allows for telemetry flag in config
# (next release), we can remove this env var.
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
# Agent.
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
if nixl_agent_config is None:
@ -1181,10 +1174,9 @@ class NixlConnectorWorker:
for handle, _xfer_stime in handles:
xfer_state = self.nixl_wrapper.check_xfer_state(handle)
if xfer_state == "DONE":
# Get telemetry from NIXL
res = self.nixl_wrapper.get_xfer_telemetry(handle)
self.xfer_stats.record_transfer(res)
self.nixl_wrapper.release_xfer_handle(handle)
# TODO (NickLucche) Get from NIXL telemetry once integrated
self.xfer_stats.record_transfer()
elif xfer_state == "PROC":
in_progress = True
continue
@ -1456,25 +1448,15 @@ class NixlKVConnectorStats(KVConnectorStats):
"""Container for transfer performance metrics"""
def __post_init__(self):
if not self.data:
# Empty container init, no data is passed in.
self.reset()
if "num_successful_transfers" not in self.data:
self.data["num_successful_transfers"] = 0
def reset(self):
# Must be serializable
self.data: dict[str, list[float]] = {
"transfer_duration": [],
"post_duration": [],
"bytes_transferred": [],
"num_descriptors": [],
}
self.data = {"num_successful_transfers": 0}
def record_transfer(self, res: nixlXferTelemetry):
# Keep metrics units consistent with rest of the code: time us->s
self.data["transfer_duration"].append(res.xferDuration / 1e6)
self.data["post_duration"].append(res.postDuration / 1e6)
self.data["bytes_transferred"].append(res.totalBytes)
self.data["num_descriptors"].append(res.descCount)
def record_transfer(self):
# TODO: record actual transfer stats when available
self.data["num_successful_transfers"] += 1
def clone_and_reset(self) -> "NixlKVConnectorStats":
old = copy.copy(self)
@ -1482,55 +1464,16 @@ class NixlKVConnectorStats(KVConnectorStats):
return old
def is_empty(self) -> bool:
return self.num_successful_transfers == 0
return self.data["num_successful_transfers"] == 0
def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
if not other.is_empty():
for k, v in other.data.items():
accumulator = self.data[k]
assert isinstance(accumulator, list)
accumulator.extend(v)
self.data["num_successful_transfers"] += other.data[
"num_successful_transfers"]
return self
def reduce(self) -> dict[str, Union[int, float]]:
# Compute compact representative stats suitable for CLI logging
if self.is_empty():
return {
"Num successful transfers": 0,
"Avg xfer time (ms)": 0,
"P90 xfer time (ms)": 0,
"Avg post time (ms)": 0,
"P90 post time (ms)": 0,
"Avg MB per transfer": 0,
"Throughput (MB/s)": 0,
"Avg number of descriptors": 0,
}
xfer_time = np.asarray(self.data["transfer_duration"])
post_time = np.asarray(self.data["post_duration"])
# Convert to MB for CLI logging.
mb = np.asarray(self.data["bytes_transferred"]) / 2**20
descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
n = len(descs)
assert n == self.num_successful_transfers
total_mb = mb.sum()
avg_mb = total_mb / n
total_time_seconds = xfer_time.sum()
throughput_mb_s = total_mb / total_time_seconds
# TODO: reduce stats to a single value, calculate latency/throughput
return {
"Num successful transfers": n,
"Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
"P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
"Avg post time (ms)": round(post_time.mean() * 1e3, 3),
"P90 post time (ms)": round(np.percentile(post_time, 90) * 1e3, 3),
"Avg MB per transfer": round(avg_mb, 3),
"Throughput (MB/s)": round(throughput_mb_s, 3),
"Avg number of descriptors": round(descs.mean(), 1),
"num_successful_transfers": self.data["num_successful_transfers"]
}
@property
def num_successful_transfers(self) -> int:
return len(self.data["transfer_duration"])
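The richer side of this hunk aggregates per-transfer NIXL telemetry and reduces it to a compact dict for CLI logging. A hedged sketch of that reduce step over locally recorded numbers (the field names mirror the hunk; the sample values are made up):

```python
import numpy as np

# Sample telemetry, one entry per transfer: durations in seconds, sizes in bytes.
data = {
    "transfer_duration": [0.012, 0.030, 0.021],
    "post_duration": [0.001, 0.002, 0.001],
    "bytes_transferred": [8 << 20, 16 << 20, 12 << 20],
    "num_descriptors": [64, 128, 96],
}

xfer_time = np.asarray(data["transfer_duration"])
post_time = np.asarray(data["post_duration"])
mb = np.asarray(data["bytes_transferred"]) / 2**20   # bytes -> MB
descs = np.asarray(data["num_descriptors"])

summary = {
    "Num successful transfers": len(descs),
    "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
    "P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
    "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
    "Avg MB per transfer": round(mb.sum() / len(descs), 3),
    "Throughput (MB/s)": round(mb.sum() / xfer_time.sum(), 3),
    "Avg number of descriptors": round(descs.mean(), 1),
}
print(summary)
```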

View File

@ -376,7 +376,7 @@ class EngineArgs:
quantization: Optional[QuantizationMethods] = ModelConfig.quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = \
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
media_io_kwargs: dict[str, dict[str,
@ -1131,10 +1131,6 @@ class EngineArgs:
device_config = DeviceConfig(
device=cast(Device, current_platform.device_type))
model_config = self.create_model_config()
self.model = model_config.model
self.tokenizer = model_config.tokenizer
(self.model, self.tokenizer,
self.speculative_config) = maybe_override_with_speculators(
model=self.model,
@ -1143,6 +1139,7 @@ class EngineArgs:
trust_remote_code=self.trust_remote_code,
vllm_speculative_config=self.speculative_config,
)
model_config = self.create_model_config()
# * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
# and fall back to V0 for experimental or unsupported features.

View File

@ -19,7 +19,6 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tasks import SupportedTask
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import Device, collect_from_async_generator, random_uuid
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)
@ -50,16 +49,12 @@ class EngineClient(ABC):
@abstractmethod
def generate(
self,
prompt: Union[EngineCoreRequest, PromptType],
prompt: PromptType,
sampling_params: SamplingParams,
request_id: str,
*,
prompt_text: Optional[str] = None,
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request."""
...
@ -99,15 +94,10 @@ class EngineClient(ABC):
# this happens again in generation, so the double expansion causes
# a mismatch.
# TODO - would be ideal to handle this more gracefully.
if isinstance(prompt, str):
prompt_text = prompt
prompt_token_ids = []
multi_modal_data = None
else:
prompt_text = prompt.get("prompt")
prompt_token_ids = prompt.get("prompt_token_ids", [])
multi_modal_data = prompt.get("multi_modal_data")
prompt_token_ids = prompt.get("prompt_token_ids")
multi_modal_data = prompt.get("multi_modal_data")
prompt_text = processed_inputs.get("prompt")
mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
tokenized_length = len(prompt_token_ids)

View File

@ -37,7 +37,6 @@ from vllm.entrypoints.utils import (_validate_truncation_size,
log_non_default_args)
from vllm.inputs import (DataPrompt, PromptType, SingletonPrompt, TextPrompt,
TokensPrompt)
from vllm.inputs.parse import get_prompt_components
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.quantization import QuantizationMethods
@ -50,13 +49,10 @@ from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
SamplingParams)
from vllm.tasks import PoolingTask
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer,
init_tokenizer_from_configs)
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device, as_iter, is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.engine.processor import Processor
from vllm.v1.sample.logits_processor import LogitsProcessor
if TYPE_CHECKING:
@ -316,10 +312,6 @@ class LLM:
self.io_processor = get_io_processor(self.llm_engine.vllm_config,
io_processor_plugin)
@property
def model_config(self):
return self.llm_engine.model_config
def get_tokenizer(self) -> AnyTokenizer:
return self.llm_engine.get_tokenizer()
@ -332,16 +324,6 @@ class LLM:
else:
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
def _get_processor(self) -> Processor:
if not hasattr(self, "_processor"):
vllm_config = self.llm_engine.vllm_config
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self._processor = Processor(vllm_config, tokenizer)
return self._processor
def get_default_sampling_params(self) -> SamplingParams:
if self.default_sampling_params is None:
self.default_sampling_params = (
@ -1515,6 +1497,8 @@ class LLM:
tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
it = tqdm_func(it, desc="Adding requests")
model_config = self.llm_engine.model_config
for i, prompt in enumerate(it):
if isinstance(prompt, dict):
@ -1522,9 +1506,17 @@ class LLM:
prompt.get("multi_modal_data"),
prompt.get("multi_modal_uuids"))
param = params[i] if isinstance(params, Sequence) else params
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(model_config.max_model_len,
param.truncate_prompt_tokens,
tokenization_kwargs)
self._add_request(
prompt,
params[i] if isinstance(params, Sequence) else params,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request[i] if isinstance(
lora_request, Sequence) else lora_request,
priority=priority[i] if priority else 0,
@ -1565,58 +1557,22 @@ class LLM:
raise ValueError(f"Multi-modal data for {modality} is None"
f" but UUID is not provided")
def _process_inputs(
self,
request_id: str,
engine_prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
*,
lora_request: Optional[LoRARequest],
priority: int,
) -> tuple[EngineCoreRequest, dict[str, Any]]:
"""Use the Processor to process inputs for LLMEngine."""
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.model_config.max_model_len,
params.truncate_prompt_tokens,
tokenization_kwargs)
processor = self._get_processor()
engine_request = processor.process_inputs(
request_id,
engine_prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
)
return engine_request, tokenization_kwargs
def _add_request(
self,
prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
priority: int = 0,
) -> None:
prompt_text, _, _ = get_prompt_components(prompt)
request_id = str(next(self.request_counter))
engine_request, tokenization_kwargs = self._process_inputs(
self.llm_engine.add_request(
request_id,
prompt,
params,
lora_request=lora_request,
priority=priority,
)
self.llm_engine.add_request(
request_id,
engine_request,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
prompt_text=prompt_text,
)
def _run_engine(

View File

@ -274,8 +274,7 @@ class OpenAIServingChat(OpenAIServing):
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, _, _ = (self._get_prompt_components(
request_prompts[i]))
sampling_params: Union[SamplingParams, BeamSearchParams]
if self.default_sampling_params is None:
self.default_sampling_params = {}
@ -286,7 +285,6 @@ class OpenAIServingChat(OpenAIServing):
input_length=len(engine_prompt["prompt_token_ids"]),
default_sampling_params=self.default_sampling_params)
sampling_params: Union[SamplingParams, BeamSearchParams]
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
max_tokens, self.default_sampling_params)
@ -311,25 +309,13 @@ class OpenAIServingChat(OpenAIServing):
lora_request=lora_request,
)
else:
engine_request, tokenization_kwargs = (
await self._process_inputs(
request_id,
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
))
generator = self.engine_client.generate(
engine_request,
engine_prompt,
sampling_params,
request_id,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
generators.append(generator)
@ -691,13 +677,11 @@ class OpenAIServingChat(OpenAIServing):
if self.use_harmony:
harmony_parser = harmony_parsers[i]
prev_recipient = harmony_parser.current_recipient
delta_text = ""
for token_id in output.token_ids:
harmony_parser.process(token_id)
delta_text += (harmony_parser.last_content_delta
or "")
cur_channel = harmony_parser.current_channel
cur_recipient = harmony_parser.current_recipient
delta_text = harmony_parser.last_content_delta or ""
else:
delta_text = output.text
@ -1591,9 +1575,7 @@ class OpenAIServingChat(OpenAIServing):
sys_msg = get_system_message(
reasoning_effort=request.reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=request.tools is not None
)
python_description=None)
messages.append(sys_msg)
# Add developer message.

View File

@ -9,6 +9,7 @@ from typing import Optional, Union, cast
import jinja2
from fastapi import Request
from typing_extensions import assert_never
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
@ -31,7 +32,8 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
is_tokens_prompt)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
@ -155,16 +157,23 @@ class OpenAIServingCompletion(OpenAIServing):
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
prompt_text, prompt_token_ids, prompt_embeds = (
self._get_prompt_components(engine_prompt))
input_length = None
if prompt_token_ids is not None:
input_length = len(prompt_token_ids)
elif prompt_embeds is not None:
input_length = len(prompt_embeds)
sampling_params: Union[SamplingParams, BeamSearchParams]
# Mypy does not infer that engine_prompt will have only one of
# "prompt_token_ids" or "prompt_embeds" defined, and both of
# these as Union[object, the expected type], where it infers
# object if engine_prompt is a subclass of one of the
# typeddicts that defines both keys. Worse, because of
# https://github.com/python/mypy/issues/8586, mypy does not
# infer the type of engine_prompt correctly because of the
# enumerate. So we need an unnecessary cast here.
engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt],
engine_prompt)
if is_embeds_prompt(engine_prompt):
input_length = len(engine_prompt["prompt_embeds"])
elif is_tokens_prompt(engine_prompt):
input_length = len(engine_prompt["prompt_token_ids"])
else:
raise NotImplementedError
assert_never(engine_prompt)
if self.default_sampling_params is None:
self.default_sampling_params = {}
@ -176,7 +185,6 @@ class OpenAIServingCompletion(OpenAIServing):
default_sampling_params=self.default_sampling_params,
)
sampling_params: Union[SamplingParams, BeamSearchParams]
if request.use_beam_search:
sampling_params = request.to_beam_search_params(
max_tokens, self.default_sampling_params)
@ -212,25 +220,13 @@ class OpenAIServingCompletion(OpenAIServing):
lora_request=lora_request,
)
else:
engine_request, tokenization_kwargs = (
await self._process_inputs(
request_id_item,
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
))
generator = self.engine_client.generate(
engine_request,
engine_prompt,
sampling_params,
request_id_item,
lora_request=lora_request,
trace_headers=trace_headers,
priority=request.priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
generators.append(generator)

View File

@ -15,11 +15,6 @@ from pydantic import BaseModel, ConfigDict, Field
from starlette.datastructures import Headers
from typing_extensions import TypeIs
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.processor import Processor
if sys.version_info >= (3, 12):
from typing import TypedDict
else:
@ -68,7 +63,6 @@ from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer,
# yapf: enable
from vllm.inputs.data import PromptType
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import PromptComponents, get_prompt_components
from vllm.logger import init_logger
from vllm.logprobs import Logprob, PromptLogprobs
from vllm.lora.request import LoRARequest
@ -245,16 +239,6 @@ class OpenAIServing:
AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack
async def _get_processor(self) -> Processor:
if not hasattr(self, "_processor"):
vllm_config = await self.engine_client.get_vllm_config()
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self._processor = Processor(vllm_config, tokenizer)
return self._processor
def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
"""
Get a Renderer instance with the provided tokenizer.
@ -866,34 +850,6 @@ class OpenAIServing:
return conversation, [request_prompt], [engine_prompt]
async def _process_inputs(
self,
request_id: str,
engine_prompt: PromptType,
params: Union[SamplingParams, PoolingParams],
*,
lora_request: Optional[LoRARequest],
trace_headers: Optional[Mapping[str, str]],
priority: int,
) -> tuple[EngineCoreRequest, dict[str, Any]]:
"""Use the Processor to process inputs for AsyncLLM."""
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.max_model_len,
params.truncate_prompt_tokens,
tokenization_kwargs)
processor = await self._get_processor()
engine_request = processor.process_inputs(
request_id,
engine_prompt,
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
trace_headers=trace_headers,
priority=priority,
)
return engine_request, tokenization_kwargs
async def _generate_with_builtin_tools(
self,
request_id: str,
@ -905,7 +861,6 @@ class OpenAIServing:
priority: int = 0,
**kwargs,
):
prompt_text, _, _ = self._get_prompt_components(request_prompt)
orig_priority = priority
while True:
self._log_inputs(
@ -914,27 +869,14 @@ class OpenAIServing:
params=sampling_params,
lora_request=lora_request,
)
trace_headers = kwargs.get("trace_headers")
engine_request, tokenization_kwargs = (await self._process_inputs(
request_id,
generator = self.engine_client.generate(
engine_prompt,
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
priority=priority,
))
generator = self.engine_client.generate(
engine_request,
sampling_params,
request_id,
lora_request=lora_request,
priority=priority,
prompt_text=prompt_text,
tokenization_kwargs=tokenization_kwargs,
**kwargs,
)
async for res in generator:
context.append_output(res)
# NOTE(woosuk): The stop condition is handled by the engine.
@ -963,15 +905,6 @@ class OpenAIServing:
# OPTIMIZATION
priority = orig_priority - 1
def _get_prompt_components(
self,
prompt: Union[RequestPrompt, PromptType],
) -> PromptComponents:
if isinstance(prompt, list):
return PromptComponents(token_ids=prompt)
return get_prompt_components(prompt) # type: ignore[arg-type]
def _log_inputs(
self,
request_id: str,
@ -982,9 +915,14 @@ class OpenAIServing:
) -> None:
if self.request_logger is None:
return
prompt, prompt_token_ids, prompt_embeds = (
self._get_prompt_components(inputs))
prompt, prompt_token_ids, prompt_embeds = None, None, None
if isinstance(inputs, str):
prompt = inputs
elif isinstance(inputs, list):
prompt_token_ids = inputs
else:
prompt = getattr(inputs, 'prompt', None)
prompt_token_ids = getattr(inputs, 'prompt_token_ids', None)
self.request_logger.log_inputs(
request_id,

View File

@ -445,19 +445,6 @@ class OpenAIServingResponses(OpenAIServing):
return messages, [prompt_token_ids], [engine_prompt]
async def _initialize_tool_sessions(self, request: ResponsesRequest,
context: ConversationContext,
exit_stack: AsyncExitStack):
# we should only initialize the tool session if the request needs tools
if len(request.tools) == 0:
return
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
async def responses_full_generator(
self,
request: ResponsesRequest,
@ -474,8 +461,12 @@ class OpenAIServingResponses(OpenAIServing):
async with AsyncExitStack() as exit_stack:
try:
await self._initialize_tool_sessions(request, context,
exit_stack)
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
async for _ in result_generator:
pass
except asyncio.CancelledError:
@ -1659,10 +1650,12 @@ class OpenAIServingResponses(OpenAIServing):
async with AsyncExitStack() as exit_stack:
processer = None
if self.use_harmony:
# TODO: in streaming, we noticed this bug:
# https://github.com/vllm-project/vllm/issues/25697
await self._initialize_tool_sessions(request, context,
exit_stack)
mcp_tools = {
tool.server_label: tool
for tool in request.tools if tool.type == "mcp"
}
await context.init_tool_sessions(self.tool_server, exit_stack,
request.request_id, mcp_tools)
processer = self._process_harmony_streaming_events
else:
processer = self._process_simple_streaming_events

View File

@ -68,7 +68,6 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
@ -619,9 +618,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# All possible options loaded dynamically from _Backend enum
"VLLM_ATTENTION_BACKEND":
env_with_choices("VLLM_ATTENTION_BACKEND", None,
lambda: list(__import__(
'vllm.attention.backends.registry',
fromlist=['_Backend'])._Backend.__members__.keys())),
lambda: list(__import__('vllm.platforms.interface', \
fromlist=['_Backend'])._Backend.__members__.keys())),
# If set, vllm will use flashinfer sampler
"VLLM_USE_FLASHINFER_SAMPLER":
@ -735,11 +733,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
# Whether to allow HTTP redirects when fetching from media URLs.
# Default to True
"VLLM_MEDIA_URL_ALLOW_REDIRECTS":
lambda: bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))),
# Max number of workers for the thread pool handling
# media bytes loading. Set to 1 to disable parallel processing.
# Default is 8
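The `VLLM_ATTENTION_BACKEND` entry passes a lambda so the list of valid choices is resolved only when the variable is actually read, avoiding a heavy import at module load time. A hedged sketch of that pattern with a hypothetical `env_with_choices`-style helper (the real helper lives in vLLM's `envs.py`; this is not its exact signature, and the backend names are illustrative):

```python
import os
from typing import Callable, Optional, Union

def env_with_choices(
    name: str,
    default: Optional[str],
    choices: Union[list[str], Callable[[], list[str]]],
) -> Callable[[], Optional[str]]:
    """Return a reader that validates the env var against lazily resolved choices."""
    def read() -> Optional[str]:
        value = os.getenv(name, default)
        if value is None:
            return None
        valid = choices() if callable(choices) else choices
        if value not in valid:
            raise ValueError(f"{name}={value!r} is not one of {valid}")
        return value
    return read

# The callable defers resolving the choices until the first read.
read_backend = env_with_choices(
    "VLLM_ATTENTION_BACKEND",
    None,
    lambda: ["FLASH_ATTN", "FLASHINFER", "TORCH_SDPA"],  # illustrative choices
)
print(read_backend())
```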

View File

@ -205,6 +205,11 @@ class TokenInputs(TypedDict):
prompt_token_ids: list[int]
"""The token IDs of the prompt."""
prompt: NotRequired[str]
"""
The original prompt text corresponding to the token IDs, if available.
"""
cache_salt: NotRequired[str]
"""
Optional cache salt to be used for prefix caching.
@ -213,12 +218,15 @@ class TokenInputs(TypedDict):
def token_inputs(
prompt_token_ids: list[int],
prompt: Optional[str] = None,
cache_salt: Optional[str] = None,
) -> TokenInputs:
"""Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
values."""
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
if prompt is not None:
inputs["prompt"] = prompt
if cache_salt is not None:
inputs["cache_salt"] = cache_salt

View File

@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import (TYPE_CHECKING, Literal, NamedTuple, Optional, TypedDict,
Union, cast, overload)
from typing import Literal, Optional, TypedDict, Union, cast, overload
from typing_extensions import TypeIs
@ -12,9 +11,6 @@ from .data import (EmbedsPrompt, ExplicitEncoderDecoderPrompt, ProcessorInputs,
PromptType, SingletonInputs, SingletonPrompt, TextPrompt,
TokensPrompt)
if TYPE_CHECKING:
import torch
class ParsedText(TypedDict):
content: str
@ -153,23 +149,3 @@ def split_enc_dec_inputs(
)
return None, inputs
class PromptComponents(NamedTuple):
text: Optional[str] = None
token_ids: Optional[list[int]] = None
embeds: Optional["torch.Tensor"] = None
def get_prompt_components(prompt: PromptType) -> PromptComponents:
if isinstance(prompt, str):
return PromptComponents(text=prompt)
if (encoder_prompt := prompt.get("encoder_prompt")):
return get_prompt_components(encoder_prompt) # type: ignore[arg-type]
return PromptComponents(
text=prompt.get("prompt"), # type: ignore[arg-type]
token_ids=prompt.get("prompt_token_ids"), # type: ignore[arg-type]
embeds=prompt.get("prompt_embeds"),
)
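The `get_prompt_components` helper in this hunk collapses the prompt flavors (plain string, tokens/embeds dicts, and encoder-decoder wrappers) into one NamedTuple. A hedged usage sketch of that shape, re-declared locally so it runs without vLLM installed; the untyped `prompt` argument stands in for the real `PromptType`:

```python
from typing import Any, NamedTuple, Optional

class PromptComponents(NamedTuple):
    text: Optional[str] = None
    token_ids: Optional[list[int]] = None
    embeds: Optional[Any] = None   # torch.Tensor in the real code

def get_prompt_components(prompt) -> PromptComponents:
    if isinstance(prompt, str):
        return PromptComponents(text=prompt)
    if (encoder_prompt := prompt.get("encoder_prompt")) is not None:
        # Encoder-decoder prompts delegate to their encoder part.
        return get_prompt_components(encoder_prompt)
    return PromptComponents(
        text=prompt.get("prompt"),
        token_ids=prompt.get("prompt_token_ids"),
        embeds=prompt.get("prompt_embeds"),
    )

assert get_prompt_components("hello").text == "hello"
assert get_prompt_components({"prompt_token_ids": [1, 2]}).token_ids == [1, 2]
assert get_prompt_components(
    {"encoder_prompt": {"prompt": "enc"}, "decoder_prompt": None}).text == "enc"
```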

View File

@ -16,10 +16,9 @@ from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
ProcessorInputs, PromptType, SingletonInputs,
SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
embeds_inputs, token_inputs)
EncoderDecoderInputs, ProcessorInputs, PromptType,
SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
TokensPrompt, embeds_inputs, token_inputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
logger = init_logger(__name__)
@ -323,7 +322,7 @@ class InputPreprocessor:
mm_uuids=mm_uuids,
)
else:
inputs = token_inputs(prompt_token_ids)
inputs = token_inputs(prompt_token_ids=prompt_token_ids)
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
@ -353,7 +352,10 @@ class InputPreprocessor:
prompt_text,
tokenization_kwargs=tokenization_kwargs,
)
inputs = token_inputs(prompt_token_ids)
inputs = token_inputs(
prompt=prompt_text,
prompt_token_ids=prompt_token_ids,
)
if cache_salt := parsed_content.get("cache_salt"):
inputs["cache_salt"] = cache_salt
@ -471,17 +473,22 @@ class InputPreprocessor:
decoder_inputs: SingletonInputs
if inputs["type"] == "multimodal": # Multimodal data inputs
if "encoder_prompt_token_ids" not in inputs:
if not ("encoder_prompt" in inputs
and "encoder_prompt_token_ids" in inputs):
raise RuntimeError("You should register an encoder-decoder "
"multi-modal processor for encoder-decoder "
"models.")
inputs = cast(MultiModalEncDecInputs, inputs)
encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])
encoder_inputs = token_inputs(
prompt=inputs["encoder_prompt"],
prompt_token_ids=inputs["encoder_prompt_token_ids"],
)
decoder_prompt_inputs = decoder_inputs_to_override or inputs
decoder_inputs = MultiModalInputs(
type="multimodal",
prompt=decoder_prompt_inputs.get("prompt", ""),
prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
mm_kwargs=inputs["mm_kwargs"],
mm_hashes=inputs["mm_hashes"],
@ -491,7 +498,7 @@ class InputPreprocessor:
decoder_inputs["cache_salt"] = cache_salt
elif inputs["type"] == "token": # Text-only inputs
encoder_inputs = token_inputs(prompt_token_ids=[])
encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
decoder_inputs = decoder_inputs_to_override or inputs
else:
assert_never(inputs) # type: ignore[arg-type]
@ -542,14 +549,12 @@ class InputPreprocessor:
decoder_inputs: Optional[SingletonInputs]
if is_explicit_encoder_decoder_prompt(prompt):
# `cast` is needed for mypy, but not pyright
prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
encoder_inputs = self._prompt_to_llm_inputs(
prompt_["encoder_prompt"],
prompt["encoder_prompt"],
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
if (decoder_input := prompt_["decoder_prompt"]) is None:
if (decoder_input := prompt["decoder_prompt"]) is None:
decoder_inputs = None
else:
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
@ -560,9 +565,8 @@ class InputPreprocessor:
self._split_enc_dec_mm_inputs(encoder_inputs,
decoder_inputs))
else:
# `cast` is needed for mypy, but not pyright
inputs = self._prompt_to_llm_inputs(
cast(SingletonPrompt, prompt),
prompt,
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
@ -637,9 +641,8 @@ class InputPreprocessor:
"to decoder-only models")
# Decoder-only operation
# `cast` is needed for mypy, but not pyright
return self._process_decoder_only_prompt(
cast(SingletonPrompt, prompt),
prompt,
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)

View File

@ -355,7 +355,7 @@ def batched_triton_kernel(
def invoke_moe_batched_triton_kernel(
A: torch.Tensor, # [E, max_tokens, K]
B: torch.Tensor, # [E, N, K]
B: torch.Tensor, # [E, K, N]
C: torch.Tensor, # [E, max_tokens, N]
expert_num_tokens: torch.Tensor, # [E]
compute_type: tl.dtype,

View File

@ -4,6 +4,9 @@
import functools
import json
import os
# torch.compile needs typing.List. It will fail torch.library.infer_schema
# otherwise
from typing import List # noqa: UP035
from typing import Any, Callable, Optional, Union
import torch
@ -1226,7 +1229,7 @@ def inplace_fused_experts(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> None:
@ -1260,7 +1263,7 @@ def inplace_fused_experts_fake(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> None:
@ -1299,7 +1302,7 @@ def outplace_fused_experts(
w2_zp: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
block_shape: Optional[list[int]] = None,
block_shape: Optional[List[int]] = None, #noqa: UP006
w1_bias: Optional[torch.Tensor] = None,
w2_bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:

View File

@ -960,7 +960,6 @@ class FusedMoE(CustomOp):
is_sequence_parallel=False,
zero_expert_num: Optional[int] = 0,
zero_expert_type: Optional[str] = None,
expert_mapping: Optional[list[tuple[str, str, int, str]]] = None,
):
super().__init__()
if params_dtype is None:
@ -997,9 +996,6 @@ class FusedMoE(CustomOp):
self.zero_expert_num = zero_expert_num
self.zero_expert_type = zero_expert_type
# Expert mapping used in self.load_weights
self.expert_mapping = expert_mapping
# Round up hidden size if needed.
hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype,
quant_config,
@ -1621,33 +1617,6 @@ class FusedMoE(CustomOp):
return False if return_success else None
def load_weights(
self, weights: Iterable[tuple[str,
torch.Tensor]]) -> Iterable[str]:
if (expert_mapping := self.expert_mapping) is None:
raise ValueError("`self.expert_mapping` must be provided to "
"load weights using `self.load_weights`.")
for expert_name, loaded_weight in weights:
qual_name = f"{self.layer_name}.{expert_name}"
for param_name, weight_name, expert_id, shard_id in expert_mapping:
if weight_name not in qual_name:
continue
weight_name = qual_name.replace(weight_name, param_name)
param_name = weight_name.removeprefix(f"{self.layer_name}.")
param = getattr(self, param_name)
success = self.weight_loader(
param=param,
loaded_weight=loaded_weight,
weight_name=weight_name,
shard_id=shard_id,
expert_id=expert_id,
return_success=True,
)
if success:
logger.debug("Loaded %s for expert %d into %s", param_name,
expert_id, self.layer_name)
yield param_name
def get_expert_weights(self) -> Iterable[torch.Tensor]:
weights = list(self.named_parameters())
assert all(weight.is_contiguous() for _, weight in weights)
@ -1930,15 +1899,6 @@ class FusedMoE(CustomOp):
staged_hidden_states.copy_(hidden_states, non_blocking=True)
staged_router_logits.copy_(router_logits, non_blocking=True)
# If there are shared experts but we are not using a modular kernel,
# the shared experts must be called here
if (not isinstance(self.quant_method.fused_experts,
FusedMoEModularKernel)
and self.shared_experts is not None):
shared_output = self.shared_experts(staged_hidden_states)
else:
shared_output = None
# Matrix multiply.
final_hidden_states = self.quant_method.apply(
layer=self,
@ -1962,13 +1922,8 @@ class FusedMoE(CustomOp):
logical_replica_count=self.logical_replica_count,
)
if shared_output is not None:
assert not isinstance(final_hidden_states, tuple)
assert self.shared_experts is not None
final_hidden_states = (
shared_output,
final_hidden_states,
)
assert self.shared_experts is None or isinstance(
final_hidden_states, tuple)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
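The `load_weights` method in this hunk routes checkpoint expert tensors through an `expert_mapping` of `(param_name, weight_name, expert_id, shard_id)` tuples. A hedged sketch of just the name-matching part, with illustrative mapping entries and string stand-ins for tensors (the real loader calls `self.weight_loader` on actual parameters):

```python
# Illustrative mapping entries; real ones come from the model's weight mapping.
expert_mapping = [
    ("w13_weight", "experts.0.gate_proj.weight", 0, "w1"),
    ("w13_weight", "experts.0.up_proj.weight", 0, "w3"),
    ("w2_weight", "experts.0.down_proj.weight", 0, "w2"),
]

layer_name = "model.layers.0.mlp"
weights = {
    f"{layer_name}.experts.0.up_proj.weight": "tensor-up",
    f"{layer_name}.experts.0.down_proj.weight": "tensor-down",
}

loaded = []
for qual_name, tensor in weights.items():
    for param_name, weight_name, expert_id, shard_id in expert_mapping:
        if weight_name not in qual_name:
            continue
        # Target parameter name, relative to the layer (as in the hunk).
        target = qual_name.replace(weight_name, param_name)
        target = target.removeprefix(f"{layer_name}.")
        loaded.append((target, expert_id, shard_id, tensor))

print(loaded)
# [('w13_weight', 0, 'w3', 'tensor-up'), ('w2_weight', 0, 'w2', 'tensor-down')]
```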

View File

@ -115,7 +115,7 @@ class ShortConv(MambaBase, CustomOp):
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
conv_state = self_kv_cache[0].transpose(-1, -2)
state_indices_tensor = attn_metadata.state_indices_tensor
has_initial_states_p = attn_metadata.has_initial_states_p
has_initial_states_p = attn_metadata.has_initial_states
BCx, _ = self.in_proj(hidden_states)

View File

@ -644,14 +644,6 @@ class CompressedTensorsConfig(QuantizationConfig):
# If no matches, return None
return None
def has_blocked_weights(self) -> bool:
for scheme in self.target_scheme_map.values():
weight_quant = scheme.get("weights")
if (weight_quant is not None
and weight_quant.strategy == QuantizationStrategy.BLOCK):
return True
return False
@staticmethod
def supports_cutlass_24(
weight_quant: Optional[QuantizationArgs],

View File

@ -11,7 +11,7 @@ from torch.nn import Parameter
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
apply_fp8_block_linear, check_aiter_fp8_linear_support,
create_fp8_input_scale, create_fp8_scale_parameter,
create_fp8_weight_parameter, maybe_post_process_fp8_weight_block,
process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy,
@ -41,30 +41,16 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
self.strategy = weight_quant.strategy
self.out_dtype = torch.get_default_dtype()
self.is_static_input_scheme = is_static_input_scheme
self.act_q_group_shape = GroupShape.PER_TENSOR \
if is_static_input_scheme else GroupShape.PER_TOKEN
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.is_static_input_scheme,
act_quant_group_shape=self.act_q_group_shape)
self.weight_block_size = self.weight_quant.block_structure
if self.weight_block_size is not None:
self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
else:
self.act_q_group_shape = GroupShape.PER_TENSOR \
if is_static_input_scheme else GroupShape.PER_TOKEN
self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
if self.weight_block_size is not None:
assert not self.is_static_input_scheme
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(*self.weight_block_size),
act_quant_group_shape=self.act_q_group_shape,
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
)
else:
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.is_static_input_scheme,
act_quant_group_shape=self.act_q_group_shape)
@classmethod
def get_min_capability(cls) -> int:
# lovelace and up
@ -156,14 +142,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if self.weight_block_size is not None:
return self.w8a8_block_fp8_linear.apply(
if layer.weight_block_size is not None:
return apply_fp8_block_linear(
layer,
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=layer.input_scale,
bias=bias,
)
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported)
return self.fp8_linear.apply(input=x,
weight=layer.weight,
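Both versions of `Fp8LinearMethod.__init__` (and the compressed-tensors scheme above) pick the activation-quantization group shape from the weight layout: blocked weights quantize activations in `(1, block)` groups, otherwise per-token when quantization is dynamic and CUTLASS is available, else per-tensor. A hedged sketch of that decision with a stand-in `GroupShape` (the real sentinels live in vLLM's quantization utils):

```python
from typing import NamedTuple, Optional

class GroupShape(NamedTuple):
    rows: int
    cols: int

PER_TENSOR = GroupShape(-1, -1)   # stand-in sentinels
PER_TOKEN = GroupShape(1, -1)

def pick_act_group_shape(
    weight_block_size: Optional[list[int]],
    act_q_static: bool,
    cutlass_fp8_supported: bool,
) -> GroupShape:
    if weight_block_size is not None:
        # Match the weight block granularity (same index as in the hunk).
        return GroupShape(1, weight_block_size[0])
    if not act_q_static and cutlass_fp8_supported:
        return PER_TOKEN
    return PER_TENSOR

assert pick_act_group_shape([128, 128], False, True) == GroupShape(1, 128)
assert pick_act_group_shape(None, False, True) == PER_TOKEN
assert pick_act_group_shape(None, True, True) == PER_TENSOR
```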

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
apply_fp8_block_linear, check_aiter_fp8_linear_support,
create_fp8_input_scale, create_fp8_scale_parameter,
create_fp8_weight_parameter, expert_weight_is_col_major,
maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
@ -242,28 +242,15 @@ class Fp8LinearMethod(LinearMethodBase):
self.weight_block_size = self.quant_config.weight_block_size
self.block_quant = self.weight_block_size is not None
self.act_q_static = self.quant_config.activation_scheme == "static"
if self.weight_block_size:
self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
# Use per-token quantization for better perf if dynamic and cutlass
if not self.act_q_static and cutlass_fp8_supported():
self.act_q_group_shape = GroupShape.PER_TOKEN
else:
# Use per-token quantization for better perf if dynamic and cutlass
if not self.act_q_static and cutlass_fp8_supported():
self.act_q_group_shape = GroupShape.PER_TOKEN
else:
self.act_q_group_shape = GroupShape.PER_TENSOR
self.act_q_group_shape = GroupShape.PER_TENSOR
if self.block_quant:
assert not self.act_q_static
assert self.weight_block_size is not None
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(*self.weight_block_size),
act_quant_group_shape=self.act_q_group_shape,
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
)
else:
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.act_q_static,
act_quant_group_shape=self.act_q_group_shape)
self.fp8_linear = Fp8LinearOp(
act_quant_static=self.act_q_static,
act_quant_group_shape=self.act_q_group_shape)
def create_weights(
self,
@ -412,15 +399,12 @@ class Fp8LinearMethod(LinearMethodBase):
bias=bias)
if self.block_quant:
assert self.weight_block_size is not None
return self.w8a8_block_fp8_linear.apply(
return apply_fp8_block_linear(
layer,
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=layer.input_scale,
bias=bias,
)
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported)
return self.fp8_linear.apply(input=x,
weight=layer.weight,

Some files were not shown because too many files have changed in this diff.