mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-22 16:04:36 +08:00)

Compare commits: debug-logs...use-uv-pyt (1 commit)

Commit 728c365e4d
@@ -181,14 +181,18 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="vllm serve $model \
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi

@@ -365,7 +365,8 @@ run_serving_tests() {
continue
fi

server_command="$server_envs vllm serve \
server_command="$server_envs python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"

# run the server
@@ -1,191 +0,0 @@
#!/bin/bash

# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT

GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi

# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"

# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}

# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}

# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi

image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}

# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*

# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope

WORKDIR /workspace/vllm

# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt

COPY . .

# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton

# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend

# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True

WORKDIR /workspace/vllm-ascend

CMD ["/bin/bash"]

EOF

# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT

# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi

local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done

# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}

devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
vllm serve meta-llama/Llama-2-7b-chat-hf &
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
30 .github/mergify.yml vendored

@@ -2,7 +2,6 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
- label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/

@@ -15,7 +14,6 @@ pull_request_rules:
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
- label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/

@@ -32,7 +30,6 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py

@@ -49,7 +46,6 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
- label != stale
- files~=^vllm/entrypoints/
actions:
label:

@@ -59,7 +55,6 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
- label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py

@@ -75,7 +70,6 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/

@@ -89,7 +83,6 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
- label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py

@@ -101,7 +94,6 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
- label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/

@@ -115,7 +107,6 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
- label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py

@@ -130,7 +121,6 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py

@@ -152,7 +142,6 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
- label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm

@@ -173,7 +162,6 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py

@@ -193,7 +181,6 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/

@@ -209,7 +196,6 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
- label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/

@@ -222,7 +208,6 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- label != stale
- or:
- files~=tpu.py
- files~=_tpu

@@ -238,7 +223,6 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- label != stale
- and:
- -files~=tpu.py
- -files~=_tpu

@@ -253,7 +237,6 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- label != stale
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/

@@ -272,9 +255,8 @@ pull_request_rules:

- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- label != stale
- conflict
- -closed
- conflict
- -closed
actions:
label:
add:

@@ -288,8 +270,6 @@ pull_request_rules:

- name: assign reviewer for tensorizer changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py

@@ -301,7 +281,6 @@ pull_request_rules:

- name: assign reviewer for modelopt changes
conditions:
- label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$

@@ -316,8 +295,8 @@ pull_request_rules:

- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- -conflict
- -closed
- -conflict
- -closed
actions:
label:
remove:

@@ -326,7 +305,6 @@ pull_request_rules:
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- label != stale
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
@@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv

@@ -158,7 +158,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
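For context, both names above refer to the same blockwise-scaled FP8 GEMM: the operands are quantized tensors plus per-block scales, and the kernel multiplies after (conceptually) dequantizing each block. A minimal NumPy sketch of that math follows; it is an illustration only, and the scale layouts (one scale per 128-wide group of each row of A, one scale per 128x128 block of B) are assumptions for the sketch rather than the exact vLLM API contract.

```python
import numpy as np

def block_scaled_matmul_reference(a_q, a_scale, b_q, b_scale, block=(128, 128)):
    """Illustrative reference: dequantize per block, then matmul (not the vLLM kernel)."""
    bk, bn = block
    a = a_q * np.repeat(a_scale, bk, axis=1)       # one scale per 128-wide group of each row
    b = b_q * np.kron(b_scale, np.ones((bk, bn)))  # one scale per 128x128 block of B
    return a @ b

# Toy shapes: M=4, K=256, N=256 -> A scales [4, 2], B scales [2, 2]
a_q = np.random.randn(4, 256).astype(np.float32)   # stand-in for FP8 data
a_s = np.random.rand(4, 2).astype(np.float32)
b_q = np.random.randn(256, 256).astype(np.float32)
b_s = np.random.rand(2, 2).astype(np.float32)
print(block_scaled_matmul_reference(a_q, a_s, b_q, b_s).shape)  # (4, 256)
```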
@@ -55,7 +55,9 @@ benchmark() {
output_len=$2

CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \

@@ -63,7 +65,9 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -38,12 +38,16 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \

@@ -58,14 +62,18 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

CUDA_VISIBLE_DEVICES=1 vllm serve $model \
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -1,174 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

import random
import time

import torch
from tabulate import tabulate

from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

logger = init_logger(__name__)

@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
num_iters: int,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""

if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

current_platform.seed_everything(42)
torch.set_default_device(device)

# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)

# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

key_caches, value_caches = create_kv_caches_with_random(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches

# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)

function_under_test = lambda: ops.reshape_and_cache(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)

if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()

def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters

# warm-up
run_cuda_benchmark(3)

lat = run_cuda_benchmark(num_iters)

# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()

return lat

def main(args):
rows = []
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
num_iters=args.iters,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, lat * 1e6]) # convert to microseconds

print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))

if __name__ == "__main__":
parser = FlexibleArgumentParser()

parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 128)

parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)

parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)

parser.add_argument("--iters", type=int, default=200)

parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)

args = parser.parse_args()

main(args)
@@ -9,7 +9,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (

@@ -63,7 +63,7 @@ def benchmark_shape(m: int,

# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(A_vllm,
return w8a8_block_fp8_matmul(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -17,6 +17,8 @@
#include <algorithm>
#include <cassert>
#include <cfloat> // FLT_MIN
#include <map>
#include <vector>

#ifdef USE_ROCM
#include <hip/hip_bf16.h>

@@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,

namespace vllm {

// Used to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;

__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};

template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
@@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
if (slot_idx < 0) {
// Padding token that should be ignored.
return;
}

const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int h_block_count = head_size / x; // head_size//x

const int h_block_idx = threadIdx.x;
if (h_block_idx >= num_heads * h_block_count) {
return;
}
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;

const int head_idx = h_block_idx / h_block_count;
const int h_block = h_block_idx % h_block_count;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int x_idx = head_offset / x;
const int x_offset = head_offset % x;

const scalar_t* __restrict__ key_src =
key + token_idx * key_stride + head_idx * head_size + h_block * x;
const int64_t src_value_start =
token_idx * value_stride + head_idx * head_size + h_block * x;

cache_t* __restrict__ key_dst =
key_cache + block_idx * num_heads * h_block_count * block_size * x +
head_idx * h_block_count * block_size * x + h_block * block_size * x +
block_offset * x;
const int64_t tgt_value_start =
block_idx * num_heads * h_block_count * x * block_size +
head_idx * h_block_count * x * block_size + h_block * x * block_size +
block_offset;

constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};

vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);

const scalar_t* __restrict__ value_src = value + src_value_start;
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
#pragma unroll
for (int i = 0; i < x; i++) {
v_op(value_dst[i * block_size], value_src[i]);
const int64_t tgt_key_idx =
block_idx * num_heads * (head_size / x) * block_size * x +
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
block_offset * x + x_offset;
const int64_t tgt_value_idx =
block_idx * num_heads * head_size * block_size +
head_idx * head_size * block_size + head_offset * block_size +
block_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_idx] = tgt_key;
value_cache[tgt_value_idx] = tgt_value;
} else {
key_cache[tgt_key_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
}
}
}

// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;

__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};

template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
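Both versions of the kernel above target the same paged layouts, key_cache [num_blocks, num_heads, head_size/x, block_size, x] and value_cache [num_blocks, num_heads, head_size, block_size]; the change is in how threads are mapped and vectorized, not in where elements land. The following Python sketch mirrors the tgt_key_idx / tgt_value_idx arithmetic from the hunk, purely as an illustration:

```python
def kv_cache_offsets(slot_idx, head_idx, head_offset,
                     num_heads, head_size, block_size, x):
    """Flat offsets of one (token, head, element) in the paged KV cache.

    key_cache layout:   [num_blocks, num_heads, head_size // x, block_size, x]
    value_cache layout: [num_blocks, num_heads, head_size, block_size]
    """
    block_idx, block_offset = divmod(slot_idx, block_size)  # which page, and row within it
    x_idx, x_offset = divmod(head_offset, x)                # which x-sized chunk of the head dim

    key_idx = (block_idx * num_heads * (head_size // x) * block_size * x
               + head_idx * (head_size // x) * block_size * x
               + x_idx * block_size * x
               + block_offset * x
               + x_offset)
    value_idx = (block_idx * num_heads * head_size * block_size
                 + head_idx * head_size * block_size
                 + head_offset * block_size
                 + block_offset)
    return key_idx, value_idx

# Example: slot 37, head 2, element 5, with 8 heads, head_size 128, block_size 16, x = 8
print(kv_cache_offsets(37, 2, 5, num_heads=8, head_size=128, block_size=16, x=8))
```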
@@ -428,80 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
const int64_t dst_idx_start =
block_idx * block_stride + block_offset * entry_stride;

// For the NoPE part, each tile of 128 elements is handled by half of one warp
// (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
// So in total, we use 3 warps (96 threads) per block.
// Create 4 tile scales in shared memory
__shared__ float smem[20];
float* shard_abs_max = smem;
float* tile_scales = smem + 16;

// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.

// Cast kv_cache to 16_bit for RoPE values
scalar_t* kv_cache_16bit =
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);

// The last warp handles the RoPE part
if (threadIdx.x >= 64) {
// Each thread handles two elements of RoPE
const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
// Vectorized load of two 16-bit values, performed as one 32-bit load
const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
// The last 64 threads handle the RoPE part
if (threadIdx.x >= kv_lora_rank) {
const int8_t pe_idx = threadIdx.x - kv_lora_rank;
const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
// RoPE values start after the packed 8-bit NoPE values and the
// 32-bit scales
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
// Vectorized store of two 16-bit values, performed as one 32-bit store
*reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
kv_cache_16bit[dst_idx] = k_pe[src_idx];
return;
}

// The first two warps handle the NoPE part
const int8_t warp_idx = threadIdx.x >> 5;
const int8_t lane_idx = threadIdx.x & 31;
const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
// Determine the scale for each chunk of NoPE
const int16_t tile_idx = threadIdx.x >> 7;
const int16_t warp_idx = (threadIdx.x & 127) >> 5;
const int16_t lane_idx = threadIdx.x & 31;

// Each thread handles 8 elements of NoPE
// Load the NoPE elements for this thread into registers
const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
// Vectorized load of eight 16-bit values, performed as an int4 load
const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
// Load the NoPE element for this thread into registers
const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
const scalar_t src_val = kv_c[src_idx];

// Max absolute value of this thread's elements
float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
fmaxf(fabsf(vals[6]), fabsf(vals[7]))));

// Warp-level reduction to find the max absolute value in each half-warp
// Warp-level reduction to find the max absolute value in the warp
float max_abs = fabsf(src_val);
#pragma unroll
for (int offset = 8; offset > 0; offset /= 2) {
max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
for (int offset = 16; offset > 0; offset /= 2) {
#ifdef USE_ROCM
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
#else
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
#endif
}

// Compute the scale for the tile
float tile_scale = max_abs / 448.f;
// The first lane of each warp in each tile writes the max_abs of this part
// of the tile to shared memory
if (lane_idx == 0) {
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
}
__syncthreads();

// The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) {
// The first lane of the first warp in each tile computes the scale for the
// tile and writes it to shared memory and to kv_cache
if (warp_idx == 0 && lane_idx == 0) {
float4 shard_abs_max_vec =
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
448.f;

// Avoid division by zero in `scaled_convert`
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
kv_cache_32bit[dst_idx] = tile_scale;
kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
}

// Now all threads in the block scale and write their elements
// NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
__syncthreads();

uint8_t result[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
vals[i], tile_scale);
}

// Store as aligned 64-bit writes
*reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
*reinterpret_cast<const uint64_t*>(result);
// Now all threads in the block scale and write their element
const float scale_val = tile_scales[tile_idx];
const int64_t dst_idx = dst_idx_start + threadIdx.x;
kv_cache[dst_idx] =
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
src_val, scale_val);
}

template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
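The quantization scheme itself is unchanged between the two variants of this kernel: the 512 NoPE values of a token are split into four 128-element tiles, each tile gets one float32 scale (max absolute value / 448, clamped away from zero), the values are stored as FP8 bytes, and the 64 RoPE values are written in 16-bit after the packed bytes and the scales. Only the thread and warp assignment changes. A rough Python sketch of the per-token packing, as an illustration under those assumptions:

```python
import numpy as np

FP8_MAX = 448.0   # max magnitude used for the E4M3 scale, as in the kernel
TILE = 128        # NoPE tile size; 512 NoPE values -> 4 tiles per token

def pack_ds_mla_entry(kv_c, k_pe):
    """Illustrative packing of one token: kv_c holds 512 NoPE values, k_pe holds 64 RoPE values."""
    scales, quantized = [], []
    for tile in kv_c.reshape(-1, TILE):
        scale = max(np.abs(tile).max() / FP8_MAX, np.finfo(np.float32).tiny)  # avoid div-by-zero
        scales.append(np.float32(scale))
        quantized.append(tile / scale)  # the kernel stores each scaled value as one FP8 byte
    # Entry order: packed NoPE bytes, then the 4 float32 tile scales, then the 16-bit RoPE values
    return np.concatenate(quantized), np.array(scales, dtype=np.float32), k_pe.astype(np.float16)

nope = np.random.randn(512).astype(np.float32)
rope = np.random.randn(64).astype(np.float32)
q, s, pe = pack_ds_mla_entry(nope, rope)
print(q.shape, s.shape, pe.shape)  # (512,) (4,) (64,)
```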
@@ -606,10 +606,9 @@ void reshape_and_cache(

int key_stride = key.stride(0);
int value_stride = value.stride(0);
int head_div_x = head_size / x;

dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_div_x, 512));
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

@@ -742,12 +741,13 @@ void concat_and_cache_mla(

if (kv_cache_dtype == "fp8_ds_mla") {
dim3 grid(num_tokens);
// For the NoPE part, each tile of 128 elements is handled by half of one
// warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
// Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
// The RoPE part (last 64 elements) is handled by another 1 warp (32
// threads). So in total, we use 3 warps (96 threads) per block.
dim3 block(96);
// For the NoPE part, each tile of 128 elements is handled by 4 warps
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
// The first thread of the first warp in each tile writes the scale
// value for the tile. The RoPE part (last 64 elements) is handled
// by another 2 warps (64 threads).
// So in total, we use 18 warps (576 threads) per block.
dim3 block(576);
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_DS_MLA);
} else {
@@ -254,7 +254,7 @@ void cutlass_moe_mm(
bool per_act_token, bool per_out_ch) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
if (version_num >= 100 && version_num < 110) {
if (version_num >= 100) {
cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);

@@ -262,7 +262,7 @@ void cutlass_moe_mm(
}
#endif
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
if (version_num >= 90 && version_num < 100) {
if (version_num >= 90) {
cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides, per_act_token, per_out_ch);
@@ -14,8 +14,6 @@
* limitations under the License.
*/

#include "core/registration.h"

#include <torch/all.h>
#include <cutlass/arch/arch.h>

@@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
"12.8 or above.");
#endif
}

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
}

@@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
{stride_tag});
// conditionally compiled so impl registration is in source file
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);

// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias
@@ -13,13 +13,8 @@ ARG PYTHON_VERSION=3.12
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

# Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

@@ -80,20 +75,15 @@ ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install system dependencies and uv, then create Python virtual environment
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl sudo

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -101,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Activate virtual environment and add uv to PATH
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694

@@ -132,7 +122,7 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# cuda arch list used by torch

@@ -162,7 +152,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy

RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

COPY . .

@@ -259,7 +249,7 @@ COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE ####################

@@ -286,32 +276,12 @@ ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -319,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694

@@ -555,5 +525,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]

FROM vllm-openai-base AS vllm-openai

ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
@@ -177,4 +177,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl

ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

@@ -314,4 +314,4 @@ WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

@@ -309,4 +309,4 @@ USER 2000
WORKDIR /home/vllm

# Set the default entrypoint
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

@@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["vllm", "serve"]
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Binary file not shown (before: 119 KiB, after: 127 KiB).
@@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
Start vLLM:

```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images

@@ -687,7 +688,8 @@ vllm bench serve \
Start vLLM:

```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
@@ -258,21 +258,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

target_width, target_height = \
self.info.get_image_size_with_most_features()

image_overrides = mm_options.get("image") if mm_options else None

return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```

@@ -442,20 +438,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict:
target_width, target_height = \
self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)

image_overrides = mm_options.get("image") if mm_options else None

return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
num_images=num_images)
}
```
@@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example

```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
vllm serve meta-llama/Meta-Llama-3-70B
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3-70B
```

vllm bench command:

@@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
1. Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.2
python -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-Instruct-v0.2
```

1. Call it with AutoGen:

@@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
For example:

```console
vllm serve <model> --host 0.0.0.0 --port 8000
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
```

3. Start the Open WebUI Docker container:

@@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &

@@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log

@@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
ports: 8081 # Expose to internet traffic.

envs:
PYTHONUNBUFFERED: 1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.

@@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
run: |
conda activate vllm
echo 'Starting vllm api server...'
vllm serve $MODEL_NAME \
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log

@@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
python -m vllm.entrypoints.openai.api_server --model <model>
```

!!! warning

`python -m vllm.entrypoints.openai.api_server` is deprecated
and may become unsupported in a future release.

That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
@ -8,9 +8,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
|
||||
|
||||
!!! tip
|
||||
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict domain that vLLM can access to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
|
||||
|
||||
Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP redirects from being followed to bypass domain restrictions.
|
||||
|
||||
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
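
Putting the two recommendations together, a hardened launch might look like the following sketch (model name and domain list are illustrative):

```bash
# Restrict remote media fetches to an explicit allow-list and refuse redirects.
VLLM_MEDIA_URL_ALLOW_REDIRECTS=0 vllm serve <multi-modal-model> \
    --allowed-media-domains upload.wikimedia.org github.com \
    --port 8000
```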

## Offline Inference

@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1`, you enable development endpoints; these endpoints should not be exposed to users.

```bash
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \
--port 8000
```
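
With a server like the one above running, the development endpoints can be exercised roughly as follows; the paths are those described in the sleep-mode documentation and should never be reachable from untrusted networks:

```bash
# Put the model to sleep (level 1 offloads weights to CPU RAM and drops the KV cache),
# check the state, then wake it back up.
curl -X POST 'http://localhost:8000/sleep?level=1'
curl 'http://localhost:8000/is_sleeping'
curl -X POST 'http://localhost:8000/wake_up'
```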

@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin
To do the same in online mode, launch the server:

```bash
vllm serve facebook/opt-6.7b \
python -m vllm.entrypoints.openai.api_server \
--host 0.0.0.0 \
--port 8000 \
--model facebook/opt-6.7b \
--seed 42 \
-tp 1 \
--gpu_memory_utilization 0.8 \

@ -67,7 +67,8 @@ docker run -it \
The XPU platform supports **tensor parallel** inference/serving, and also supports **pipeline parallel** as a beta feature for online serving. **Pipeline parallel** is supported on a single node with mp as the backend. A reference invocation looks like the following:

```bash
vllm serve facebook/opt-13b \
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
--dtype=bfloat16 \
--max_model_len=1024 \
--distributed-executor-backend=mp \

@ -17,12 +17,12 @@ These models are what we list in [supported-text-models][supported-text-models]

### Transformers

vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".

Currently, the Transformers backend works for the following:

- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only, mixture-of-experts
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention

_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
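
The backend can also be selected explicitly instead of relying on automatic fallback; a minimal sketch using the `--model-impl` selector (model name illustrative):

```bash
# Force the Transformers backend rather than a dedicated vLLM implementation.
vllm serve meta-llama/Llama-3.2-1B-Instruct --model-impl transformers
```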

@ -31,7 +31,6 @@ If the Transformers model implementation follows all the steps in [writing a cus

- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
    - Data parallel
    - Pipeline parallel
    - Tensor parallel

@ -677,7 +676,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |

@ -66,9 +66,6 @@ Restrict domains that vLLM can access for media URLs by setting
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)

Also, consider setting `VLLM_MEDIA_URL_ALLOW_REDIRECTS=0` to prevent HTTP
redirects from being followed to bypass domain restrictions.

## Security and Firewalls: Protecting Exposed vLLM Systems

While vLLM is designed to allow unsafe network services to be isolated to

@ -576,7 +576,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:

# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1-mini"
    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,

@ -309,7 +309,7 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:


def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "internlm/Intern-S1-mini"
    model_name = "internlm/Intern-S1"

    engine_args = EngineArgs(
        model=model_name,

@ -21,4 +21,4 @@ while IFS='=' read -r key value; do
done < <(env | grep "^${PREFIX}")

# Pass the collected arguments to the main entrypoint
exec vllm serve "${ARGS[@]}"
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
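
For context, the loop referenced in the hunk header turns prefixed environment variables into CLI arguments before the final `exec`; a self-contained sketch of that pattern, using a hypothetical `VLLM_ARG_` prefix (the real prefix and entrypoint are defined by the actual script):

```bash
#!/bin/bash
# Hypothetical example: VLLM_ARG_MAX_MODEL_LEN=4096 becomes "--max-model-len 4096".
PREFIX="VLLM_ARG_"
ARGS=()
while IFS='=' read -r key value; do
    flag="--$(echo "${key#"$PREFIX"}" | tr 'A-Z_' 'a-z-')"
    ARGS+=("$flag" "$value")
done < <(env | grep "^${PREFIX}")

# Pass the collected arguments (plus a model name, in practice) to the entrypoint.
exec vllm serve "${ARGS[@]}"
```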

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.5.1 # Required for disaggregated prefill
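
Assuming this fragment is a pip-style requirements file for the KV-transfer connectors, the pinned dependencies can also be installed directly, for example:

```bash
# Install the connector dependencies manually (use whichever nixl pin the final file keeps).
uv pip install lmcache "nixl>=0.6.0"
```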

@ -11,8 +11,8 @@ import pytest
import torch

from tests.quantization.utils import is_quant_method_supported
from tests.v1.attention.utils import _Backend
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                         PassConfig)

@ -8,11 +8,11 @@ import torch._dynamo

from tests.compile.backend import LazyInitPass, TestBackend
from tests.models.utils import check_outputs_equal
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata)
from vllm import LLM, SamplingParams
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.registry import _Backend
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion import QUANT_OPS
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass

@ -756,7 +756,7 @@ class VllmRunner:

    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
        prompts: Union[list[str], list[torch.Tensor], list[int]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,

@ -86,16 +86,3 @@ def test_max_model_len():
    # It can be less if generation finishes due to other reasons (e.g., EOS)
    # before reaching the absolute model length limit.
    assert num_total_tokens <= max_model_len


def test_log_stats():
    llm = LLM(
        model=MODEL_NAME,
        disable_log_stats=False,
        gpu_memory_utilization=0.10,
        enforce_eager=True, # reduce test time
    )
    outputs = llm.generate(PROMPTS, sampling_params=None)

    # disable_log_stats is False, every output should have metrics
    assert all(output.metrics is not None for output in outputs)
|
||||
|
@ -122,9 +122,6 @@ def mock_serving_setup():
|
||||
models,
|
||||
request_logger=None)
|
||||
|
||||
serving_completion._process_inputs = AsyncMock(return_value=(MagicMock(
|
||||
name="engine_request"), {}))
|
||||
|
||||
return mock_engine, serving_completion
|
||||
|
||||
|
||||
|
@ -698,22 +698,6 @@ async def test_function_calling_required(client: OpenAI, model_name: str):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_system_message_with_tools(client: OpenAI, model_name: str):
|
||||
from vllm.entrypoints.harmony_utils import get_system_message
|
||||
|
||||
# Test with custom tools enabled - commentary channel should be available
|
||||
sys_msg = get_system_message(with_custom_tools=True)
|
||||
valid_channels = sys_msg.content[0].channel_config.valid_channels
|
||||
assert "commentary" in valid_channels
|
||||
|
||||
# Test with custom tools disabled - commentary channel should be removed
|
||||
sys_msg = get_system_message(with_custom_tools=False)
|
||||
valid_channels = sys_msg.content[0].channel_config.valid_channels
|
||||
assert "commentary" not in valid_channels
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
|
@ -7,7 +7,7 @@ import asyncio
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
@ -230,7 +230,6 @@ class MockHFConfig:
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
task = "generate"
|
||||
runner_type = "generate"
|
||||
tokenizer = MODEL_NAME
|
||||
trust_remote_code = False
|
||||
tokenizer_mode = "auto"
|
||||
@ -245,33 +244,11 @@ class MockModelConfig:
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
skip_tokenizer_init = False
|
||||
|
||||
def get_diff_sampling_param(self):
|
||||
return self.diff_sampling_param or {}
|
||||
|
||||
|
||||
def _build_serving_chat(engine: AsyncLLM,
|
||||
model_config: MockModelConfig) -> OpenAIServingChat:
|
||||
models = OpenAIServingModels(engine_client=engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=model_config)
|
||||
serving_chat = OpenAIServingChat(engine,
|
||||
model_config,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
async def _fake_process_inputs(request_id, engine_prompt, sampling_params,
|
||||
*, lora_request, trace_headers, priority):
|
||||
return dict(engine_prompt), {}
|
||||
|
||||
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
|
||||
return serving_chat
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockEngine:
|
||||
|
||||
@ -305,7 +282,16 @@ async def test_serving_chat_returns_correct_model_name():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=MockModelConfig())
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
MockModelConfig(),
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
messages = [{"role": "user", "content": "what is 1+1?"}]
|
||||
|
||||
async def return_model_name(*args):
|
||||
@ -332,7 +318,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
serving_chat = _build_serving_chat(mock_engine, MockModelConfig())
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=MockModelConfig())
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
MockModelConfig(),
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
req = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
@ -366,7 +361,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.errored = False
|
||||
|
||||
# Initialize the serving chat
|
||||
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
# Test Case 1: No max_tokens specified in request
|
||||
req = ChatCompletionRequest(
|
||||
@ -411,7 +415,16 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.errored = False
|
||||
|
||||
# Initialize the serving chat
|
||||
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
# Test case 1: No max_tokens specified, defaults to context_window
|
||||
req = ChatCompletionRequest(
|
||||
@ -458,7 +471,16 @@ async def test_serving_chat_could_load_correct_generation_config():
|
||||
mock_engine.errored = False
|
||||
|
||||
# Initialize the serving chat
|
||||
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
req = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
@ -503,7 +525,17 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
serving_chat = _build_serving_chat(mock_engine, mock_model_config)
|
||||
# Initialize the serving chat
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None)
|
||||
|
||||
# Test cache_salt
|
||||
req = ChatCompletionRequest(
|
||||
@ -517,12 +549,10 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
||||
# By default, cache_salt in the engine prompt is not set
|
||||
with suppress(Exception):
|
||||
await serving_chat.create_chat_completion(req)
|
||||
engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
|
||||
assert "cache_salt" not in engine_prompt
|
||||
assert "cache_salt" not in mock_engine.generate.call_args.args[0]
|
||||
|
||||
# Test with certain cache_salt
|
||||
req.cache_salt = "test_salt"
|
||||
with suppress(Exception):
|
||||
await serving_chat.create_chat_completion(req)
|
||||
engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
|
||||
assert engine_prompt.get("cache_salt") == "test_salt"
|
||||
assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt"
|
||||
|
@ -1,129 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import AsyncExitStack
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.entrypoints.context import ConversationContext
|
||||
from vllm.entrypoints.openai.protocol import ResponsesRequest
|
||||
from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
|
||||
|
||||
class MockConversationContext(ConversationContext):
|
||||
"""Mock conversation context for testing"""
|
||||
|
||||
def __init__(self):
|
||||
self.init_tool_sessions_called = False
|
||||
self.init_tool_sessions_args = None
|
||||
self.init_tool_sessions_kwargs = None
|
||||
|
||||
def append_output(self, output) -> None:
|
||||
pass
|
||||
|
||||
async def call_tool(self):
|
||||
return []
|
||||
|
||||
def need_builtin_tool_call(self) -> bool:
|
||||
return False
|
||||
|
||||
def render_for_completion(self):
|
||||
return []
|
||||
|
||||
async def init_tool_sessions(self, tool_server, exit_stack, request_id,
|
||||
mcp_tools):
|
||||
self.init_tool_sessions_called = True
|
||||
self.init_tool_sessions_args = (tool_server, exit_stack, request_id,
|
||||
mcp_tools)
|
||||
|
||||
async def cleanup_session(self) -> None:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_serving_responses():
|
||||
"""Create a mock OpenAIServingResponses instance"""
|
||||
serving_responses = MagicMock(spec=OpenAIServingResponses)
|
||||
serving_responses.tool_server = MagicMock(spec=ToolServer)
|
||||
return serving_responses
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_context():
|
||||
"""Create a mock conversation context"""
|
||||
return MockConversationContext()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_exit_stack():
|
||||
"""Create a mock async exit stack"""
|
||||
return MagicMock(spec=AsyncExitStack)
|
||||
|
||||
|
||||
class TestInitializeToolSessions:
|
||||
"""Test class for _initialize_tool_sessions method"""
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def serving_responses_instance(self):
|
||||
"""Create a real OpenAIServingResponses instance for testing"""
|
||||
# Create minimal mocks for required dependencies
|
||||
engine_client = MagicMock()
|
||||
engine_client.get_model_config = AsyncMock()
|
||||
|
||||
model_config = MagicMock()
|
||||
model_config.hf_config.model_type = "test"
|
||||
model_config.get_diff_sampling_param.return_value = {}
|
||||
|
||||
models = MagicMock()
|
||||
|
||||
tool_server = MagicMock(spec=ToolServer)
|
||||
|
||||
# Create the actual instance
|
||||
instance = OpenAIServingResponses(
|
||||
engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
models=models,
|
||||
request_logger=None,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
tool_server=tool_server,
|
||||
)
|
||||
|
||||
return instance
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_initialize_tool_sessions(self, serving_responses_instance,
|
||||
mock_context, mock_exit_stack):
|
||||
"""Test that method works correctly with only MCP tools"""
|
||||
|
||||
request = ResponsesRequest(input="test input", tools=[])
|
||||
|
||||
# Call the method
|
||||
await serving_responses_instance._initialize_tool_sessions(
|
||||
request, mock_context, mock_exit_stack)
|
||||
assert mock_context.init_tool_sessions_called is False
|
||||
|
||||
# Create only MCP tools
|
||||
tools = [
|
||||
{
|
||||
"type": "web_search_preview"
|
||||
},
|
||||
{
|
||||
"type": "code_interpreter",
|
||||
"container": {
|
||||
"type": "auto"
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
request = ResponsesRequest(input="test input", tools=tools)
|
||||
|
||||
# Call the method
|
||||
await serving_responses_instance._initialize_tool_sessions(
|
||||
request, mock_context, mock_exit_stack)
|
||||
|
||||
# Verify that init_tool_sessions was called
|
||||
assert mock_context.init_tool_sessions_called
|
@ -10,9 +10,8 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.layer import MultiHeadAttention
|
||||
from vllm.attention.selector import _cached_get_attn_backend
|
||||
from vllm.attention.selector import _Backend, _cached_get_attn_backend
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
|
@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
|
||||
native_w8a8_block_matmul)
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
|
||||
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_gemm
|
||||
from vllm.utils.deep_gemm import (fp8_gemm_nt,
|
||||
@ -91,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
|
||||
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
|
||||
out_dtype)
|
||||
out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
|
||||
out_dtype)
|
||||
out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
|
||||
|
||||
rel_diff = (torch.mean(
|
||||
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
|
||||
|
@ -20,11 +20,9 @@ from vllm.platforms import current_platform
|
||||
(8, 513, 64), # Non-divisible (native only)
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [42])
|
||||
@pytest.mark.parametrize("use_ue8m0", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
group_size: int, seed: int,
|
||||
use_ue8m0: bool) -> None:
|
||||
group_size: int, seed: int) -> None:
|
||||
"""Test QuantFP8 group quantization with various configurations.
|
||||
|
||||
Tests both CUDA and native implementations, column-major scales,
|
||||
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
group_shape = GroupShape(1, group_size)
|
||||
quant_op = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=False,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=False)
|
||||
|
||||
# 1. Test native implementation (always available)
|
||||
x_quant_native, scales_native = quant_op.forward_native(x.clone())
|
||||
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
# 2. Test column-major scales configuration
|
||||
quant_op_col = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=True,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=True)
|
||||
_, scales_col = quant_op_col.forward_native(x.clone())
|
||||
assert scales_col.shape == (batch_size, expected_num_groups)
|
||||
assert scales_col.stride(0) == 1
|
||||
assert scales_col.stride(1) == batch_size
|
||||
|
||||
# Test column-major scales consistency
|
||||
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
|
||||
assert scales_col.shape == (expected_num_groups, batch_size)
|
||||
|
||||
# 3. Test CUDA implementation (only for divisible dimensions)
|
||||
if is_divisible:
|
||||
@ -77,23 +68,21 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", [42])
|
||||
@pytest.mark.parametrize("use_ue8m0", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
|
||||
def test_quantfp8_group_multidimensional(seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
group_size = 64
|
||||
|
||||
# Test with 3D input
|
||||
batch1, batch2, hidden_dim = 4, 8, 1024
|
||||
batch1, batch2, hidden_dim = 4, 8, 512
|
||||
x_3d = torch.randn(
|
||||
(batch1, batch2, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
|
||||
|
||||
group_shape = GroupShape(1, group_size)
|
||||
quant_op = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=False,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=False)
|
||||
|
||||
x_quant, scales = quant_op.forward_native(x_3d.clone())
|
||||
assert x_quant.shape == x_3d.shape
|
||||
@ -102,10 +91,9 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
|
||||
# Test column_major_scales with multi-dim
|
||||
quant_op_col = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=True,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=True)
|
||||
_, scales_col = quant_op_col.forward_native(x_3d.clone())
|
||||
assert scales_col.shape == (batch1, batch2, hidden_dim // group_size)
|
||||
assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
|
||||
|
||||
# Test with 4D input
|
||||
batch1, batch2, batch3, hidden_dim = 2, 3, 4, 256
|
||||
|
@ -15,10 +15,10 @@ from torch._prims_common import TensorLikeType
|
||||
|
||||
from tests.kernels.quant_utils import native_w8a8_block_matmul
|
||||
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe.utils import (
|
||||
moe_kernel_quantize_input)
|
||||
from vllm.platforms.interface import _Backend
|
||||
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
|
||||
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
|
||||
|
||||
|
@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
|
||||
from vllm.model_executor.layers.layernorm import (RMSNorm,
|
||||
dispatch_rocm_rmsnorm_func,
|
||||
fused_add_rms_norm, rms_norm)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
|
||||
@ -109,6 +111,34 @@ def test_enabled_ops_invalid(env: str):
|
||||
RMSNorm(1024).enabled()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
|
||||
reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
|
||||
@pytest.mark.parametrize("use_cutlass", [True, False])
|
||||
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
|
||||
@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
|
||||
def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
|
||||
use_rocm_aiter_gemm_w8a8_blockscale: str,
|
||||
monkeypatch):
|
||||
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
|
||||
use_rocm_aiter_gemm_w8a8_blockscale)
|
||||
|
||||
use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
|
||||
int(use_rocm_aiter_gemm_w8a8_blockscale)))
|
||||
block_scale_func = dispatch_w8a8_blockscale_func(
|
||||
use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
|
||||
if use_cutlass:
|
||||
assert block_scale_func == cutlass_scaled_mm
|
||||
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
|
||||
use_rocm_aiter_gemm_w8a8_blockscale):
|
||||
assert block_scale_func == (
|
||||
torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
|
||||
else:
|
||||
assert block_scale_func == w8a8_block_fp8_matmul
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
|
||||
def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
|
||||
|
@ -240,12 +240,12 @@ def test_distributed_correctness(
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model, tensor_parallel_size=1,
|
||||
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model, tensor_parallel_size=2,
|
||||
max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
|
@ -12,8 +12,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
@ -114,26 +112,12 @@ def _test_processing_correctness(
|
||||
|
||||
processing_info = factories.info(ctx)
|
||||
supported_mm_limits = processing_info.get_supported_mm_limits()
|
||||
# Keep integer limits for local data generation
|
||||
limit_mm_per_prompt_ints = {
|
||||
limit_mm_per_prompt = {
|
||||
modality: 3 if limit is None else limit
|
||||
for modality, limit in supported_mm_limits.items()
|
||||
}
|
||||
|
||||
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
|
||||
if modality == "video":
|
||||
return VideoDummyOptions(count=count)
|
||||
if modality == "image":
|
||||
return ImageDummyOptions(count=count)
|
||||
if modality == "audio":
|
||||
return AudioDummyOptions(count=count)
|
||||
return BaseDummyOptions(count=count)
|
||||
|
||||
# Assign normalized DummyOptions to the model config
|
||||
model_config.get_multimodal_config().limit_per_prompt = {
|
||||
modality: _to_dummy_options(modality, count)
|
||||
for modality, count in limit_mm_per_prompt_ints.items()
|
||||
}
|
||||
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
|
||||
|
||||
baseline_processor = factories.build_processor(ctx, cache=None)
|
||||
cached_processor = factories.build_processor(ctx, cache=cache)
|
||||
@ -166,7 +150,7 @@ def _test_processing_correctness(
|
||||
k:
|
||||
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(limit + 1))]
|
||||
for k, limit in limit_mm_per_prompt_ints.items()
|
||||
for k, limit in limit_mm_per_prompt.items()
|
||||
}
|
||||
|
||||
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
|
||||
|
@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
model_config_kwargs = {
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
mm_counts = {"image": 1}
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
model_config_kwargs=model_config_kwargs,
|
||||
limit_mm_per_prompt=mm_counts,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
mm_config = ctx.get_mm_config()
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
decoder_dummy_data = profiler.get_decoder_dummy_data(
|
||||
max_model_len,
|
||||
mm_counts=mm_counts,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
|
||||
max_model_len,
|
||||
mm_counts=mm_counts,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
|
||||
hf_config = ctx.get_hf_config(Llama4Config)
|
||||
@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
|
||||
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
|
||||
max_model_len,
|
||||
mm_counts=mm_counts,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
|
||||
assert total_tokens == profiled_tokens["image"]
|
||||
|
@ -15,8 +15,6 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
@ -238,20 +236,7 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
modality: 3 if limit is None else limit
|
||||
for modality, limit in supported_mm_limits.items()
|
||||
}
|
||||
|
||||
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
|
||||
if modality == "video":
|
||||
return VideoDummyOptions(count=count)
|
||||
if modality == "image":
|
||||
return ImageDummyOptions(count=count)
|
||||
if modality == "audio":
|
||||
return AudioDummyOptions(count=count)
|
||||
return BaseDummyOptions(count=count)
|
||||
|
||||
model_config.get_multimodal_config().limit_per_prompt = {
|
||||
modality: _to_dummy_options(modality, count)
|
||||
for modality, count in limit_mm_per_prompt.items()
|
||||
}
|
||||
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
|
||||
processor = factories.build_processor(ctx, cache=None)
|
||||
|
||||
with initialize_dummy_model(model_cls, model_config) as model:
|
||||
|
@ -37,5 +37,4 @@ def test_multimodal_processor(model_id):
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert (str_processed_inputs["prompt_token_ids"]
|
||||
== ids_processed_inputs["prompt_token_ids"])
|
||||
assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]
|
||||
|
@ -661,10 +661,6 @@ _TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersForSequenceClassification": _HfExamplesInfo("papluca/xlm-roberta-base-language-detection", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
"TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
"TransformersMoEForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersMoEForMultimodalLM": _HfExamplesInfo("Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersMoEEmbeddingModel": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersMoEForSequenceClassification": _HfExamplesInfo("Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
}
|
||||
|
||||
_EXAMPLE_MODELS = {
|
||||
|
@ -66,7 +66,6 @@ def check_implementation(
|
||||
[
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
|
||||
("hmellor/Ilama-3.2-1B", "auto"), # CUSTOM CODE
|
||||
("allenai/OLMoE-1B-7B-0924", "transformers"), # MoE
|
||||
]) # trust_remote_code=True by default
|
||||
def test_models(
|
||||
hf_runner: type[HfRunner],
|
||||
@ -75,14 +74,6 @@ def test_models(
|
||||
model: str,
|
||||
model_impl: str,
|
||||
) -> None:
|
||||
import transformers
|
||||
from packaging.version import Version
|
||||
installed = Version(transformers.__version__)
|
||||
required = Version("4.57.0.dev0")
|
||||
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
|
||||
pytest.skip("MoE models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}")
|
||||
|
||||
check_implementation(hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
|
@ -430,26 +430,17 @@ def dummy_hf_overrides(
|
||||
|
||||
update_dict = {
|
||||
"num_layers": num_layers,
|
||||
"num_experts": num_experts,
|
||||
"num_experts_per_tok": 2,
|
||||
"num_local_experts": num_experts,
|
||||
# Otherwise there will not be any expert layers
|
||||
"first_k_dense_replace": 0,
|
||||
# To avoid OOM on DeepSeek-V3
|
||||
"n_routed_experts": num_experts,
|
||||
# For Gemma-3n
|
||||
"num_kv_shared_layers": 1,
|
||||
}
|
||||
|
||||
class DummyConfig:
|
||||
hf_text_config = text_config
|
||||
|
||||
# Only set MoE related config when the model has MoE layers.
|
||||
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
|
||||
if ModelConfig.get_num_experts(DummyConfig) > 0:
|
||||
update_dict.update({
|
||||
"num_experts": num_experts,
|
||||
"num_experts_per_tok": 2,
|
||||
"num_local_experts": num_experts,
|
||||
# Otherwise there will not be any expert layers
|
||||
"first_k_dense_replace": 0,
|
||||
# To avoid OOM on DeepSeek-V3
|
||||
"n_routed_experts": num_experts,
|
||||
})
|
||||
|
||||
# Update num_hidden_layers for non-Longcat architectures
|
||||
if model_arch != "LongcatFlashForCausalLM" \
|
||||
and model_arch != "LongCatFlashMTPModel":
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
@ -21,10 +20,9 @@ os.environ["FLASHINFER_NVCC_THREADS"] = "16"
|
||||
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
|
||||
|
||||
|
||||
def can_initialize(model: str, extra_args: Optional[list[str]] = None):
|
||||
def can_initialize(model: str, extra_args: list[str]):
|
||||
|
||||
# Server arguments
|
||||
extra_args = extra_args if extra_args is not None else []
|
||||
server_args = [
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
@ -67,7 +65,7 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Works, but takes too long to run")
|
||||
@ -75,21 +73,21 @@ def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Works, but takes too long to run")
|
||||
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
|
||||
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
|
||||
|
||||
|
||||
## DeepSeekV3 ##
|
||||
@ -97,37 +95,21 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
|
||||
can_initialize("deepseek-ai/DeepSeek-V3.1")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
|
||||
"Expected failure: assert self.block_quant is None"))
|
||||
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("deepseek-ai/DeepSeek-V3.1")
|
||||
|
||||
|
||||
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("deepseek-ai/DeepSeek-V3.1")
|
||||
can_initialize("deepseek-ai/DeepSeek-V3.1", [])
|
||||
|
||||
|
||||
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
|
||||
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
|
||||
|
||||
|
||||
## GPT-OSS ##
|
||||
@ -135,16 +117,16 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
|
||||
can_initialize("openai/gpt-oss-20b")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
|
||||
|
||||
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
|
||||
can_initialize("openai/gpt-oss-20b")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
|
||||
|
||||
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
|
||||
can_initialize("openai/gpt-oss-20b")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
|
@ -18,9 +18,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
|
||||
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
|
||||
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
|
||||
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
W8A8BlockFp8LinearOp)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
@ -745,35 +742,3 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
|
||||
perplexity = llm.generate_prompt_perplexity([prompt])[0]
|
||||
print(perplexity)
|
||||
assert perplexity <= exp_perplexity
|
||||
|
||||
|
||||
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
||||
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
|
||||
with vllm_runner(model_path) as llm:
|
||||
|
||||
fp8_dtype = current_platform.fp8_dtype()
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert isinstance(qkv_proj.quant_method,
|
||||
CompressedTensorsLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
|
||||
assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
|
||||
W8A8BlockFp8LinearOp)
|
||||
|
||||
assert qkv_proj.weight.dtype is fp8_dtype
|
||||
assert qkv_proj.weight_scale.dtype is torch.float32
|
||||
assert len(qkv_proj.weight.shape) == 2
|
||||
assert len(qkv_proj.weight_scale.shape) == 2
|
||||
|
||||
input_quant_op = \
|
||||
qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
|
||||
assert isinstance(input_quant_op, QuantFP8)
|
||||
assert input_quant_op._forward_method == input_quant_op.forward_cuda
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||
assert output
|
||||
|
@ -786,43 +786,13 @@ def test_model_specification(parser_with_config, cli_config_file,
|
||||
parser_with_config.parse_args(['serve', '--config', cli_config_file])
|
||||
|
||||
# Test using --model option raises error
|
||||
# with pytest.raises(
|
||||
# ValueError,
|
||||
# match=
|
||||
# ("With `vllm serve`, you should provide the model as a positional "
|
||||
# "argument or in a config file instead of via the `--model` option."),
|
||||
# ):
|
||||
# parser_with_config.parse_args(['serve', '--model', 'my-model'])
|
||||
|
||||
# Test using --model option back-compatibility
|
||||
# (when back-compatibility ends, the above test should be uncommented
|
||||
# and the below test should be removed)
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'--tensor-parallel-size',
|
||||
'2',
|
||||
'--model',
|
||||
'my-model',
|
||||
'--trust-remote-code',
|
||||
'--port',
|
||||
'8001',
|
||||
])
|
||||
assert args.model is None
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.port == 8001
|
||||
|
||||
args = parser_with_config.parse_args([
|
||||
'serve',
|
||||
'--tensor-parallel-size=2',
|
||||
'--model=my-model',
|
||||
'--trust-remote-code',
|
||||
'--port=8001',
|
||||
])
|
||||
assert args.model is None
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.port == 8001
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=
|
||||
("With `vllm serve`, you should provide the model as a positional "
|
||||
"argument or in a config file instead of via the `--model` option."),
|
||||
):
|
||||
parser_with_config.parse_args(['serve', '--model', 'my-model'])
|
||||
|
||||
# Test other config values are preserved
|
||||
args = parser_with_config.parse_args([
|
||||
|
@ -8,11 +8,11 @@ import pytest
|
||||
import torch
|
||||
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
|
||||
|
||||
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
|
||||
from tests.v1.attention.utils import (BatchSpec, _Backend,
|
||||
create_common_attn_metadata,
|
||||
create_standard_kv_cache_spec,
|
||||
create_vllm_config,
|
||||
get_attention_backend)
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
|
||||
|
@ -6,12 +6,12 @@ from typing import Optional, Union
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
|
||||
from tests.v1.attention.utils import (BatchSpec, _Backend,
|
||||
create_common_attn_metadata,
|
||||
create_standard_kv_cache_spec,
|
||||
create_vllm_config,
|
||||
get_attention_backend)
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
@ -22,7 +22,6 @@ from vllm.utils import cdiv
|
||||
from vllm.v1.attention.backends.mla.flashmla_sparse import (
|
||||
FlashMLASparseBackend, FlashMLASparseDecodeAndContextMetadata,
|
||||
FlashMLASparseImpl, FlashMLASparseMetadata)
|
||||
from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks
|
||||
|
||||
SPARSE_BACKEND_BATCH_SPECS = {
|
||||
name: BATCH_SPECS[name]
|
||||
@ -425,24 +424,3 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name,
|
||||
sdpa_reference,
|
||||
rtol=0.5,
|
||||
atol=0.5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"seq_lens,max_buf,start,expected",
|
||||
[
|
||||
# Basic split: totals per chunk ≤ max_buf
|
||||
(torch.tensor([2, 3, 4, 2]), 5, 0, [(0, 2), (2, 3), (3, 4)]),
|
||||
# Non-zero start index
|
||||
(torch.tensor([2, 3, 4, 2]), 5, 1, [(1, 2), (2, 3), (3, 4)]),
|
||||
# Exact fits should split between items when adding the next would
|
||||
# overflow
|
||||
(torch.tensor([5, 5, 5]), 5, 0, [(0, 1), (1, 2), (2, 3)]),
|
||||
# All requests fit in a single chunk
|
||||
(torch.tensor([1, 1, 1]), 10, 0, [(0, 3)]),
|
||||
# Large buffer with non-zero start
|
||||
(torch.tensor([4, 4, 4]), 100, 1, [(1, 3)]),
|
||||
],
|
||||
)
|
||||
def test_split_prefill_chunks(seq_lens, max_buf, start, expected):
|
||||
out = split_prefill_chunks(seq_lens, max_buf, start)
|
||||
assert out == expected
|
||||
|
@ -8,11 +8,10 @@ from typing import Optional, Union
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
|
||||
LoadConfig, ModelConfig, ModelDType, ParallelConfig,
|
||||
SchedulerConfig, VllmConfig)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import resolve_obj_by_qualname
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||
|
@ -1,91 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
end-to-end tests for context length corner cases of vLLM v1 model runner
|
||||
versus HuggingFace's transformers.
|
||||
|
||||
This test verifies the following behavior: allow a prefill that fills the
|
||||
model's maximum context length and then request a single new token.
|
||||
|
||||
Test strategy
|
||||
- Build a textual prompt that tokenizes to exactly ``max_model_len`` tokens.
|
||||
- Run vLLM generation requesting a single new token (max_tokens=1).
|
||||
- Run HF generation on the same prompt requesting a single token too.
|
||||
- Assert both return the same number of generated tokens and the same ids.
|
||||
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from tests.models.utils import check_outputs_equal
|
||||
from tests.utils import create_new_process_for_each_test
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.inputs import TokensPrompt
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
|
||||
@pytest.mark.parametrize("max_model_len", [2048])
|
||||
@pytest.mark.parametrize("max_tokens", [1])
|
||||
def test_prefill_max_context_length(
|
||||
model: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Compare vLLM and HuggingFace when the prompt already fills the
|
||||
model's maximum context length and we request a single new token.
|
||||
|
||||
The test ensures vLLM does not raise the "Sampled token IDs exceed the
|
||||
max model length" assertion and that both vLLM and HF produce the same
|
||||
single token when given the same inputs.
|
||||
"""
|
||||
|
||||
# Construct a prompt of size max_model_len
|
||||
prompt_ids = [[43] * max_model_len]
|
||||
|
||||
# Generate max_tokens new tokens deterministically.
|
||||
sampling_params = [
|
||||
SamplingParams(max_tokens=max_tokens, temperature=0.0, ignore_eos=True)
|
||||
]
|
||||
|
||||
# --- vLLM generation ---
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tokenizer=model,
|
||||
max_num_seqs=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
vllm_token_prompts = [TokensPrompt(prompt_token_ids=prompt_ids[0])]
|
||||
vllm_results = llm.generate(vllm_token_prompts, sampling_params)
|
||||
|
||||
vllm_output_ids = vllm_results[0].outputs[0].token_ids
|
||||
|
||||
# --- HuggingFace generation ---
|
||||
with torch.no_grad():
|
||||
hf_model = AutoModelForCausalLM.from_pretrained(model)
|
||||
|
||||
# HF expects a tensor of input ids shaped (batch, seq_len).
|
||||
hf_input_tokens = torch.tensor(prompt_ids[0]).unsqueeze(0)
|
||||
|
||||
# Generate max_tokens new tokens deterministically.
|
||||
hf_generated = hf_model.generate(
|
||||
hf_input_tokens,
|
||||
do_sample=False,
|
||||
min_new_tokens=max_tokens,
|
||||
max_new_tokens=max_tokens,
|
||||
)
|
||||
|
||||
# HF returns the prompt + generated tokens. Slice off the prompt.
|
||||
hf_output_ids = hf_generated.cpu().tolist()[0][len(prompt_ids[0]):]
|
||||
|
||||
# check that vLLM outputs (token ids) match HF outputs
|
||||
# Note: for simplicity don't pass detokenized string
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=[(hf_output_ids, "")],
|
||||
outputs_1_lst=[(vllm_output_ids, "")],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
@ -123,10 +123,13 @@ def test_structured_output(
|
||||
if current_platform.is_tpu() and speculative_config:
|
||||
pytest.skip("TPU does not support speculative decoding")
|
||||
|
||||
# Don't use eager execution on TPUs because we want to test for no
|
||||
# recompilation at runtime
|
||||
enforce_eager = bool(not current_platform.is_tpu())
|
||||
# Use a single LLM instance for several scenarios to
|
||||
# speed up the test suite.
|
||||
llm = LLM(model=model_name,
|
||||
enforce_eager=True,
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
structured_outputs_config=dict(backend=backend,
|
||||
disable_any_whitespace=backend
|
||||
|
@ -57,26 +57,6 @@ def clear_kv_transfer():
|
||||
ensure_kv_transfer_shutdown()
|
||||
|
||||
|
||||
def get_default_xfer_telemetry(xferDurationS: float = 1,
|
||||
postDurationS: float = 1,
|
||||
totalBytes: int = 1,
|
||||
descCount: int = 1) -> dict:
|
||||
|
||||
class AttributeDict(dict):
|
||||
__slots__ = ()
|
||||
__getattr__ = dict.__getitem__
|
||||
__setattr__ = dict.__setitem__ # type: ignore[assignment]
|
||||
|
||||
# We can't instantiate nixlXferTelemetry because it's read only and
|
||||
# ray env does not have NIXL, so we must fake it
|
||||
return AttributeDict(
|
||||
xferDuration=xferDurationS * 1e6, # in us
|
||||
postDuration=postDurationS * 1e6, # in us
|
||||
totalBytes=totalBytes,
|
||||
descCount=descCount,
|
||||
)
|
||||
|
||||
|
||||
class FakeNixlWrapper:
|
||||
"""Mock implementation of NixlWrapper for testing.
|
||||
|
||||
@ -152,9 +132,6 @@ class FakeNixlWrapper:
|
||||
def transfer(self, handle: int) -> str:
|
||||
return "PROC"
|
||||
|
||||
def get_xfer_telemetry(self, handle: int) -> dict:
|
||||
return get_default_xfer_telemetry()
|
||||
|
||||
############################################################
|
||||
# Follow are for changing the behavior during testing.
|
||||
############################################################
|
||||
@ -192,11 +169,6 @@ nixl_agent = FakeNixlWrapper
|
||||
with open(os.path.join(pkg_root, "__init__.py"), "w") as f:
|
||||
f.write(stub)
|
||||
|
||||
# Mock nixlXferTelemetry class
|
||||
pkg_root2 = os.path.join(td, "nixl", "_bindings")
|
||||
os.makedirs(pkg_root2, exist_ok=True)
|
||||
with open(os.path.join(pkg_root2, "__init__.py"), "w") as f:
|
||||
f.write("class nixlXferTelemetry: pass")
|
||||
# touch parent package
|
||||
open(os.path.join(td, "nixl", "__init__.py"), "w").close()
|
||||
yield td
|
||||
@@ -603,7 +575,7 @@ def test_kv_connector_stats(dist_init):

    # Verify stats values are recorded
    assert not stats_after_transfer.is_empty()
    assert stats_after_transfer.num_successful_transfers == 1
    assert stats_after_transfer.data["num_successful_transfers"] == 1

    # Verify stats are reset after retrieval
    stats_after_reset = connector.get_kv_connector_stats()
@@ -627,21 +599,16 @@ def test_kv_connector_stats_aggregation():

    # Record different transfers on each worker
    # Worker 1: 2 transfers
    stats = get_default_xfer_telemetry()
    worker1_stats.record_transfer(stats)
    worker1_stats.record_transfer(stats)
    worker1_stats.record_transfer()
    worker1_stats.record_transfer()

    # Worker 2: 1 transfer
    worker2_stats.record_transfer(stats)
    worker2_stats.record_transfer()

    # Worker 3: 3 transfers
    stats = get_default_xfer_telemetry(xferDurationS=2,
                                       postDurationS=2,
                                       totalBytes=2,
                                       descCount=2)
    worker3_stats.record_transfer(stats)
    worker3_stats.record_transfer(stats)
    worker3_stats.record_transfer(stats)
    worker3_stats.record_transfer()
    worker3_stats.record_transfer()
    worker3_stats.record_transfer()

    # Create ModelRunnerOutput instances for each worker
    worker_outputs = []
@@ -669,12 +636,7 @@ def test_kv_connector_stats_aggregation():
        aggregated_output.kv_connector_output.kv_connector_stats
    assert isinstance(kv_connector_stats, NixlKVConnectorStats)
    # Number of total transfers across all workers.
    assert kv_connector_stats.num_successful_transfers == 6
    # Logging proc, call reduce() to get CLI-friendly stats.
    cli_stats = kv_connector_stats.reduce()
    assert cli_stats["Avg xfer time (ms)"] == 1500.0
    assert cli_stats["Avg post time (ms)"] == 1500.0
    assert cli_stats["Avg number of descriptors"] == 1.5
    assert kv_connector_stats.data["num_successful_transfers"] == 6


def test_multi_kv_connector_stats_aggregation():
@@ -687,7 +649,6 @@ def test_multi_kv_connector_stats_aggregation():

    from dataclasses import dataclass

    # Mock a KVConnectorStats class for testing aggregation over connectors.
    @dataclass
    class FooKVConnectorStats(KVConnectorStats):

@@ -715,7 +676,7 @@ def test_multi_kv_connector_stats_aggregation():
        if nixl_count > 0:
            nixl_stats = NixlKVConnectorStats()
            for _ in range(nixl_count):
                nixl_stats.record_transfer(get_default_xfer_telemetry())
                nixl_stats.record_transfer()
            data["NixlConnector"] = nixl_stats
        if foo_count > 0:
            foo_stats = FooKVConnectorStats()
@@ -751,10 +712,8 @@ def test_multi_kv_connector_stats_aggregation():
    assert isinstance(kv_connector_stats, MultiKVConnectorStats)

    # Validate per-connector totals across workers
    assert isinstance(kv_connector_stats["NixlConnector"],
                      NixlKVConnectorStats)
    assert kv_connector_stats["NixlConnector"].num_successful_transfers == 5
    assert isinstance(kv_connector_stats["FooConnector"], FooKVConnectorStats)
    assert kv_connector_stats["NixlConnector"].data[
        "num_successful_transfers"] == 5
    assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6

@@ -796,8 +755,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
        "working_dir": working_dir,  # ship fake nixl package
        "env_vars": {
            "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
            # TODO: for ray to carry over, remove once we set
            "NIXL_TELEMETRY_ENABLE": "1",
        },
    }
    ray.init(runtime_env=runtime_env)
@@ -8,10 +8,10 @@ import pytest
import torch

from tests.utils import get_attn_backend_list_based_on_platform
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
@@ -6,10 +6,10 @@ from unittest import mock
import pytest
import torch

from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
@@ -6,10 +6,9 @@ from typing import Optional

import torch

from tests.v1.attention.utils import (create_standard_kv_cache_spec,
from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec,
                                      create_vllm_config,
                                      get_attention_backend)
from vllm.attention.backends.registry import _Backend
from vllm.config import ParallelConfig, SpeculativeConfig
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
@@ -1,27 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention backend registry"""

import enum


class _Backend(enum.Enum):
    FLASH_ATTN = enum.auto()
    TRITON_ATTN = enum.auto()
    XFORMERS = enum.auto()
    ROCM_FLASH = enum.auto()
    ROCM_AITER_MLA = enum.auto()
    ROCM_AITER_FA = enum.auto()  # used for ViT attn backend
    TORCH_SDPA = enum.auto()
    FLASHINFER = enum.auto()
    FLASHINFER_MLA = enum.auto()
    TRITON_MLA = enum.auto()
    CUTLASS_MLA = enum.auto()
    FLASHMLA = enum.auto()
    FLASH_ATTN_MLA = enum.auto()
    PALLAS = enum.auto()
    IPEX = enum.auto()
    NO_ATTENTION = enum.auto()
    FLEX_ATTENTION = enum.auto()
    TREE_ATTN = enum.auto()
    ROCM_ATTN = enum.auto()
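The enum above is pure bookkeeping; resolving a user-supplied backend name (for example from VLLM_ATTENTION_BACKEND) against it is essentially a dictionary lookup. The sketch below is illustrative only: it uses a trimmed copy of the enum and a hand-written resolver, not the real backend_name_to_enum helper from vllm.attention.selector.

import enum
from typing import Optional


class _Backend(enum.Enum):  # trimmed copy for illustration only
    FLASH_ATTN = enum.auto()
    TORCH_SDPA = enum.auto()
    FLASHINFER = enum.auto()


def backend_name_to_enum(name: str) -> Optional[_Backend]:
    # Unknown names map to None so a caller can fall back to a default
    # backend instead of raising inside config parsing.
    return _Backend.__members__.get(name)


assert backend_name_to_enum("FLASH_ATTN") is _Backend.FLASH_ATTN
assert backend_name_to_enum("NOT_A_BACKEND") is None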
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Attention layer."""
|
||||
from typing import Callable, List, Optional
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -10,7 +10,6 @@ import torch.nn.functional as F
|
||||
import vllm.envs as envs
|
||||
from vllm.attention import AttentionType
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
|
||||
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
|
||||
from vllm.config import CacheConfig, get_current_vllm_config
|
||||
@ -27,7 +26,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
GroupShape)
|
||||
from vllm.model_executor.models.vision import get_vit_attn_backend
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import GiB_bytes, direct_register_custom_op
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -68,39 +67,9 @@ def check_upstream_fa_availability(dtype: torch.dtype):
|
||||
) and current_platform.has_device_capability(80):
|
||||
from transformers.utils import is_flash_attn_2_available
|
||||
return is_flash_attn_2_available()
|
||||
if current_platform.is_rocm():
|
||||
from importlib.util import find_spec
|
||||
return find_spec("flash_attn") is not None
|
||||
return False
|
||||
|
||||
|
||||
def maybe_get_vit_flash_attn_backend(
|
||||
attn_backend: _Backend,
|
||||
use_upstream_fa: bool) -> tuple[_Backend, Callable]:
|
||||
if attn_backend != _Backend.FLASH_ATTN and \
|
||||
attn_backend != _Backend.ROCM_AITER_FA and \
|
||||
check_upstream_fa_availability(torch.get_default_dtype()):
|
||||
attn_backend = _Backend.FLASH_ATTN
|
||||
use_upstream_fa = True
|
||||
|
||||
if current_platform.is_rocm() and \
|
||||
attn_backend == _Backend.FLASH_ATTN:
|
||||
use_upstream_fa = True
|
||||
|
||||
if (attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}):
|
||||
if attn_backend == _Backend.ROCM_AITER_FA:
|
||||
from aiter import flash_attn_varlen_func
|
||||
else:
|
||||
if use_upstream_fa:
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
else:
|
||||
from vllm.vllm_flash_attn import flash_attn_varlen_func
|
||||
else:
|
||||
flash_attn_varlen_func = None
|
||||
|
||||
return attn_backend, flash_attn_varlen_func
|
||||
|
||||
|
||||
class Attention(nn.Module, AttentionLayerBase):
|
||||
"""Attention layer.
|
||||
|
||||
@ -440,9 +409,13 @@ class MultiHeadAttention(nn.Module):
|
||||
# to upstream flash attention if available.
|
||||
# If vllm native fa is selected, we use it directly.
|
||||
use_upstream_fa = False
|
||||
if backend != _Backend.FLASH_ATTN and check_upstream_fa_availability(
|
||||
dtype):
|
||||
backend = _Backend.FLASH_ATTN
|
||||
use_upstream_fa = True
|
||||
|
||||
if current_platform.is_xpu():
|
||||
# currently, only torch_sdpa is supported on xpu
|
||||
if current_platform.is_rocm() or current_platform.is_xpu():
|
||||
# currently, only torch_sdpa is supported on rocm/xpu
|
||||
self.attn_backend = _Backend.TORCH_SDPA
|
||||
else:
|
||||
|
||||
@ -454,25 +427,17 @@ class MultiHeadAttention(nn.Module):
|
||||
_Backend.FLASH_ATTN,
|
||||
} else _Backend.TORCH_SDPA
|
||||
|
||||
self.attn_backend, self._flash_attn_varlen_func \
|
||||
= maybe_get_vit_flash_attn_backend(
|
||||
self.attn_backend,
|
||||
use_upstream_fa,
|
||||
)
|
||||
|
||||
if (self.attn_backend == _Backend.XFORMERS
|
||||
and not check_xformers_availability()):
|
||||
self.attn_backend = _Backend.TORCH_SDPA
|
||||
|
||||
self.is_flash_attn_backend = self.attn_backend in {
|
||||
_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA
|
||||
}
|
||||
|
||||
# this condition is just to make sure that the
|
||||
# use_upstream_fa in the log is correct
|
||||
if current_platform.is_rocm() \
|
||||
and self.attn_backend == _Backend.FLASH_ATTN:
|
||||
use_upstream_fa = True
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
if use_upstream_fa:
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
self._flash_attn_varlen_func = flash_attn_varlen_func
|
||||
else:
|
||||
from vllm.vllm_flash_attn import flash_attn_varlen_func
|
||||
self._flash_attn_varlen_func = flash_attn_varlen_func
|
||||
|
||||
logger.info_once(
|
||||
f"MultiHeadAttention attn_backend: {self.attn_backend}, "
|
||||
@ -500,7 +465,7 @@ class MultiHeadAttention(nn.Module):
|
||||
key = torch.repeat_interleave(key, num_repeat, dim=2)
|
||||
value = torch.repeat_interleave(value, num_repeat, dim=2)
|
||||
|
||||
if self.is_flash_attn_backend:
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
|
||||
step=q_len,
|
||||
dtype=torch.int32,
|
||||
@ -541,6 +506,14 @@ class MultiHeadAttention(nn.Module):
|
||||
from torch_xla.experimental.custom_kernel import flash_attention
|
||||
out = flash_attention(query, key, value, sm_scale=self.scale)
|
||||
out = out.transpose(1, 2)
|
||||
elif self.attn_backend == _Backend.ROCM_AITER_FA:
|
||||
from aiter import flash_attn_varlen_func
|
||||
|
||||
# ROCm Flash Attention expects (batch, seq, heads, head_dim)
|
||||
out = flash_attn_varlen_func(query,
|
||||
key,
|
||||
value,
|
||||
softmax_scale=self.scale)
|
||||
else:
|
||||
# ViT attention hasn't supported this backend yet
|
||||
raise NotImplementedError(
|
||||
|
@ -11,9 +11,8 @@ import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.attention.backends.abstract import AttentionBackend
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
@ -235,22 +235,16 @@ class BenchmarkDataset(ABC):
|
||||
|
||||
if len(requests) < num_requests:
|
||||
random.seed(self.random_seed)
|
||||
needed = num_requests - len(requests)
|
||||
additional = []
|
||||
for i in range(needed):
|
||||
req = deepcopy(random.choice(requests))
|
||||
additional = deepcopy(
|
||||
random.choices(requests, k=num_requests - len(requests))
|
||||
)
|
||||
for i in range(len(additional)):
|
||||
req = additional[i]
|
||||
req.request_id = request_id_prefix + str(len(requests) + i)
|
||||
additional.append(req)
|
||||
requests.extend(additional)
|
||||
logger.info("Oversampled requests to reach %d total samples.",
|
||||
num_requests)
|
||||
|
||||
ids = [req.request_id for req in requests]
|
||||
if len(ids) != len(set(ids)):
|
||||
raise ValueError("Duplicate request_id found in the sampled "
|
||||
"requests. Please ensure that each request_id "
|
||||
"is unique.")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Utility Functions and Global Caches
|
||||
@ -1157,12 +1151,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
help="Do not oversample if the dataset has " \
|
||||
"fewer samples than num-prompts.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-chat-template",
|
||||
action="store_true",
|
||||
help=
|
||||
"Skip applying chat template to prompt for datasets that support it.",
|
||||
)
|
||||
|
||||
# group for dataset specific arguments
|
||||
custom_group = parser.add_argument_group("custom dataset options")
|
||||
@ -1173,6 +1161,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
||||
help=
|
||||
"Number of output tokens per request, used only for custom dataset.",
|
||||
)
|
||||
custom_group.add_argument(
|
||||
"--custom-skip-chat-template",
|
||||
action="store_true",
|
||||
help=
|
||||
"Skip applying chat template to prompt, used only for custom dataset.",
|
||||
)
|
||||
|
||||
spec_bench_group = parser.add_argument_group("spec bench dataset options")
|
||||
spec_bench_group.add_argument(
|
||||
@ -1441,7 +1435,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
output_len=args.custom_output_len,
|
||||
skip_chat_template=args.skip_chat_template,
|
||||
skip_chat_template=args.custom_skip_chat_template,
|
||||
request_id_prefix=args.request_id_prefix,
|
||||
no_oversample=args.no_oversample,
|
||||
)
|
||||
@ -1582,7 +1576,6 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
output_len=args.hf_output_len,
|
||||
request_id_prefix=args.request_id_prefix,
|
||||
no_oversample=args.no_oversample,
|
||||
skip_chat_template=args.skip_chat_template,
|
||||
**hf_kwargs
|
||||
)
|
||||
|
||||
@ -1822,6 +1815,7 @@ class SpecBench(CustomDataset):
|
||||
|
||||
def sample(self, **kwargs) -> list:
|
||||
# leverage CustomDataset sample
|
||||
kwargs["skip_chat_template"] = False
|
||||
return super().sample(**kwargs)
|
||||
|
||||
|
||||
@ -2227,7 +2221,6 @@ class InstructCoderDataset(HuggingFaceDataset):
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
skip_chat_template: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
**kwargs) -> list:
|
||||
@ -2243,15 +2236,14 @@ class InstructCoderDataset(HuggingFaceDataset):
|
||||
)
|
||||
|
||||
# apply template
|
||||
if not skip_chat_template:
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
sampled_requests.append(
|
||||
@ -2292,7 +2284,6 @@ class MTBenchDataset(HuggingFaceDataset):
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
skip_chat_template: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
**kwargs,
|
||||
@ -2307,15 +2298,14 @@ class MTBenchDataset(HuggingFaceDataset):
|
||||
prompt = item["turns"][0]
|
||||
|
||||
# apply template
|
||||
if not skip_chat_template:
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
sampled_requests.append(
|
||||
@ -2359,7 +2349,6 @@ class BlazeditDataset(HuggingFaceDataset):
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
skip_chat_template: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
min_distance: float = 0.0,
|
||||
@ -2383,7 +2372,7 @@ class BlazeditDataset(HuggingFaceDataset):
|
||||
|
||||
# template copied from
|
||||
# https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
|
||||
prompt = f"""Given a code file, please apply the change requests and generate the new file.
|
||||
instruction = f"""Given a code file, please apply the change requests and generate the new file.
|
||||
|
||||
Original file:
|
||||
```python
|
||||
@ -2396,15 +2385,14 @@ Change request:
|
||||
Please generate the new code file in the "New file" section below.""" # noqa: E501
|
||||
|
||||
# apply template
|
||||
if not skip_chat_template:
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"content": instruction
|
||||
}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
|
||||
|
@ -20,7 +20,7 @@ from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
|
||||
MultiModalConfig)
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.config.scheduler import RunnerType
|
||||
from vllm.config.utils import assert_hashable, config, getattr_iter
|
||||
from vllm.config.utils import assert_hashable, config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import (
|
||||
@ -276,9 +276,7 @@ class ModelConfig:
|
||||
multimodal_config: Optional[MultiModalConfig] = None
|
||||
"""Configuration for multimodal model. If `None`, this will be inferred
|
||||
from the architecture of `self.model`."""
|
||||
limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int,
|
||||
dict[str,
|
||||
int]]]]] = None
|
||||
limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
|
||||
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
|
||||
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
|
||||
mm_processor_cache_gb: InitVar[Optional[float]] = None
|
||||
@ -669,8 +667,6 @@ class ModelConfig:
|
||||
def _get_transformers_backend_cls(self) -> str:
|
||||
"""Determine which Transformers backend class will be used if
|
||||
`model_impl` is set to `transformers` or `auto`."""
|
||||
prefix = "Transformers"
|
||||
prefix += "MoE" if self.get_num_experts() > 1 else ""
|
||||
# Check if the architecture we're wrapping has defaults
|
||||
runner = None
|
||||
convert = None
|
||||
@ -689,15 +685,15 @@ class ModelConfig:
|
||||
# Resolve Transformers backend pooling classes
|
||||
if runner == "pooling":
|
||||
if convert == "embed":
|
||||
return prefix + "EmbeddingModel"
|
||||
return "TransformersEmbeddingModel"
|
||||
if convert == "classify":
|
||||
return prefix + "ForSequenceClassification"
|
||||
return "TransformersForSequenceClassification"
|
||||
# Resolve Transformers backend generate classes
|
||||
if self.hf_config != self.hf_text_config:
|
||||
# If 'hf_text_config' is the same as 'hf_config'. If not, it is
|
||||
# probably a composite config, i.e. multimodal
|
||||
return prefix + "ForMultimodalLM"
|
||||
return prefix + "ForCausalLM"
|
||||
return "TransformersForMultimodalLM"
|
||||
return "TransformersForCausalLM"
|
||||
|
||||
def using_transformers_backend(self) -> bool:
|
||||
"""Check if the model is using the Transformers backend class."""
|
||||
@ -1029,7 +1025,17 @@ class ModelConfig:
|
||||
self.enforce_eager = True
|
||||
|
||||
def _verify_with_expert_parallelism(self) -> None:
|
||||
num_experts = self.get_num_experts()
|
||||
num_expert_names = [
|
||||
"moe_num_experts", # Dbrx
|
||||
"num_experts", # Jamba
|
||||
"n_routed_experts", # DeepSeek
|
||||
"num_local_experts", # Mixtral
|
||||
]
|
||||
num_experts = 0
|
||||
for name in num_expert_names:
|
||||
num_experts = getattr(self.hf_text_config, name, 0)
|
||||
if num_experts > 0:
|
||||
break
|
||||
if num_experts < 1:
|
||||
raise ValueError(
|
||||
"Number of experts in the model must be greater than 0 "
|
||||
@ -1214,21 +1220,6 @@ class ModelConfig:
|
||||
num_heads = getattr(self.hf_text_config, "num_attention_heads", 0)
|
||||
return num_heads // parallel_config.tensor_parallel_size
|
||||
|
||||
def get_num_experts(self) -> int:
|
||||
"""Returns the number of experts in the model."""
|
||||
num_expert_names = [
|
||||
"num_experts", # Jamba
|
||||
"moe_num_experts", # Dbrx
|
||||
"n_routed_experts", # DeepSeek
|
||||
"num_local_experts", # Mixtral
|
||||
]
|
||||
num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0)
|
||||
if isinstance(num_experts, list):
|
||||
# Ernie VL's remote code uses list[int]...
|
||||
# The values are always the same so we just take the first one.
|
||||
return num_experts[0]
|
||||
return num_experts
|
||||
|
||||
def get_layers_start_end_indices(
|
||||
self, parallel_config: ParallelConfig) -> tuple[int, int]:
|
||||
from vllm.distributed.utils import get_pp_indices
|
||||
|
@ -4,45 +4,15 @@
|
||||
import hashlib
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
from pydantic import ConfigDict, Field, field_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config.utils import config
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseDummyOptions:
|
||||
"""Base options for generating dummy data during profiling."""
|
||||
count: int = Field(999, ge=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
|
||||
class VideoDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy video data during profiling."""
|
||||
num_frames: Optional[int] = Field(None, gt=0)
|
||||
width: Optional[int] = Field(None, gt=0)
|
||||
height: Optional[int] = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
|
||||
class ImageDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy image data during profiling."""
|
||||
width: Optional[int] = Field(None, gt=0)
|
||||
height: Optional[int] = Field(None, gt=0)
|
||||
|
||||
|
||||
@dataclass(config=ConfigDict(extra="forbid"))
|
||||
class AudioDummyOptions(BaseDummyOptions):
|
||||
"""Options for generating dummy audio data during profiling."""
|
||||
length: Optional[int] = Field(None, gt=0)
|
||||
|
||||
|
||||
MMEncoderTPMode = Literal["weights", "data"]
|
||||
MMCacheType = Literal["shm", "lru"]
|
||||
DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
|
||||
AudioDummyOptions]
|
||||
|
||||
|
||||
@config
|
||||
@ -50,22 +20,12 @@ DummyOptions = Union[BaseDummyOptions, VideoDummyOptions, ImageDummyOptions,
|
||||
class MultiModalConfig:
|
||||
"""Controls the behavior of multimodal models."""
|
||||
|
||||
limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict)
|
||||
"""The maximum number of input items and options allowed per
|
||||
prompt for each modality.
|
||||
Defaults to 999 for each modality.
|
||||
limit_per_prompt: dict[str, int] = field(default_factory=dict)
|
||||
"""The maximum number of input items allowed per prompt for each modality.
|
||||
Defaults to 1 (V0) or 999 (V1) for each modality.
|
||||
|
||||
Legacy format (count only):
|
||||
{"image": 16, "video": 2}
|
||||
|
||||
Configurable format (with options):
|
||||
{"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
|
||||
"image": {"count": 5, "width": 512, "height": 512}}
|
||||
|
||||
Mixed format (combining both):
|
||||
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
|
||||
"height": 512}}
|
||||
"""
|
||||
For example, to allow up to 16 images and 2 videos per prompt:
|
||||
`{"image": 16, "video": 2}`"""
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
@ -124,27 +84,6 @@ class MultiModalConfig:
|
||||
from each video to be pruned.
|
||||
"""
|
||||
|
||||
@field_validator("limit_per_prompt", mode="before")
|
||||
@classmethod
|
||||
def _validate_limit_per_prompt(
|
||||
cls, value: dict[str, Union[int,
|
||||
dict[str,
|
||||
int]]]) -> dict[str, DummyOptions]:
|
||||
for k, v in value.items():
|
||||
# Handle legacy format where only count is specified
|
||||
if isinstance(v, int):
|
||||
v = {"count": v}
|
||||
# Convert to the appropriate DummyOptions subclass
|
||||
if k == "video":
|
||||
value[k] = VideoDummyOptions(**v)
|
||||
elif k == "image":
|
||||
value[k] = ImageDummyOptions(**v)
|
||||
elif k == "audio":
|
||||
value[k] = AudioDummyOptions(**v)
|
||||
else:
|
||||
value[k] = BaseDummyOptions(**v)
|
||||
return value
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
WARNING: Whenever a new field is added to this config,
|
||||
@@ -167,22 +106,12 @@ class MultiModalConfig:
    def get_limit_per_prompt(self, modality: str) -> int:
        """
        Get the maximum number of input items allowed per prompt
        for the given modality (backward compatible).
        for the given modality.
        """
        limit_data = self.limit_per_prompt.get(modality)

        if limit_data is None:
            # Unspecified modality is set to 999 by default
            return 999
        return limit_data.count

    def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
        """
        Get the configurable dummy data options for a modality.
        Returns None if no options are configured for this modality.
        """
        # All values are now DummyOptions after normalization
        return self.limit_per_prompt.get(modality)
        return self.limit_per_prompt.get(
            modality,
            999 if envs.VLLM_USE_V1 else 1,
        )

    def merge_mm_processor_kwargs(
        self,
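To make the limit_per_prompt change above concrete, here is an illustrative sketch (not the vLLM implementation) of how the legacy count-only format and the configurable dict format can both be normalized into option objects; the class names mirror the ones removed in this hunk, but the dataclasses here are plain stand-ins.

from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class BaseDummyOptions:
    count: int = 999


@dataclass
class ImageDummyOptions(BaseDummyOptions):
    width: Optional[int] = None
    height: Optional[int] = None


def normalize(value: dict[str, Union[int, dict]]) -> dict[str, BaseDummyOptions]:
    out = {}
    for modality, v in value.items():
        if isinstance(v, int):  # legacy count-only form, e.g. {"video": 2}
            v = {"count": v}
        cls = ImageDummyOptions if modality == "image" else BaseDummyOptions
        out[modality] = cls(**v)
    return out


limits = normalize({"image": {"count": 5, "width": 512, "height": 512}, "video": 2})
assert limits["image"].count == 5 and limits["video"].count == 2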
@ -516,23 +516,6 @@ class VllmConfig:
|
||||
" by VLLM_DEBUG_DUMP_PATH to %s", env_path)
|
||||
self.compilation_config.debug_dump_path = env_path
|
||||
|
||||
def has_blocked_weights():
|
||||
if self.quant_config is not None:
|
||||
if hasattr(self.quant_config, "weight_block_size"):
|
||||
return self.quant_config.weight_block_size is not None
|
||||
elif hasattr(self.quant_config, "has_blocked_weights"):
|
||||
return self.quant_config.has_blocked_weights()
|
||||
return False
|
||||
|
||||
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
|
||||
# On H100 the CUDA kernel is faster than
|
||||
# native implementation
|
||||
# https://github.com/vllm-project/vllm/issues/25094
|
||||
if has_blocked_weights():
|
||||
custom_ops = self.compilation_config.custom_ops
|
||||
if "none" not in custom_ops and "-quant_fp8" not in custom_ops:
|
||||
custom_ops.append("+quant_fp8")
|
||||
|
||||
def update_sizes_for_sequence_parallelism(self,
|
||||
possible_sizes: list) -> list:
|
||||
# remove the sizes that not multiple of tp_size when
|
||||
|
@ -54,7 +54,6 @@ class HTTPConnection:
|
||||
stream: bool = False,
|
||||
timeout: Optional[float] = None,
|
||||
extra_headers: Optional[Mapping[str, str]] = None,
|
||||
allow_redirects: bool = True,
|
||||
):
|
||||
self._validate_http_url(url)
|
||||
|
||||
@ -64,8 +63,7 @@ class HTTPConnection:
|
||||
return client.get(url,
|
||||
headers=self._headers(**extra_headers),
|
||||
stream=stream,
|
||||
timeout=timeout,
|
||||
allow_redirects=allow_redirects)
|
||||
timeout=timeout)
|
||||
|
||||
async def get_async_response(
|
||||
self,
|
||||
@ -73,7 +71,6 @@ class HTTPConnection:
|
||||
*,
|
||||
timeout: Optional[float] = None,
|
||||
extra_headers: Optional[Mapping[str, str]] = None,
|
||||
allow_redirects: bool = True,
|
||||
):
|
||||
self._validate_http_url(url)
|
||||
|
||||
@ -82,17 +79,10 @@ class HTTPConnection:
|
||||
|
||||
return client.get(url,
|
||||
headers=self._headers(**extra_headers),
|
||||
timeout=timeout,
|
||||
allow_redirects=allow_redirects)
|
||||
timeout=timeout)
|
||||
|
||||
def get_bytes(self,
|
||||
url: str,
|
||||
*,
|
||||
timeout: Optional[float] = None,
|
||||
allow_redirects: bool = True) -> bytes:
|
||||
with self.get_response(url,
|
||||
timeout=timeout,
|
||||
allow_redirects=allow_redirects) as r:
|
||||
def get_bytes(self, url: str, *, timeout: Optional[float] = None) -> bytes:
|
||||
with self.get_response(url, timeout=timeout) as r:
|
||||
r.raise_for_status()
|
||||
|
||||
return r.content
|
||||
@ -102,10 +92,8 @@ class HTTPConnection:
|
||||
url: str,
|
||||
*,
|
||||
timeout: Optional[float] = None,
|
||||
allow_redirects: bool = True,
|
||||
) -> bytes:
|
||||
async with await self.get_async_response(
|
||||
url, timeout=timeout, allow_redirects=allow_redirects) as r:
|
||||
async with await self.get_async_response(url, timeout=timeout) as r:
|
||||
r.raise_for_status()
|
||||
|
||||
return await r.read()
|
||||
|
@ -4,7 +4,6 @@ import contextlib
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
@ -21,7 +20,6 @@ import torch
|
||||
import zmq
|
||||
|
||||
from vllm import envs
|
||||
from vllm.attention.backends.registry import _Backend
|
||||
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
@ -34,7 +32,7 @@ from vllm.distributed.parallel_state import (
|
||||
from vllm.distributed.utils import divide
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import _Backend, current_platform
|
||||
from vllm.utils import make_zmq_path, make_zmq_socket
|
||||
from vllm.v1.attention.backends.utils import get_kv_cache_layout
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
@ -55,12 +53,10 @@ logger = init_logger(__name__)
|
||||
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
|
||||
try:
|
||||
from nixl._api import nixl_agent as NixlWrapper
|
||||
from nixl._bindings import nixlXferTelemetry
|
||||
logger.info("NIXL is available")
|
||||
except ImportError:
|
||||
logger.warning("NIXL is not available")
|
||||
NixlWrapper = None
|
||||
nixlXferTelemetry = None
|
||||
|
||||
try:
|
||||
from nixl._api import nixl_agent_config
|
||||
@ -479,9 +475,6 @@ class NixlConnectorWorker:
|
||||
self.nixl_backends = \
|
||||
vllm_config.kv_transfer_config.get_from_extra_config(
|
||||
"backends", ["UCX"])
|
||||
# TODO temporary, once nixl allows for telemetry flag in config
|
||||
# (next release), we can remove this env var.
|
||||
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
|
||||
# Agent.
|
||||
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
|
||||
if nixl_agent_config is None:
|
||||
@ -1181,10 +1174,9 @@ class NixlConnectorWorker:
|
||||
for handle, _xfer_stime in handles:
|
||||
xfer_state = self.nixl_wrapper.check_xfer_state(handle)
|
||||
if xfer_state == "DONE":
|
||||
# Get telemetry from NIXL
|
||||
res = self.nixl_wrapper.get_xfer_telemetry(handle)
|
||||
self.xfer_stats.record_transfer(res)
|
||||
self.nixl_wrapper.release_xfer_handle(handle)
|
||||
# TODO (NickLucche) Get from NIXL telemetry once integrated
|
||||
self.xfer_stats.record_transfer()
|
||||
elif xfer_state == "PROC":
|
||||
in_progress = True
|
||||
continue
|
||||
@@ -1456,25 +1448,15 @@ class NixlKVConnectorStats(KVConnectorStats):
    """Container for transfer performance metrics"""

    def __post_init__(self):
        if not self.data:
            # Empty container init, no data is passed in.
            self.reset()
        if "num_successful_transfers" not in self.data:
            self.data["num_successful_transfers"] = 0

    def reset(self):
        # Must be serializable
        self.data: dict[str, list[float]] = {
            "transfer_duration": [],
            "post_duration": [],
            "bytes_transferred": [],
            "num_descriptors": [],
        }
        self.data = {"num_successful_transfers": 0}

    def record_transfer(self, res: nixlXferTelemetry):
        # Keep metrics units consistent with rest of the code: time us->s
        self.data["transfer_duration"].append(res.xferDuration / 1e6)
        self.data["post_duration"].append(res.postDuration / 1e6)
        self.data["bytes_transferred"].append(res.totalBytes)
        self.data["num_descriptors"].append(res.descCount)
    def record_transfer(self):
        # TODO: record actual transfer stats when available
        self.data["num_successful_transfers"] += 1

    def clone_and_reset(self) -> "NixlKVConnectorStats":
        old = copy.copy(self)
@@ -1482,55 +1464,16 @@ class NixlKVConnectorStats(KVConnectorStats):
        return old

    def is_empty(self) -> bool:
        return self.num_successful_transfers == 0
        return self.data["num_successful_transfers"] == 0

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        if not other.is_empty():
            for k, v in other.data.items():
                accumulator = self.data[k]
                assert isinstance(accumulator, list)
                accumulator.extend(v)
            self.data["num_successful_transfers"] += other.data[
                "num_successful_transfers"]
        return self

    def reduce(self) -> dict[str, Union[int, float]]:
        # Compute compact representative stats suitable for CLI logging
        if self.is_empty():
            return {
                "Num successful transfers": 0,
                "Avg xfer time (ms)": 0,
                "P90 xfer time (ms)": 0,
                "Avg post time (ms)": 0,
                "P90 post time (ms)": 0,
                "Avg MB per transfer": 0,
                "Throughput (MB/s)": 0,
                "Avg number of descriptors": 0,
            }

        xfer_time = np.asarray(self.data["transfer_duration"])
        post_time = np.asarray(self.data["post_duration"])
        # Convert to MB for CLI logging.
        mb = np.asarray(self.data["bytes_transferred"]) / 2**20
        descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
        n = len(descs)
        assert n == self.num_successful_transfers

        total_mb = mb.sum()
        avg_mb = total_mb / n

        total_time_seconds = xfer_time.sum()
        throughput_mb_s = total_mb / total_time_seconds

        # TODO: reduce stats to a single value, calculate latency/throughput
        return {
            "Num successful transfers": n,
            "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
            "P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
            "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
            "P90 post time (ms)": round(np.percentile(post_time, 90) * 1e3, 3),
            "Avg MB per transfer": round(avg_mb, 3),
            "Throughput (MB/s)": round(throughput_mb_s, 3),
            "Avg number of descriptors": round(descs.mean(), 1),
            "num_successful_transfers": self.data["num_successful_transfers"]
        }

    @property
    def num_successful_transfers(self) -> int:
        return len(self.data["transfer_duration"])
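A worked example of the reduce() arithmetic above, using hand-rolled telemetry dicts in place of nixlXferTelemetry (values are illustrative only, chosen so the results are easy to verify by hand):

import numpy as np

# Two fake transfers: 1 s / 1 MiB / 1 descriptor and 2 s / 2 MiB / 2 descriptors.
samples = [
    {"xferDuration": 1e6, "postDuration": 1e6, "totalBytes": 2**20, "descCount": 1},
    {"xferDuration": 2e6, "postDuration": 2e6, "totalBytes": 2 * 2**20, "descCount": 2},
]

xfer_s = np.asarray([s["xferDuration"] / 1e6 for s in samples])  # us -> s
mb = np.asarray([s["totalBytes"] for s in samples]) / 2**20

print({
    "Num successful transfers": len(samples),
    "Avg xfer time (ms)": round(xfer_s.mean() * 1e3, 3),      # 1500.0
    "Throughput (MB/s)": round(mb.sum() / xfer_s.sum(), 3),   # 1.0
})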
@ -376,7 +376,7 @@ class EngineArgs:
|
||||
quantization: Optional[QuantizationMethods] = ModelConfig.quantization
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = \
|
||||
limit_mm_per_prompt: dict[str, int] = \
|
||||
get_field(MultiModalConfig, "limit_per_prompt")
|
||||
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
||||
media_io_kwargs: dict[str, dict[str,
|
||||
@ -1131,10 +1131,6 @@ class EngineArgs:
|
||||
device_config = DeviceConfig(
|
||||
device=cast(Device, current_platform.device_type))
|
||||
|
||||
model_config = self.create_model_config()
|
||||
self.model = model_config.model
|
||||
self.tokenizer = model_config.tokenizer
|
||||
|
||||
(self.model, self.tokenizer,
|
||||
self.speculative_config) = maybe_override_with_speculators(
|
||||
model=self.model,
|
||||
@ -1143,6 +1139,7 @@ class EngineArgs:
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
vllm_speculative_config=self.speculative_config,
|
||||
)
|
||||
model_config = self.create_model_config()
|
||||
|
||||
# * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
|
||||
# and fall back to V0 for experimental or unsupported features.
|
||||
|
@ -19,7 +19,6 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import Device, collect_from_async_generator, random_uuid
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -50,16 +49,12 @@ class EngineClient(ABC):
|
||||
@abstractmethod
|
||||
def generate(
|
||||
self,
|
||||
prompt: Union[EngineCoreRequest, PromptType],
|
||||
prompt: PromptType,
|
||||
sampling_params: SamplingParams,
|
||||
request_id: str,
|
||||
*,
|
||||
prompt_text: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
trace_headers: Optional[Mapping[str, str]] = None,
|
||||
priority: int = 0,
|
||||
data_parallel_rank: Optional[int] = None,
|
||||
) -> AsyncGenerator[RequestOutput, None]:
|
||||
"""Generate outputs for a request."""
|
||||
...
|
||||
@ -99,15 +94,10 @@ class EngineClient(ABC):
|
||||
# this happens again in generation, so the double expansion causes
|
||||
# a mismatch.
|
||||
# TODO - would be ideal to handle this more gracefully.
|
||||
if isinstance(prompt, str):
|
||||
prompt_text = prompt
|
||||
prompt_token_ids = []
|
||||
multi_modal_data = None
|
||||
else:
|
||||
prompt_text = prompt.get("prompt")
|
||||
prompt_token_ids = prompt.get("prompt_token_ids", [])
|
||||
multi_modal_data = prompt.get("multi_modal_data")
|
||||
prompt_token_ids = prompt.get("prompt_token_ids")
|
||||
multi_modal_data = prompt.get("multi_modal_data")
|
||||
|
||||
prompt_text = processed_inputs.get("prompt")
|
||||
mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
|
||||
|
||||
tokenized_length = len(prompt_token_ids)
|
||||
|
@ -37,7 +37,6 @@ from vllm.entrypoints.utils import (_validate_truncation_size,
|
||||
log_non_default_args)
|
||||
from vllm.inputs import (DataPrompt, PromptType, SingletonPrompt, TextPrompt,
|
||||
TokensPrompt)
|
||||
from vllm.inputs.parse import get_prompt_components
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
@ -50,13 +49,10 @@ from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
|
||||
SamplingParams)
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer,
|
||||
init_tokenizer_from_configs)
|
||||
get_cached_tokenizer)
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import Counter, Device, as_iter, is_list_of
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
from vllm.v1.engine.processor import Processor
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -316,10 +312,6 @@ class LLM:
|
||||
self.io_processor = get_io_processor(self.llm_engine.vllm_config,
|
||||
io_processor_plugin)
|
||||
|
||||
@property
|
||||
def model_config(self):
|
||||
return self.llm_engine.model_config
|
||||
|
||||
def get_tokenizer(self) -> AnyTokenizer:
|
||||
return self.llm_engine.get_tokenizer()
|
||||
|
||||
@ -332,16 +324,6 @@ class LLM:
|
||||
else:
|
||||
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
|
||||
|
||||
def _get_processor(self) -> Processor:
|
||||
if not hasattr(self, "_processor"):
|
||||
vllm_config = self.llm_engine.vllm_config
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = init_tokenizer_from_configs(self.model_config)
|
||||
self._processor = Processor(vllm_config, tokenizer)
|
||||
return self._processor
|
||||
|
||||
def get_default_sampling_params(self) -> SamplingParams:
|
||||
if self.default_sampling_params is None:
|
||||
self.default_sampling_params = (
|
||||
@ -1515,6 +1497,8 @@ class LLM:
|
||||
tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
|
||||
it = tqdm_func(it, desc="Adding requests")
|
||||
|
||||
model_config = self.llm_engine.model_config
|
||||
|
||||
for i, prompt in enumerate(it):
|
||||
|
||||
if isinstance(prompt, dict):
|
||||
@ -1522,9 +1506,17 @@ class LLM:
|
||||
prompt.get("multi_modal_data"),
|
||||
prompt.get("multi_modal_uuids"))
|
||||
|
||||
param = params[i] if isinstance(params, Sequence) else params
|
||||
|
||||
tokenization_kwargs: dict[str, Any] = {}
|
||||
_validate_truncation_size(model_config.max_model_len,
|
||||
param.truncate_prompt_tokens,
|
||||
tokenization_kwargs)
|
||||
|
||||
self._add_request(
|
||||
prompt,
|
||||
params[i] if isinstance(params, Sequence) else params,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
lora_request=lora_request[i] if isinstance(
|
||||
lora_request, Sequence) else lora_request,
|
||||
priority=priority[i] if priority else 0,
|
||||
@ -1565,58 +1557,22 @@ class LLM:
|
||||
raise ValueError(f"Multi-modal data for {modality} is None"
|
||||
f" but UUID is not provided")
|
||||
|
||||
def _process_inputs(
|
||||
self,
|
||||
request_id: str,
|
||||
engine_prompt: PromptType,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
*,
|
||||
lora_request: Optional[LoRARequest],
|
||||
priority: int,
|
||||
) -> tuple[EngineCoreRequest, dict[str, Any]]:
|
||||
"""Use the Processor to process inputs for LLMEngine."""
|
||||
tokenization_kwargs: dict[str, Any] = {}
|
||||
_validate_truncation_size(self.model_config.max_model_len,
|
||||
params.truncate_prompt_tokens,
|
||||
tokenization_kwargs)
|
||||
|
||||
processor = self._get_processor()
|
||||
engine_request = processor.process_inputs(
|
||||
request_id,
|
||||
engine_prompt,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
)
|
||||
return engine_request, tokenization_kwargs
|
||||
|
||||
def _add_request(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
priority: int = 0,
|
||||
) -> None:
|
||||
prompt_text, _, _ = get_prompt_components(prompt)
|
||||
request_id = str(next(self.request_counter))
|
||||
|
||||
engine_request, tokenization_kwargs = self._process_inputs(
|
||||
self.llm_engine.add_request(
|
||||
request_id,
|
||||
prompt,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
self.llm_engine.add_request(
|
||||
request_id,
|
||||
engine_request,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
prompt_text=prompt_text,
|
||||
)
|
||||
|
||||
def _run_engine(
|
||||
|
@ -274,8 +274,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
generators: list[AsyncGenerator[RequestOutput, None]] = []
|
||||
try:
|
||||
for i, engine_prompt in enumerate(engine_prompts):
|
||||
prompt_text, _, _ = (self._get_prompt_components(
|
||||
request_prompts[i]))
|
||||
sampling_params: Union[SamplingParams, BeamSearchParams]
|
||||
|
||||
if self.default_sampling_params is None:
|
||||
self.default_sampling_params = {}
|
||||
@ -286,7 +285,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
input_length=len(engine_prompt["prompt_token_ids"]),
|
||||
default_sampling_params=self.default_sampling_params)
|
||||
|
||||
sampling_params: Union[SamplingParams, BeamSearchParams]
|
||||
if request.use_beam_search:
|
||||
sampling_params = request.to_beam_search_params(
|
||||
max_tokens, self.default_sampling_params)
|
||||
@ -311,25 +309,13 @@ class OpenAIServingChat(OpenAIServing):
|
||||
lora_request=lora_request,
|
||||
)
|
||||
else:
|
||||
engine_request, tokenization_kwargs = (
|
||||
await self._process_inputs(
|
||||
request_id,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
))
|
||||
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
request_id,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
prompt_text=prompt_text,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
generators.append(generator)
|
||||
@ -691,13 +677,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
if self.use_harmony:
|
||||
harmony_parser = harmony_parsers[i]
|
||||
prev_recipient = harmony_parser.current_recipient
|
||||
delta_text = ""
|
||||
for token_id in output.token_ids:
|
||||
harmony_parser.process(token_id)
|
||||
delta_text += (harmony_parser.last_content_delta
|
||||
or "")
|
||||
cur_channel = harmony_parser.current_channel
|
||||
cur_recipient = harmony_parser.current_recipient
|
||||
delta_text = harmony_parser.last_content_delta or ""
|
||||
else:
|
||||
delta_text = output.text
|
||||
|
||||
@ -1591,9 +1575,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
sys_msg = get_system_message(
|
||||
reasoning_effort=request.reasoning_effort,
|
||||
browser_description=None,
|
||||
python_description=None,
|
||||
with_custom_tools=request.tools is not None
|
||||
)
|
||||
python_description=None)
|
||||
messages.append(sys_msg)
|
||||
|
||||
# Add developer message.
|
||||
|
@ -9,6 +9,7 @@ from typing import Optional, Union, cast
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
@ -31,7 +32,8 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.renderer import RenderConfig
|
||||
from vllm.entrypoints.utils import get_max_tokens
|
||||
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
|
||||
from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
|
||||
is_tokens_prompt)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.outputs import RequestOutput
|
||||
@ -155,16 +157,23 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
generators: list[AsyncGenerator[RequestOutput, None]] = []
|
||||
try:
|
||||
for i, engine_prompt in enumerate(engine_prompts):
|
||||
prompt_text, prompt_token_ids, prompt_embeds = (
|
||||
self._get_prompt_components(engine_prompt))
|
||||
|
||||
input_length = None
|
||||
if prompt_token_ids is not None:
|
||||
input_length = len(prompt_token_ids)
|
||||
elif prompt_embeds is not None:
|
||||
input_length = len(prompt_embeds)
|
||||
sampling_params: Union[SamplingParams, BeamSearchParams]
|
||||
# Mypy does not infer that engine_prompt will have only one of
|
||||
# "prompt_token_ids" or "prompt_embeds" defined, and both of
|
||||
# these as Union[object, the expected type], where it infers
|
||||
# object if engine_prompt is a subclass of one of the
|
||||
# typeddicts that defines both keys. Worse, because of
|
||||
# https://github.com/python/mypy/issues/8586, mypy does not
|
||||
# infer the type of engine_prompt correctly because of the
|
||||
# enumerate. So we need an unnecessary cast here.
|
||||
engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt],
|
||||
engine_prompt)
|
||||
if is_embeds_prompt(engine_prompt):
|
||||
input_length = len(engine_prompt["prompt_embeds"])
|
||||
elif is_tokens_prompt(engine_prompt):
|
||||
input_length = len(engine_prompt["prompt_token_ids"])
|
||||
else:
|
||||
raise NotImplementedError
|
||||
assert_never(engine_prompt)
|
||||
|
||||
if self.default_sampling_params is None:
|
||||
self.default_sampling_params = {}
|
||||
@ -176,7 +185,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
default_sampling_params=self.default_sampling_params,
|
||||
)
|
||||
|
||||
sampling_params: Union[SamplingParams, BeamSearchParams]
|
||||
if request.use_beam_search:
|
||||
sampling_params = request.to_beam_search_params(
|
||||
max_tokens, self.default_sampling_params)
|
||||
@ -212,25 +220,13 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
lora_request=lora_request,
|
||||
)
|
||||
else:
|
||||
engine_request, tokenization_kwargs = (
|
||||
await self._process_inputs(
|
||||
request_id_item,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
))
|
||||
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
request_id_item,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=request.priority,
|
||||
prompt_text=prompt_text,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
generators.append(generator)
|
||||
|
@ -15,11 +15,6 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
from starlette.datastructures import Headers
|
||||
from typing_extensions import TypeIs
|
||||
|
||||
from vllm.entrypoints.utils import _validate_truncation_size
|
||||
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.processor import Processor
|
||||
|
||||
if sys.version_info >= (3, 12):
|
||||
from typing import TypedDict
|
||||
else:
|
||||
@ -68,7 +63,6 @@ from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer,
|
||||
# yapf: enable
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
|
||||
from vllm.inputs.parse import PromptComponents, get_prompt_components
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob, PromptLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
@ -245,16 +239,6 @@ class OpenAIServing:
|
||||
AsyncMicrobatchTokenizer] = {}
|
||||
self.log_error_stack = log_error_stack
|
||||
|
||||
async def _get_processor(self) -> Processor:
|
||||
if not hasattr(self, "_processor"):
|
||||
vllm_config = await self.engine_client.get_vllm_config()
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = init_tokenizer_from_configs(self.model_config)
|
||||
self._processor = Processor(vllm_config, tokenizer)
|
||||
return self._processor
|
||||
|
||||
def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
|
||||
"""
|
||||
Get a Renderer instance with the provided tokenizer.
|
||||
@ -866,34 +850,6 @@ class OpenAIServing:
|
||||
|
||||
return conversation, [request_prompt], [engine_prompt]
|
||||
|
||||
async def _process_inputs(
|
||||
self,
|
||||
request_id: str,
|
||||
engine_prompt: PromptType,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
*,
|
||||
lora_request: Optional[LoRARequest],
|
||||
trace_headers: Optional[Mapping[str, str]],
|
||||
priority: int,
|
||||
) -> tuple[EngineCoreRequest, dict[str, Any]]:
|
||||
"""Use the Processor to process inputs for AsyncLLM."""
|
||||
tokenization_kwargs: dict[str, Any] = {}
|
||||
_validate_truncation_size(self.max_model_len,
|
||||
params.truncate_prompt_tokens,
|
||||
tokenization_kwargs)
|
||||
|
||||
processor = await self._get_processor()
|
||||
engine_request = processor.process_inputs(
|
||||
request_id,
|
||||
engine_prompt,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
trace_headers=trace_headers,
|
||||
priority=priority,
|
||||
)
|
||||
return engine_request, tokenization_kwargs
|
||||
|
||||
async def _generate_with_builtin_tools(
|
||||
self,
|
||||
request_id: str,
|
||||
@ -905,7 +861,6 @@ class OpenAIServing:
|
||||
priority: int = 0,
|
||||
**kwargs,
|
||||
):
|
||||
prompt_text, _, _ = self._get_prompt_components(request_prompt)
|
||||
orig_priority = priority
|
||||
while True:
|
||||
self._log_inputs(
|
||||
@ -914,27 +869,14 @@ class OpenAIServing:
|
||||
params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
trace_headers = kwargs.get("trace_headers")
|
||||
engine_request, tokenization_kwargs = (await self._process_inputs(
|
||||
request_id,
|
||||
generator = self.engine_client.generate(
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
priority=priority,
|
||||
))
|
||||
|
||||
generator = self.engine_client.generate(
|
||||
engine_request,
|
||||
sampling_params,
|
||||
request_id,
|
||||
lora_request=lora_request,
|
||||
priority=priority,
|
||||
prompt_text=prompt_text,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async for res in generator:
|
||||
context.append_output(res)
|
||||
# NOTE(woosuk): The stop condition is handled by the engine.
|
||||
@ -963,15 +905,6 @@ class OpenAIServing:
|
||||
# OPTIMIZATION
|
||||
priority = orig_priority - 1
|
||||
|
||||
def _get_prompt_components(
|
||||
self,
|
||||
prompt: Union[RequestPrompt, PromptType],
|
||||
) -> PromptComponents:
|
||||
if isinstance(prompt, list):
|
||||
return PromptComponents(token_ids=prompt)
|
||||
|
||||
return get_prompt_components(prompt) # type: ignore[arg-type]
|
||||
|
||||
def _log_inputs(
|
||||
self,
|
||||
request_id: str,
|
||||
@ -982,9 +915,14 @@ class OpenAIServing:
|
||||
) -> None:
|
||||
if self.request_logger is None:
|
||||
return
|
||||
|
||||
prompt, prompt_token_ids, prompt_embeds = (
|
||||
self._get_prompt_components(inputs))
|
||||
prompt, prompt_token_ids, prompt_embeds = None, None, None
|
||||
if isinstance(inputs, str):
|
||||
prompt = inputs
|
||||
elif isinstance(inputs, list):
|
||||
prompt_token_ids = inputs
|
||||
else:
|
||||
prompt = getattr(inputs, 'prompt', None)
|
||||
prompt_token_ids = getattr(inputs, 'prompt_token_ids', None)
|
||||
|
||||
self.request_logger.log_inputs(
|
||||
request_id,
|
||||
|
@@ -445,19 +445,6 @@ class OpenAIServingResponses(OpenAIServing):

        return messages, [prompt_token_ids], [engine_prompt]

    async def _initialize_tool_sessions(self, request: ResponsesRequest,
                                        context: ConversationContext,
                                        exit_stack: AsyncExitStack):
        # we should only initialize the tool session if the request needs tools
        if len(request.tools) == 0:
            return
        mcp_tools = {
            tool.server_label: tool
            for tool in request.tools if tool.type == "mcp"
        }
        await context.init_tool_sessions(self.tool_server, exit_stack,
                                         request.request_id, mcp_tools)

    async def responses_full_generator(
        self,
        request: ResponsesRequest,
@@ -474,8 +461,12 @@ class OpenAIServingResponses(OpenAIServing):

        async with AsyncExitStack() as exit_stack:
            try:
                await self._initialize_tool_sessions(request, context,
                                                     exit_stack)
                mcp_tools = {
                    tool.server_label: tool
                    for tool in request.tools if tool.type == "mcp"
                }
                await context.init_tool_sessions(self.tool_server, exit_stack,
                                                 request.request_id, mcp_tools)
                async for _ in result_generator:
                    pass
            except asyncio.CancelledError:
@@ -1659,10 +1650,12 @@ class OpenAIServingResponses(OpenAIServing):
        async with AsyncExitStack() as exit_stack:
            processer = None
            if self.use_harmony:
                # TODO: in streaming, we noticed this bug:
                # https://github.com/vllm-project/vllm/issues/25697
                await self._initialize_tool_sessions(request, context,
                                                     exit_stack)
                mcp_tools = {
                    tool.server_label: tool
                    for tool in request.tools if tool.type == "mcp"
                }
                await context.init_tool_sessions(self.tool_server, exit_stack,
                                                 request.request_id, mcp_tools)
                processer = self._process_harmony_streaming_events
            else:
                processer = self._process_simple_streaming_events
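A compact sketch of the tool-session setup pattern visible in both OpenAIServingResponses hunks above: sessions are only initialized when the request carries tools, and MCP tools are keyed by server_label inside an AsyncExitStack. The Tool dataclass and the function name below are stand-ins, not vLLM classes.

import asyncio
from contextlib import AsyncExitStack
from dataclasses import dataclass

@dataclass
class Tool:
    type: str
    server_label: str

async def init_tool_sessions(tools: list[Tool]) -> None:
    # Skip all setup when the request has no tools, as in the diff above.
    if len(tools) == 0:
        return
    mcp_tools = {t.server_label: t for t in tools if t.type == "mcp"}
    async with AsyncExitStack() as exit_stack:
        # Real code enters one async context per tool session on exit_stack;
        # here we only show which MCP tools would be initialized.
        print(sorted(mcp_tools))

asyncio.run(init_tool_sessions([Tool("mcp", "search"), Tool("code", "python")]))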
vllm/envs.py

@@ -68,7 +68,6 @@ if TYPE_CHECKING:
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_VIDEO_FETCH_TIMEOUT: int = 30
    VLLM_AUDIO_FETCH_TIMEOUT: int = 10
    VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
    VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
    VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
    VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
@@ -619,9 +618,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # All possible options loaded dynamically from _Backend enum
    "VLLM_ATTENTION_BACKEND":
    env_with_choices("VLLM_ATTENTION_BACKEND", None,
                     lambda: list(__import__(
                         'vllm.attention.backends.registry',
                         fromlist=['_Backend'])._Backend.__members__.keys())),
                     lambda: list(__import__('vllm.platforms.interface', \
                         fromlist=['_Backend'])._Backend.__members__.keys())),

    # If set, vllm will use flashinfer sampler
    "VLLM_USE_FLASHINFER_SAMPLER":
@@ -735,11 +733,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_AUDIO_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),

    # Whether to allow HTTP redirects when fetching from media URLs.
    # Default to True
    "VLLM_MEDIA_URL_ALLOW_REDIRECTS":
    lambda: bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1"))),

    # Max number of workers for the thread pool handling
    # media bytes loading. Set to 1 to disable parallel processing.
    # Default is 8
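As a side note, the boolean environment variables in vllm/envs.py above use the bool(int(os.getenv(...))) idiom, so only "0"/"1"-style values are expected. A minimal sketch of that pattern; the wrapper function is illustrative, not part of vLLM.

import os

def media_url_allow_redirects() -> bool:
    # "1" (default) -> True, "0" -> False; other values raise ValueError.
    return bool(int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1")))

os.environ["VLLM_MEDIA_URL_ALLOW_REDIRECTS"] = "0"
print(media_url_allow_redirects())  # False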
@@ -205,6 +205,11 @@ class TokenInputs(TypedDict):
    prompt_token_ids: list[int]
    """The token IDs of the prompt."""

    prompt: NotRequired[str]
    """
    The original prompt text corresponding to the token IDs, if available.
    """

    cache_salt: NotRequired[str]
    """
    Optional cache salt to be used for prefix caching.
@@ -213,12 +218,15 @@ class TokenInputs(TypedDict):

def token_inputs(
    prompt_token_ids: list[int],
    prompt: Optional[str] = None,
    cache_salt: Optional[str] = None,
) -> TokenInputs:
    """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
    values."""
    inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)

    if prompt is not None:
        inputs["prompt"] = prompt
    if cache_salt is not None:
        inputs["cache_salt"] = cache_salt
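A self-contained sketch of the token_inputs constructor shown in the hunk above, with the TypedDict trimmed to the fields that appear in the diff; optional keys are only set when a value is provided.

from typing import Literal, Optional
from typing_extensions import NotRequired, TypedDict

class TokenInputs(TypedDict):
    type: Literal["token"]
    prompt_token_ids: list[int]
    prompt: NotRequired[str]
    cache_salt: NotRequired[str]

def token_inputs(
    prompt_token_ids: list[int],
    prompt: Optional[str] = None,
    cache_salt: Optional[str] = None,
) -> TokenInputs:
    inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
    if prompt is not None:
        inputs["prompt"] = prompt
    if cache_salt is not None:
        inputs["cache_salt"] = cache_salt
    return inputs

# The prompt text travels with the token IDs only when it is known.
print(token_inputs([1, 2, 3], prompt="Hello"))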
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import (TYPE_CHECKING, Literal, NamedTuple, Optional, TypedDict,
                    Union, cast, overload)
from typing import Literal, Optional, TypedDict, Union, cast, overload

from typing_extensions import TypeIs

@@ -12,9 +11,6 @@ from .data import (EmbedsPrompt, ExplicitEncoderDecoderPrompt, ProcessorInputs,
                   PromptType, SingletonInputs, SingletonPrompt, TextPrompt,
                   TokensPrompt)

if TYPE_CHECKING:
    import torch


class ParsedText(TypedDict):
    content: str
@@ -153,23 +149,3 @@ def split_enc_dec_inputs(
        )

    return None, inputs


class PromptComponents(NamedTuple):
    text: Optional[str] = None
    token_ids: Optional[list[int]] = None
    embeds: Optional["torch.Tensor"] = None


def get_prompt_components(prompt: PromptType) -> PromptComponents:
    if isinstance(prompt, str):
        return PromptComponents(text=prompt)

    if (encoder_prompt := prompt.get("encoder_prompt")):
        return get_prompt_components(encoder_prompt)  # type: ignore[arg-type]

    return PromptComponents(
        text=prompt.get("prompt"),  # type: ignore[arg-type]
        token_ids=prompt.get("prompt_token_ids"),  # type: ignore[arg-type]
        embeds=prompt.get("prompt_embeds"),
    )
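A simplified stand-in for the PromptComponents helper in the hunk above, with the torch tensor field omitted so the sketch has no dependencies; callers unpack only the pieces they need, as _get_prompt_components does earlier in this diff.

from typing import NamedTuple, Optional, Union

class PromptComponents(NamedTuple):
    text: Optional[str] = None
    token_ids: Optional[list[int]] = None

def get_prompt_components(prompt: Union[str, dict]) -> PromptComponents:
    if isinstance(prompt, str):
        return PromptComponents(text=prompt)
    # Encoder-decoder prompts delegate to their encoder part, as in the diff.
    if (encoder_prompt := prompt.get("encoder_prompt")) is not None:
        return get_prompt_components(encoder_prompt)
    return PromptComponents(
        text=prompt.get("prompt"),
        token_ids=prompt.get("prompt_token_ids"),
    )

text, _ = get_prompt_components({"prompt": "Hi", "prompt_token_ids": [1, 2]})
print(text)  # Hi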
@@ -16,10 +16,9 @@ from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
                   EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
                   ProcessorInputs, PromptType, SingletonInputs,
                   SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
                   embeds_inputs, token_inputs)
                   EncoderDecoderInputs, ProcessorInputs, PromptType,
                   SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
                   TokensPrompt, embeds_inputs, token_inputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt

logger = init_logger(__name__)
@@ -323,7 +322,7 @@ class InputPreprocessor:
                mm_uuids=mm_uuids,
            )
        else:
            inputs = token_inputs(prompt_token_ids)
            inputs = token_inputs(prompt_token_ids=prompt_token_ids)

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt
@@ -353,7 +352,10 @@ class InputPreprocessor:
                prompt_text,
                tokenization_kwargs=tokenization_kwargs,
            )
            inputs = token_inputs(prompt_token_ids)
            inputs = token_inputs(
                prompt=prompt_text,
                prompt_token_ids=prompt_token_ids,
            )

        if cache_salt := parsed_content.get("cache_salt"):
            inputs["cache_salt"] = cache_salt
@@ -471,17 +473,22 @@ class InputPreprocessor:
        decoder_inputs: SingletonInputs

        if inputs["type"] == "multimodal":  # Multimodal data inputs
            if "encoder_prompt_token_ids" not in inputs:
            if not ("encoder_prompt" in inputs
                    and "encoder_prompt_token_ids" in inputs):
                raise RuntimeError("You should register an encoder-decoder "
                                   "multi-modal processor for encoder-decoder "
                                   "models.")
            inputs = cast(MultiModalEncDecInputs, inputs)

            encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])
            encoder_inputs = token_inputs(
                prompt=inputs["encoder_prompt"],
                prompt_token_ids=inputs["encoder_prompt_token_ids"],
            )

            decoder_prompt_inputs = decoder_inputs_to_override or inputs
            decoder_inputs = MultiModalInputs(
                type="multimodal",
                prompt=decoder_prompt_inputs.get("prompt", ""),
                prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
                mm_kwargs=inputs["mm_kwargs"],
                mm_hashes=inputs["mm_hashes"],
@@ -491,7 +498,7 @@ class InputPreprocessor:
                decoder_inputs["cache_salt"] = cache_salt

        elif inputs["type"] == "token":  # Text-only inputs
            encoder_inputs = token_inputs(prompt_token_ids=[])
            encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
            decoder_inputs = decoder_inputs_to_override or inputs
        else:
            assert_never(inputs)  # type: ignore[arg-type]
@@ -542,14 +549,12 @@ class InputPreprocessor:
        decoder_inputs: Optional[SingletonInputs]

        if is_explicit_encoder_decoder_prompt(prompt):
            # `cast` is needed for mypy, but not pyright
            prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
            encoder_inputs = self._prompt_to_llm_inputs(
                prompt_["encoder_prompt"],
                prompt["encoder_prompt"],
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
            if (decoder_input := prompt_["decoder_prompt"]) is None:
            if (decoder_input := prompt["decoder_prompt"]) is None:
                decoder_inputs = None
            else:
                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
@@ -560,9 +565,8 @@ class InputPreprocessor:
                self._split_enc_dec_mm_inputs(encoder_inputs,
                                              decoder_inputs))
        else:
            # `cast` is needed for mypy, but not pyright
            inputs = self._prompt_to_llm_inputs(
                cast(SingletonPrompt, prompt),
                prompt,
                tokenization_kwargs=tokenization_kwargs,
                mm_uuids=mm_uuids,
            )
@@ -637,9 +641,8 @@ class InputPreprocessor:
                                 "to decoder-only models")

        # Decoder-only operation
        # `cast` is needed for mypy, but not pyright
        return self._process_decoder_only_prompt(
            cast(SingletonPrompt, prompt),
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )
@@ -355,7 +355,7 @@ def batched_triton_kernel(


def invoke_moe_batched_triton_kernel(
        A: torch.Tensor,  # [E, max_tokens, K]
        B: torch.Tensor,  # [E, N, K]
        B: torch.Tensor,  # [E, K, N]
        C: torch.Tensor,  # [E, max_tokens, N]
        expert_num_tokens: torch.Tensor,  # [E]
        compute_type: tl.dtype,
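The only change in this hunk is the comment describing B's layout ([E, N, K] vs [E, K, N]). As a shape sanity check, here is a sketch of the batched contraction those comments imply, assuming C[e] = A[e] @ B[e] with B stored as [E, K, N] (transpose it when the [E, N, K] layout is used).

import torch

E, max_tokens, K, N = 4, 8, 16, 32
A = torch.randn(E, max_tokens, K)   # [E, max_tokens, K]
B = torch.randn(E, K, N)            # [E, K, N]
C = torch.bmm(A, B)                 # -> [E, max_tokens, N]
print(C.shape)                      # torch.Size([4, 8, 32])

# With the alternative [E, N, K] layout, the same result would be
# torch.bmm(A, B_nk.transpose(1, 2)).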
@@ -4,6 +4,9 @@
import functools
import json
import os
# torch.compile needs typing.List. It will fail torch.library.infer_schema
# otherwise
from typing import List  # noqa: UP035
from typing import Any, Callable, Optional, Union

import torch
@@ -1226,7 +1229,7 @@ def inplace_fused_experts(
        w2_zp: Optional[torch.Tensor] = None,
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        block_shape: Optional[list[int]] = None,
        block_shape: Optional[List[int]] = None,  #noqa: UP006
        w1_bias: Optional[torch.Tensor] = None,
        w2_bias: Optional[torch.Tensor] = None,
) -> None:
@@ -1260,7 +1263,7 @@ def inplace_fused_experts_fake(
        w2_zp: Optional[torch.Tensor] = None,
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        block_shape: Optional[list[int]] = None,
        block_shape: Optional[List[int]] = None,  #noqa: UP006
        w1_bias: Optional[torch.Tensor] = None,
        w2_bias: Optional[torch.Tensor] = None,
) -> None:
@@ -1299,7 +1302,7 @@ def outplace_fused_experts(
        w2_zp: Optional[torch.Tensor] = None,
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        block_shape: Optional[list[int]] = None,
        block_shape: Optional[List[int]] = None,  #noqa: UP006
        w1_bias: Optional[torch.Tensor] = None,
        w2_bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
@@ -960,7 +960,6 @@ class FusedMoE(CustomOp):
        is_sequence_parallel=False,
        zero_expert_num: Optional[int] = 0,
        zero_expert_type: Optional[str] = None,
        expert_mapping: Optional[list[tuple[str, str, int, str]]] = None,
    ):
        super().__init__()
        if params_dtype is None:
@@ -997,9 +996,6 @@ class FusedMoE(CustomOp):
        self.zero_expert_num = zero_expert_num
        self.zero_expert_type = zero_expert_type

        # Expert mapping used in self.load_weights
        self.expert_mapping = expert_mapping

        # Round up hidden size if needed.
        hidden_size = maybe_roundup_hidden_size(hidden_size, moe_in_dtype,
                                                quant_config,
@@ -1621,33 +1617,6 @@ class FusedMoE(CustomOp):

        return False if return_success else None

    def load_weights(
            self, weights: Iterable[tuple[str,
                                          torch.Tensor]]) -> Iterable[str]:
        if (expert_mapping := self.expert_mapping) is None:
            raise ValueError("`self.expert_mapping` must be provided to "
                             "load weights using `self.load_weights`.")
        for expert_name, loaded_weight in weights:
            qual_name = f"{self.layer_name}.{expert_name}"
            for param_name, weight_name, expert_id, shard_id in expert_mapping:
                if weight_name not in qual_name:
                    continue
                weight_name = qual_name.replace(weight_name, param_name)
                param_name = weight_name.removeprefix(f"{self.layer_name}.")
                param = getattr(self, param_name)
                success = self.weight_loader(
                    param=param,
                    loaded_weight=loaded_weight,
                    weight_name=weight_name,
                    shard_id=shard_id,
                    expert_id=expert_id,
                    return_success=True,
                )
                if success:
                    logger.debug("Loaded %s for expert %d into %s", param_name,
                                 expert_id, self.layer_name)
                    yield param_name

    def get_expert_weights(self) -> Iterable[torch.Tensor]:
        weights = list(self.named_parameters())
        assert all(weight.is_contiguous() for _, weight in weights)
@@ -1930,15 +1899,6 @@ class FusedMoE(CustomOp):
            staged_hidden_states.copy_(hidden_states, non_blocking=True)
            staged_router_logits.copy_(router_logits, non_blocking=True)

        # If there are shared experts but we are not using a modular kernel,
        # the shared experts must be called here
        if (not isinstance(self.quant_method.fused_experts,
                           FusedMoEModularKernel)
                and self.shared_experts is not None):
            shared_output = self.shared_experts(staged_hidden_states)
        else:
            shared_output = None

        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
@@ -1962,13 +1922,8 @@ class FusedMoE(CustomOp):
            logical_replica_count=self.logical_replica_count,
        )

        if shared_output is not None:
            assert not isinstance(final_hidden_states, tuple)
            assert self.shared_experts is not None
            final_hidden_states = (
                shared_output,
                final_hidden_states,
            )
        assert self.shared_experts is None or isinstance(
            final_hidden_states, tuple)

        if self.zero_expert_num is not None and self.zero_expert_num > 0:
            assert isinstance(final_hidden_states, tuple)
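A minimal sketch of how the removed FusedMoE.load_weights consumes expert_mapping entries: each entry is a (param_name, weight_name, expert_id, shard_id) tuple, and a checkpoint key is matched by substring and rewritten before the weight loader is called. The mapping values, layer name, and the matcher below are hypothetical, used only to illustrate the rewrite.

# Hypothetical mapping: checkpoint keys containing "experts.0.w1" load into
# the fused parameter "w13_weight" as shard "w1" of expert 0.
expert_mapping = [
    ("w13_weight", "experts.0.w1", 0, "w1"),
    ("w13_weight", "experts.0.w3", 0, "w3"),
    ("w2_weight", "experts.0.w2", 0, "w2"),
]

layer_name = "model.layers.0.mlp"

def match(expert_name: str):
    qual_name = f"{layer_name}.{expert_name}"
    for param_name, weight_name, expert_id, shard_id in expert_mapping:
        if weight_name not in qual_name:
            continue
        # Rewrite the checkpoint key to the fused parameter name, then strip
        # the layer prefix, as the removed load_weights does.
        rewritten = qual_name.replace(weight_name, param_name)
        return rewritten.removeprefix(f"{layer_name}."), expert_id, shard_id
    return None

print(match("experts.0.w1.weight"))  # ('w13_weight.weight', 0, 'w1')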
@@ -115,7 +115,7 @@ class ShortConv(MambaBase, CustomOp):
        self_kv_cache = self.kv_cache[forward_context.virtual_engine]
        conv_state = self_kv_cache[0].transpose(-1, -2)
        state_indices_tensor = attn_metadata.state_indices_tensor
        has_initial_states_p = attn_metadata.has_initial_states_p
        has_initial_states_p = attn_metadata.has_initial_states

        BCx, _ = self.in_proj(hidden_states)
@@ -644,14 +644,6 @@ class CompressedTensorsConfig(QuantizationConfig):
        # If no matches, return None
        return None

    def has_blocked_weights(self) -> bool:
        for scheme in self.target_scheme_map.values():
            weight_quant = scheme.get("weights")
            if (weight_quant is not None
                    and weight_quant.strategy == QuantizationStrategy.BLOCK):
                return True
        return False

    @staticmethod
    def supports_cutlass_24(
        weight_quant: Optional[QuantizationArgs],
@@ -11,7 +11,7 @@ from torch.nn import Parameter
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
    CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
    apply_fp8_block_linear, check_aiter_fp8_linear_support,
    create_fp8_input_scale, create_fp8_scale_parameter,
    create_fp8_weight_parameter, maybe_post_process_fp8_weight_block,
    process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy,
@@ -41,30 +41,16 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
        self.strategy = weight_quant.strategy
        self.out_dtype = torch.get_default_dtype()
        self.is_static_input_scheme = is_static_input_scheme
        self.act_q_group_shape = GroupShape.PER_TENSOR \
            if is_static_input_scheme else GroupShape.PER_TOKEN
        self.fp8_linear = Fp8LinearOp(
            act_quant_static=self.is_static_input_scheme,
            act_quant_group_shape=self.act_q_group_shape)

        self.weight_block_size = self.weight_quant.block_structure
        if self.weight_block_size is not None:
            self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
        else:
            self.act_q_group_shape = GroupShape.PER_TENSOR \
                if is_static_input_scheme else GroupShape.PER_TOKEN

        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()

        if self.weight_block_size is not None:
            assert not self.is_static_input_scheme
            self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
                weight_group_shape=GroupShape(*self.weight_block_size),
                act_quant_group_shape=self.act_q_group_shape,
                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                use_aiter_and_is_supported=self.use_aiter_and_is_supported,
            )
        else:
            self.fp8_linear = Fp8LinearOp(
                act_quant_static=self.is_static_input_scheme,
                act_quant_group_shape=self.act_q_group_shape)

    @classmethod
    def get_min_capability(cls) -> int:
        # lovelace and up
@@ -156,14 +142,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
                x: torch.Tensor,
                bias: Optional[torch.Tensor] = None) -> torch.Tensor:

        if self.weight_block_size is not None:
            return self.w8a8_block_fp8_linear.apply(
        if layer.weight_block_size is not None:
            return apply_fp8_block_linear(
                layer,
                input=x,
                weight=layer.weight,
                weight_scale=layer.weight_scale,
                input_scale=layer.input_scale,
                bias=bias,
            )
                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                use_aiter_and_is_supported=self.use_aiter_and_is_supported)

        return self.fp8_linear.apply(input=x,
                                     weight=layer.weight,
@@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
    select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    W8A8BlockFp8LinearOp, check_aiter_fp8_linear_support,
    apply_fp8_block_linear, check_aiter_fp8_linear_support,
    create_fp8_input_scale, create_fp8_scale_parameter,
    create_fp8_weight_parameter, expert_weight_is_col_major,
    maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
@@ -242,28 +242,15 @@ class Fp8LinearMethod(LinearMethodBase):
        self.weight_block_size = self.quant_config.weight_block_size
        self.block_quant = self.weight_block_size is not None
        self.act_q_static = self.quant_config.activation_scheme == "static"
        if self.weight_block_size:
            self.act_q_group_shape = GroupShape(1, self.weight_block_size[0])
        # Use per-token quantization for better perf if dynamic and cutlass
        if not self.act_q_static and cutlass_fp8_supported():
            self.act_q_group_shape = GroupShape.PER_TOKEN
        else:
            # Use per-token quantization for better perf if dynamic and cutlass
            if not self.act_q_static and cutlass_fp8_supported():
                self.act_q_group_shape = GroupShape.PER_TOKEN
            else:
                self.act_q_group_shape = GroupShape.PER_TENSOR
            self.act_q_group_shape = GroupShape.PER_TENSOR

        if self.block_quant:
            assert not self.act_q_static
            assert self.weight_block_size is not None
            self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
                weight_group_shape=GroupShape(*self.weight_block_size),
                act_quant_group_shape=self.act_q_group_shape,
                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                use_aiter_and_is_supported=self.use_aiter_and_is_supported,
            )
        else:
            self.fp8_linear = Fp8LinearOp(
                act_quant_static=self.act_q_static,
                act_quant_group_shape=self.act_q_group_shape)
        self.fp8_linear = Fp8LinearOp(
            act_quant_static=self.act_q_static,
            act_quant_group_shape=self.act_q_group_shape)

    def create_weights(
        self,
@@ -412,15 +399,12 @@ class Fp8LinearMethod(LinearMethodBase):
                                  bias=bias)

        if self.block_quant:
            assert self.weight_block_size is not None

            return self.w8a8_block_fp8_linear.apply(
            return apply_fp8_block_linear(
                layer,
                input=x,
                weight=layer.weight,
                weight_scale=layer.weight_scale,
                input_scale=layer.input_scale,
                bias=bias,
            )
                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                use_aiter_and_is_supported=self.use_aiter_and_is_supported)

        return self.fp8_linear.apply(input=x,
                                     weight=layer.weight,
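Both FP8 hunks above dispatch on whether the weights are block quantized: block-quantized weights go through the block-wise linear path, everything else through the plain Fp8LinearOp path. A schematic (non-vLLM) sketch of that dispatch, with the two backends reduced to placeholders:

from typing import Optional

class SchematicFp8Linear:
    def __init__(self, weight_block_size: Optional[list[int]]):
        self.weight_block_size = weight_block_size
        self.block_quant = weight_block_size is not None

    def apply(self, x):
        if self.block_quant:
            # Placeholder for the block-wise FP8 GEMM
            # (W8A8BlockFp8LinearOp / apply_fp8_block_linear in the diff).
            return f"block fp8 gemm, block size {self.weight_block_size}"
        # Placeholder for the per-tensor / per-token Fp8LinearOp path.
        return "plain fp8 gemm"

print(SchematicFp8Linear([128, 128]).apply(x=None))
print(SchematicFp8Linear(None).apply(x=None))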
Some files were not shown because too many files have changed in this diff.