Compare commits
34 Commits: codex/remo... → v0.11.0

Author | SHA1 | Date
---|---|---
 | b8b302cde4 |
 | f71952c1c4 |
 | d1007767c5 |
 | c75c2e70d6 |
 | 9d9a2b77f1 |
 | 6040e0b6c0 |
 | 05bf0c52a1 |
 | c536881a7c |
 | ebce361c07 |
 | e4beabd2c8 |
 | febb688356 |
 | a1825fe645 |
 | bab9231bf1 |
 | c214d699fd |
 | c3dfb0f6dd |
 | 83f3c9beae |
 | d0b178cef1 |
 | b3230e1ac0 |
 | 03df0fb5d2 |
 | 9471879bd4 |
 | ab5b6459df |
 | 8ce5d3198d |
 | 09c2cbc04a |
 | 4c347044c9 |
 | 19e7ab7315 |
 | 6de3d431d9 |
 | b14773bd64 |
 | 26a7a33b88 |
 | 5aa5811a16 |
 | c2fa2d4dc9 |
 | 32335c8b34 |
 | 04c2b26972 |
 | ee10d7e6ff |
 | bb79c4da2f |
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
     )

     # get markdown tables
@@ -181,14 +181,18 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="vllm serve $model \
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
       -tp $tp \
+      --model $model \
       --port $port \
       $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="vllm serve $model \
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
       -tp $tp \
+      --model $model \
       --port $port \
       $server_args"
   fi

@@ -365,7 +365,8 @@ run_serving_tests() {
       continue
     fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs python3 \
+      -m vllm.entrypoints.openai.api_server \
       $server_args"

     # run the server
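For orientation, the two invocations swapped throughout these hunks are meant to start the same OpenAI-compatible server; a minimal sketch, with model, parallelism, and port taken as illustrative placeholders from elsewhere in this diff:

```bash
# CLI entry point used on the left-hand side of the diff
vllm serve meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 2 --port 8000

# Module-style invocation used on the right-hand side; equivalent in effect
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-chat-hf \
    --tensor-parallel-size 2 \
    --port 8000
```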
.buildkite/pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.format]
+docstring-code-format = true
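As a usage sketch only (the target directory is an assumption based on where this file lives), the local config can be exercised directly through ruff's --config flag:

```bash
# Lint and format the .buildkite scripts against this 88-column local config
ruff check --config .buildkite/pyproject.toml .buildkite/
ruff format --config .buildkite/pyproject.toml .buildkite/
```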
@@ -48,7 +48,7 @@ steps:
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"

@@ -150,16 +150,11 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:

@@ -168,4 +163,3 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
@@ -8,41 +8,20 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub credentials from environment
+# Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
   echo "Error: DOCKERHUB_TOKEN environment variable is not set"
   exit 1
 fi

-if [ -z "$DOCKERHUB_USERNAME" ]; then
-  echo "Error: DOCKERHUB_USERNAME environment variable is not set"
-  exit 1
-fi
-
-# Get DockerHub bearer token
-echo "Getting DockerHub bearer token..."
-set +x
-BEARER_TOKEN=$(curl -s -X POST \
-  -H "Content-Type: application/json" \
-  -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
-  "https://hub.docker.com/v2/users/login" | jq -r '.token')
-set -x
-
-if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
-  echo "Error: Failed to get DockerHub bearer token"
-  exit 1
-fi
-
 # Function to get all tags from DockerHub
 get_all_tags() {
   local page=1
   local all_tags=""

   while true; do
-    set +x
-    local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+    local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
       "$REPO_API_URL?page=$page&page_size=100")
-    set -x

     # Get both last_updated timestamp and tag name, separated by |
     local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')

@@ -64,9 +43,7 @@ delete_tag() {
   echo "Deleting tag: $tag_name"

   local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-  set +x
-  local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
-  set -x
+  local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")

   if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
     echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
@@ -1,191 +0,0 @@
-#!/bin/bash
-
-# This script build the Ascend NPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Base ubuntu image with basic ascend development libraries and python installed
-VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
-CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
-TEST_RUN_CONFIG_FILE="vllm_test.cfg"
-VLLM_ASCEND_TMP_DIR=
-# Get the test run configuration file from the vllm-ascend repository
-fetch_vllm_test_cfg() {
-    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
-    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
-    cleanup() {
-        rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    }
-    trap cleanup EXIT
-
-    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
-    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
-        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-
-    # If the file already exists locally, just overwrite it
-    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
-    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
-
-    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
-    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
-    rm -rf "${VLLM_ASCEND_TMP_DIR}"
-    trap - EXIT
-}
-
-# Downloads test run configuration file from a remote URL.
-# Loads the configuration into the current script environment.
-get_config() {
-    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
-        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
-        exit 1
-    fi
-    source "${TEST_RUN_CONFIG_FILE}"
-    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
-    return 0
-}
-
-# get test running configuration.
-fetch_vllm_test_cfg
-get_config
-# Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
-container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
-agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
-echo "agent_idx: ${agent_idx}"
-builder_name="cachebuilder${agent_idx}"
-builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
-
-# Try building the docker image
-cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-    --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
-FROM ${BASE_IMAGE_NAME}
-
-# Define environments
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
-    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
-    apt-get update -y && \
-    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
-    rm -rf /var/cache/apt/* && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install for pytest to make the docker build cache layer always valid
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install pytest>=6.0 modelscope
-
-WORKDIR /workspace/vllm
-
-# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements/common.txt
-
-COPY . .
-
-# Install vLLM
-RUN --mount=type=cache,target=/root/.cache/pip \
-    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
-    python3 -m pip uninstall -y triton
-
-# Install vllm-ascend
-WORKDIR /workspace
-ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
-ARG VLLM_ASCEND_TAG=main
-RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
-    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
-
-# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r /workspace/vllm-ascend/requirements.txt
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
-    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-ENV VLLM_USE_MODELSCOPE=True
-
-WORKDIR /workspace/vllm-ascend
-
-CMD ["/bin/bash"]
-
-EOF
-
-# Setup cleanup
-remove_docker_container() {
-    docker rm -f "${container_name}" || true;
-    docker image rm -f "${image_name}" || true;
-    docker system prune -f || true;
-}
-trap remove_docker_container EXIT
-
-# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
-# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
-# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-# returns --device /dev/davinci0 --device /dev/davinci1
-parse_and_gen_devices() {
-    local input="$1"
-    local index cards_num
-    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
-        index="${BASH_REMATCH[1]}"
-        cards_num="${BASH_REMATCH[2]}"
-    else
-        echo "parse error" >&2
-        return 1
-    fi
-
-    local devices=""
-    local i=0
-    while (( i < cards_num )); do
-        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
-        ((i++))
-    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
-}
-
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
-
-# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
-# This test checks whether the OOT platform interface is functioning properly in conjunction with
-# the hardware plugin vllm-ascend.
-model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
-docker run \
-    ${devices} \
-    --device /dev/davinci_manager \
-    --device /dev/devmm_svm \
-    --device /dev/hisi_hdc \
-    -v /usr/local/dcmi:/usr/local/dcmi \
-    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
-    --entrypoint="" \
-    --name "${container_name}" \
-    "${image_name}" \
-    bash -c '
-        set -e
-        pytest -v -s tests/e2e/vllm_interface/
-    '
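The agent-name parsing in the removed script is self-contained enough to illustrate on its own; a minimal sketch, where the gen_devices helper and the sample agent name are for illustration only:

```bash
#!/bin/bash
# Expand "{hostname}-{agent_idx}-{npu_card_num}cards" into --device flags,
# mirroring parse_and_gen_devices above (agent_idx starts from 1).
gen_devices() {
    local input="$1" index cards_num devices="" i=0
    [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]] || { echo "parse error" >&2; return 1; }
    index="${BASH_REMATCH[1]}"
    cards_num="${BASH_REMATCH[2]}"
    while (( i < cards_num )); do
        devices="$devices --device /dev/davinci$(( (index - 1) * cards_num + i ))"
        i=$(( i + 1 ))
    done
    printf '%s\n' "${devices# }"   # trim the single leading space
}

gen_devices "atlas-a2-001-1-2cards"
# prints: --device /dev/davinci0 --device /dev/davinci1
```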
@@ -42,8 +42,9 @@ docker run \
   pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
   pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
   pytest -v -s v1/structured_output
-  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
   pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-  pytest -v -s v1/test_metrics
   pytest -v -s v1/test_serial_utils.py
+  pytest -v -s v1/test_utils.py
+  pytest -v -s v1/test_metrics_reader.py
 '
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-vllm serve meta-llama/Llama-2-7b-chat-hf &
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -50,28 +50,19 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/multimodal
-  - tests/utils_
-  commands:
-  - pytest -v -s -m 'not cpu_test' multimodal
-  - pytest -v -s utils_
-
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
+  - tests/utils_
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
-  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s transformers_utils
+  - pytest -v -s multimodal
+  - pytest -v -s utils_ # Utils
+  - pytest -v -s transformers_utils # transformers_utils

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20

@@ -168,7 +159,10 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/distributed
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/test_internal_lb_dp.py
+  - tests/v1/test_hybrid_lb_dp.py
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:

@@ -186,10 +180,10 @@ steps:
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py

@@ -296,34 +290,26 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - pytest -v -s v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
   - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
+  - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
-  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-  - pytest -v -s -m 'not cpu_test' v1/metrics
+  - pytest -v -s v1/kv_connector/unit
+  - pytest -v -s v1/metrics
+  - pytest -v -s v1/test_kv_sharing.py
+  - pytest -v -s v1/test_metrics_reader.py
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
+  - pytest -v -s v1/test_serial_utils.py
+  - pytest -v -s v1/test_utils.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

-- label: V1 Test others (CPU) # 5 mins
-  source_file_dependencies:
-  - vllm/
-  - tests/v1
-  no_gpu: true
-  commands:
-  # split the test to avoid interference
-  - pytest -v -s v1/core
-  - pytest -v -s v1/structured_output
-  - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
-  - pytest -v -s -m 'cpu_test' v1/metrics
-
-
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]

@@ -397,7 +383,6 @@ steps:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py

@@ -477,23 +462,33 @@ steps:
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
-  - vllm/model_executor/layers/mamba/ops
   commands:
   - pytest -v -s kernels/mamba

-- label: Model Executor Test # 23min
-  timeout_in_minutes: 35
+- label: Tensorizer Test # 14min
+  timeout_in_minutes: 25
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
-  - vllm/model_executor
-  - tests/model_executor
+  - vllm/model_executor/model_loader
+  - tests/tensorizer_loader
   - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s model_executor
+  - pytest -v -s tensorizer_loader
   - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

+- label: Model Executor Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+  - apt-get update && apt-get install -y curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s model_executor
+
 - label: Benchmarks # 11min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]

@@ -527,7 +522,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75

@@ -555,17 +550,10 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/tool_use
+  - tests/mistral_tool_use
   commands:
-  - pytest -v -s -m 'not cpu_test' tool_use
-
-- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
-  timeout_in_minutes: 10
-  source_file_dependencies:
-  - vllm/
-  - tests/tool_use
-  no_gpu: true
-  commands:
-  - pytest -v -s -m 'cpu_test' tool_use
+  - pytest -v -s tool_use
+  - pytest -v -s mistral_tool_use

 ##### models test #####

@@ -605,19 +593,13 @@ steps:
   - vllm/
   - tests/models/test_transformers.py
   - tests/models/test_registry.py
-  commands:
-  - pytest -v -s models/test_transformers.py models/test_registry.py
-
-- label: Basic Models Test (Other CPU) # 5min
-  timeout_in_minutes: 10
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
   - tests/models/test_utils.py
   - tests/models/test_vision.py
-  no_gpu: true
   commands:
-  - pytest -v -s models/test_utils.py models/test_vision.py
+  - pytest -v -s models/test_transformers.py \
+          models/test_registry.py \
+          models/test_utils.py \
+          models/test_vision.py

 - label: Language Models Tests (Standard)
   timeout_in_minutes: 25

@@ -787,7 +769,6 @@ steps:
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
   - pytest -v -s tests/models/test_initialization.py
-  - pytest -v -s tests/models/test_transformers.py
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py

@@ -835,11 +816,11 @@ steps:
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

-- label: Blackwell GPT-OSS Eval
+- label: GPT-OSS Eval (Blackwell)
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
-  optional: true # run on nightlies
+  optional: true # disable while debugging
   source_file_dependencies:
   - tests/evals/gpt_oss
   - vllm/model_executor/models/gpt_oss.py

@@ -847,34 +828,7 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   commands:
   - uv pip install --system 'gpt-oss[eval]==0.0.5'
-  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'

-- label: Blackwell Quantized MoE Test
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - tests/quantization/test_blackwell_moe.py
-  - vllm/model_executor/models/deepseek_v2.py
-  - vllm/model_executor/models/gpt_oss.py
-  - vllm/model_executor/models/llama4.py
-  - vllm/model_executor/layers/fused_moe
-  - vllm/model_executor/layers/quantization/compressed_tensors
-  - vllm/model_executor/layers/quantization/modelopt.py
-  - vllm/model_executor/layers/quantization/mxfp4.py
-  - vllm/v1/attention/backends/flashinfer.py
-  commands:
-  - pytest -s -v tests/quantization/test_blackwell_moe.py
-
-- label: Blackwell LM Eval Small Models
-  timeout_in_minutes: 75
-  gpu: b200
-  optional: true # run on nightlies
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -935,13 +889,14 @@ steps:
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/distributed
+  - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
   - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
.github/CODEOWNERS (11 changes)
@@ -12,6 +12,8 @@
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang

@@ -23,17 +25,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/attention @LucasWilkinson
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
-/vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC

@@ -55,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
.github/mergify.yml (33 changes)
@@ -2,7 +2,6 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
-    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/

@@ -15,7 +14,6 @@ pull_request_rules:
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
-    - label != stale
     - or:
       - files~=^\.github/
      - files~=\.buildkite/

@@ -32,7 +30,6 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py

@@ -49,7 +46,6 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
-    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:

@@ -59,7 +55,6 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py

@@ -75,7 +70,6 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/

@@ -89,7 +83,6 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
-    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py

@@ -101,7 +94,6 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/

@@ -115,7 +107,6 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py

@@ -130,7 +121,6 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py

@@ -152,7 +142,6 @@ pull_request_rules:
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
-    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm

@@ -173,7 +162,6 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
-    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py

@@ -193,7 +181,6 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/

@@ -209,7 +196,6 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/

@@ -222,7 +208,6 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
-    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu

@@ -238,7 +223,6 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
-    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu

@@ -253,9 +237,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
-    - label != stale
     - or:
       - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/

@@ -272,9 +256,8 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
-    - label != stale
     - conflict
     - -closed
   actions:
     label:
       add:

@@ -288,12 +271,10 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
   conditions:
-    - label != stale
-    - or:
       - files~=^vllm/model_executor/model_loader/tensorizer.py
       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/model_executor/model_loader/tensorizer_loader/
+      - files~=^tests/tensorizer_loader/
   actions:
     assign:
       users:

@@ -301,7 +282,6 @@ pull_request_rules:

 - name: assign reviewer for modelopt changes
   conditions:
-    - label != stale
     - or:
       - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
       - files~=^vllm/model_executor/layers/quantization/__init__\.py$

@@ -316,8 +296,8 @@ pull_request_rules:

 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:

@@ -326,7 +306,6 @@ pull_request_rules:
 - name: label-kv-connector
   description: Automatically apply kv-connector label
   conditions:
-    - label != stale
     - or:
       - files~=^examples/online_serving/disaggregated[^/]*/.*
       - files~=^examples/offline_inference/disaggregated[^/]*/.*
|
2
.github/workflows/stale.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
|||||||
actions: write
|
actions: write
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
|
||||||
with:
|
with:
|
||||||
# Increasing this value ensures that changes to this workflow
|
# Increasing this value ensures that changes to this workflow
|
||||||
# propagate to all issues and PRs in days rather than months
|
# propagate to all issues and PRs in days rather than months
|
||||||
|
@@ -6,16 +6,28 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.13.3
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
   hooks:
-  - id: ruff-check
+  - id: yapf
+    args: [--in-place, --verbose]
+    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
+    exclude: '(.buildkite|benchmarks|build|examples)/.*'
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.11.7
+  hooks:
+  - id: ruff
     args: [--output-format, github, --fix]
   - id: ruff-format
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
   rev: v1.35.5
   hooks:
   - id: typos
+- repo: https://github.com/PyCQA/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v20.1.3
   hooks:
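Both generations of this config are driven the same way locally; a small usage sketch:

```bash
pip install pre-commit
pre-commit install          # run the hooks automatically on each git commit
pre-commit run --all-files  # run every configured hook once across the repo
```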
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.

@@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
-elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()

@@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-#
-# Set compression mode for CUDA >=13.x.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA" AND
-   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
-  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
-endif()
-
 #
 # Set CUDA include flags for CXX compiler.
 #

@@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -317,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
     "csrc/quantization/fp8/per_token_group_quant.cu")

@@ -451,11 +440,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
-  endif()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"

@@ -485,11 +470,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||||
@ -569,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -592,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# FP4 Archs and flags
|
# FP4 Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -618,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
# CUTLASS MLA Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
||||||
@ -666,11 +635,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
@ -691,11 +656,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
@ -714,11 +675,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
|
@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
|||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
|
||||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||||
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||||
|
@ -2,9 +2,9 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from benchmark_utils import TimeCollector
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
from vllm.v1.core.block_pool import BlockPool
|
from vllm.v1.core.block_pool import BlockPool
|
||||||
|
|
||||||
|
@ -5,9 +5,9 @@ import time
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from benchmark_utils import TimeCollector
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
@ -164,7 +164,7 @@ def invoke_main() -> None:
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batched", action="store_true", help="consider time to prepare batch"
|
"--batched", action="store_true", help="consider time to prepare batch"
|
||||||
)
|
) # noqa: E501
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-iteration",
|
"--num-iteration",
|
||||||
type=int,
|
type=int,
|
||||||
|
@ -37,13 +37,14 @@ from typing import Optional
|
|||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from backend_request_func import (
|
from backend_request_func import (
|
||||||
ASYNC_REQUEST_FUNCS,
|
ASYNC_REQUEST_FUNCS,
|
||||||
RequestFuncInput,
|
RequestFuncInput,
|
||||||
RequestFuncOutput,
|
RequestFuncOutput,
|
||||||
)
|
)
|
||||||
from tqdm.asyncio import tqdm
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
@ -909,13 +910,13 @@ def create_argument_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer",
|
"--tokenizer",
|
||||||
type=str,
|
type=str,
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer-mode",
|
"--tokenizer-mode",
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-prompts",
|
"--num-prompts",
|
||||||
|
@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
w8a8_triton_block_scaled_mm,
|
w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.utils import FlexibleArgumentParser, cdiv
|
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||||
|
|
||||||
@ -158,7 +158,7 @@ def bench_fp8(
|
|||||||
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
||||||
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
||||||
),
|
),
|
||||||
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
|
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
|
||||||
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
||||||
),
|
),
|
||||||
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
||||||
|
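The benchmark hunk above renames `w8a8_block_fp8_matmul` to `w8a8_triton_block_scaled_mm` inside a dict of zero-argument lambdas, one per implementation. For readers unfamiliar with how such a dict is typically timed, here is a minimal CUDA-event timing loop; the helper name, the `impls` variable, and the iteration counts are illustrative, not part of the benchmark file.

```python
import torch

def time_gpu(fn, warmup: int = 10, iters: int = 100) -> float:
    """Average milliseconds per call for a zero-argument CUDA workload."""
    for _ in range(warmup):
        fn()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

# e.g. time_gpu(impls["triton_fp8_fp8_fp16_scaled_mm_blockwise"]) for each entry
# of the lambda dict shown in the hunk above ("impls" is a hypothetical name).
```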
@ -55,7 +55,9 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
@ -63,7 +65,9 @@ benchmark() {
|
|||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
@ -38,12 +38,16 @@ wait_for_server() {
|
|||||||
launch_chunked_prefill() {
|
launch_chunked_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
--gpu-memory-utilization 0.6 &
|
--gpu-memory-utilization 0.6 &
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
@ -58,14 +62,18 @@ launch_chunked_prefill() {
|
|||||||
launch_disagg_prefill() {
|
launch_disagg_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
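Both scripts above now launch the servers with `vllm serve`, which still exposes the same OpenAI-compatible HTTP API on the chosen ports. A minimal smoke test against the prefill instance might look like the following; it assumes the first server from the script is up on localhost:8100 and that `$model` was the Llama-3.1-8B-Instruct checkpoint used there.

```python
import requests

resp = requests.post(
    "http://localhost:8100/v1/completions",
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 16,
        "temperature": 0,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```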
@ -1,174 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from tabulate import tabulate
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.logger import init_logger
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils import (
|
|
||||||
STR_DTYPE_TO_TORCH_DTYPE,
|
|
||||||
FlexibleArgumentParser,
|
|
||||||
create_kv_caches_with_random,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
|
||||||
def run_benchmark(
|
|
||||||
num_tokens: int,
|
|
||||||
num_heads: int,
|
|
||||||
head_size: int,
|
|
||||||
block_size: int,
|
|
||||||
num_blocks: int,
|
|
||||||
dtype: torch.dtype,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
num_iters: int,
|
|
||||||
benchmark_mode: str,
|
|
||||||
device: str = "cuda",
|
|
||||||
) -> float:
|
|
||||||
"""Return latency (seconds) for given num_tokens."""
|
|
||||||
|
|
||||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
|
||||||
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
|
||||||
|
|
||||||
current_platform.seed_everything(42)
|
|
||||||
torch.set_default_device(device)
|
|
||||||
|
|
||||||
# create random key / value tensors [T, H, D].
|
|
||||||
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
|
|
||||||
value = torch.randn_like(key)
|
|
||||||
|
|
||||||
# prepare the slot mapping.
|
|
||||||
# each token is assigned a unique slot in the KV-cache.
|
|
||||||
num_slots = block_size * num_blocks
|
|
||||||
if num_tokens > num_slots:
|
|
||||||
raise ValueError("num_tokens cannot exceed the total number of cache slots")
|
|
||||||
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
|
||||||
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
|
|
||||||
|
|
||||||
key_caches, value_caches = create_kv_caches_with_random(
|
|
||||||
num_blocks,
|
|
||||||
block_size,
|
|
||||||
1, # num_layers
|
|
||||||
num_heads,
|
|
||||||
head_size,
|
|
||||||
kv_cache_dtype,
|
|
||||||
dtype,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
|
||||||
# to free unused memory
|
|
||||||
del key_caches, value_caches
|
|
||||||
|
|
||||||
# compute per-kernel scaling factors for fp8 conversion (if used).
|
|
||||||
k_scale = (key.amax() / 64.0).to(torch.float32)
|
|
||||||
v_scale = (value.amax() / 64.0).to(torch.float32)
|
|
||||||
|
|
||||||
function_under_test = lambda: ops.reshape_and_cache(
|
|
||||||
key, # noqa: F821
|
|
||||||
value, # noqa: F821
|
|
||||||
key_cache, # noqa: F821
|
|
||||||
value_cache, # noqa: F821
|
|
||||||
slot_mapping, # noqa: F821
|
|
||||||
kv_cache_dtype,
|
|
||||||
k_scale,
|
|
||||||
v_scale,
|
|
||||||
)
|
|
||||||
|
|
||||||
if benchmark_mode == "cudagraph":
|
|
||||||
g = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(g):
|
|
||||||
function_under_test()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
function_under_test = lambda: g.replay()
|
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
start = time.perf_counter()
|
|
||||||
for _ in range(n_iters):
|
|
||||||
function_under_test()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
end = time.perf_counter()
|
|
||||||
return (end - start) / n_iters
|
|
||||||
|
|
||||||
# warm-up
|
|
||||||
run_cuda_benchmark(3)
|
|
||||||
|
|
||||||
lat = run_cuda_benchmark(num_iters)
|
|
||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
return lat
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
rows = []
|
|
||||||
for exp in range(1, 17):
|
|
||||||
n_tok = 2**exp
|
|
||||||
lat = run_benchmark(
|
|
||||||
num_tokens=n_tok,
|
|
||||||
num_heads=args.num_heads,
|
|
||||||
head_size=args.head_size,
|
|
||||||
block_size=args.block_size,
|
|
||||||
num_blocks=args.num_blocks,
|
|
||||||
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
|
||||||
kv_cache_dtype=args.kv_cache_dtype,
|
|
||||||
num_iters=args.iters,
|
|
||||||
benchmark_mode=args.mode,
|
|
||||||
device="cuda",
|
|
||||||
)
|
|
||||||
rows.append([n_tok, lat * 1e6]) # convert to microseconds
|
|
||||||
|
|
||||||
print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
|
|
||||||
print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = FlexibleArgumentParser()
|
|
||||||
|
|
||||||
parser.add_argument("--num-heads", type=int, default=128)
|
|
||||||
parser.add_argument(
|
|
||||||
"--head-size",
|
|
||||||
type=int,
|
|
||||||
choices=[64, 80, 96, 112, 120, 128, 192, 256],
|
|
||||||
default=128,
|
|
||||||
)
|
|
||||||
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
|
||||||
parser.add_argument("--num-blocks", type=int, default=128 * 128)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--dtype",
|
|
||||||
type=str,
|
|
||||||
choices=["half", "bfloat16", "float"],
|
|
||||||
default="bfloat16",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--kv-cache-dtype",
|
|
||||||
type=str,
|
|
||||||
choices=["auto", "fp8"],
|
|
||||||
default="auto",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("--iters", type=int, default=200)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--mode",
|
|
||||||
type=str,
|
|
||||||
choices=["cudagraph", "no_graph"],
|
|
||||||
default="cudagraph",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
main(args)
|
|
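The file removed above timed `reshape_and_cache` either eagerly or by replaying a captured CUDA graph. The capture/replay pattern it relied on is standard PyTorch and worth keeping in mind; the sketch below reproduces it with a trivial stand-in kernel (the multiply is a placeholder, not the cache op, and the sizes are arbitrary).

```python
import torch

x = torch.randn(1024, device="cuda")
y = torch.empty_like(x)

def kernel():
    torch.mul(x, 2.0, out=y)  # stand-in for ops.reshape_and_cache(...)

# Warm up on a side stream before capture, as recommended for CUDA graphs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    kernel()
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    kernel()

for _ in range(100):
    g.replay()  # re-runs the captured kernel against the same fixed tensors
torch.cuda.synchronize()
```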
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
# fmt: off
|
||||||
# ruff: noqa: E501
|
# ruff: noqa: E501
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -8,7 +9,7 @@ import torch
|
|||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
per_token_group_quant_fp8,
|
per_token_group_quant_fp8,
|
||||||
w8a8_triton_block_scaled_mm,
|
w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.deep_gemm import (
|
from vllm.utils.deep_gemm import (
|
||||||
@ -19,21 +20,19 @@ from vllm.utils.deep_gemm import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_shape(
|
def benchmark_shape(m: int,
|
||||||
m: int,
|
n: int,
|
||||||
n: int,
|
k: int,
|
||||||
k: int,
|
warmup: int = 100,
|
||||||
warmup: int = 100,
|
repeat: int = 10000,
|
||||||
repeat: int = 10000,
|
verbose: bool = False) -> dict:
|
||||||
verbose: bool = False,
|
|
||||||
) -> dict:
|
|
||||||
"""Benchmark all implementations for a specific (m, n, k) shape."""
|
"""Benchmark all implementations for a specific (m, n, k) shape."""
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
|
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
|
||||||
|
|
||||||
# Create test tensors
|
# Create test tensors
|
||||||
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
|
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
|
||||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Reference result in BF16
|
# Reference result in BF16
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
@ -50,39 +49,34 @@ def benchmark_shape(
|
|||||||
# Pre-quantize A for all implementations
|
# Pre-quantize A for all implementations
|
||||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
|
||||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||||
A, block_size[1], column_major_scales=True
|
A, block_size[1], column_major_scales=True)
|
||||||
)
|
|
||||||
|
|
||||||
# === DeepGEMM Implementation ===
|
# === DeepGEMM Implementation ===
|
||||||
def deepgemm_gemm():
|
def deepgemm_gemm():
|
||||||
fp8_gemm_nt(
|
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
|
||||||
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
|
(B_deepgemm, B_scale_deepgemm),
|
||||||
)
|
C_deepgemm)
|
||||||
return C_deepgemm
|
return C_deepgemm
|
||||||
|
|
||||||
# === vLLM Triton Implementation ===
|
# === vLLM Triton Implementation ===
|
||||||
def vllm_triton_gemm():
|
def vllm_triton_gemm():
|
||||||
return w8a8_triton_block_scaled_mm(
|
return w8a8_block_fp8_matmul(A_vllm,
|
||||||
A_vllm,
|
B_vllm,
|
||||||
B_vllm,
|
A_scale_vllm,
|
||||||
A_scale_vllm,
|
B_scale_vllm,
|
||||||
B_scale_vllm,
|
block_size,
|
||||||
block_size,
|
output_dtype=torch.bfloat16)
|
||||||
output_dtype=torch.bfloat16,
|
|
||||||
)
|
|
||||||
|
|
||||||
# === vLLM CUTLASS Implementation ===
|
# === vLLM CUTLASS Implementation ===
|
||||||
def vllm_cutlass_gemm():
|
def vllm_cutlass_gemm():
|
||||||
return ops.cutlass_scaled_mm(
|
return ops.cutlass_scaled_mm(A_vllm_cutlass,
|
||||||
A_vllm_cutlass,
|
B_vllm.T,
|
||||||
B_vllm.T,
|
scale_a=A_scale_vllm_cutlass,
|
||||||
scale_a=A_scale_vllm_cutlass,
|
scale_b=B_scale_vllm.T,
|
||||||
scale_b=B_scale_vllm.T,
|
out_dtype=torch.bfloat16)
|
||||||
out_dtype=torch.bfloat16,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Run correctness check first
|
# Run correctness check first
|
||||||
if verbose:
|
if verbose:
|
||||||
@ -99,23 +93,26 @@ def benchmark_shape(
|
|||||||
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
|
||||||
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
|
||||||
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
|
||||||
print(
|
print("vLLM Triton vs DeepGEMM difference: "
|
||||||
"vLLM Triton vs DeepGEMM difference: "
|
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
|
||||||
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
|
print("vLLM CUTLASS vs DeepGEMM difference: "
|
||||||
)
|
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
|
||||||
print(
|
|
||||||
"vLLM CUTLASS vs DeepGEMM difference: "
|
|
||||||
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Benchmark implementations
|
# Benchmark implementations
|
||||||
implementations = {
|
implementations = {
|
||||||
"DeepGEMM": deepgemm_gemm,
|
"DeepGEMM": deepgemm_gemm,
|
||||||
"vLLM Triton": vllm_triton_gemm,
|
"vLLM Triton": vllm_triton_gemm,
|
||||||
"vLLM CUTLASS": vllm_cutlass_gemm,
|
"vLLM CUTLASS": vllm_cutlass_gemm
|
||||||
}
|
}
|
||||||
|
|
||||||
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
|
benchmark_results = {
|
||||||
|
"shape": {
|
||||||
|
"m": m,
|
||||||
|
"n": n,
|
||||||
|
"k": k
|
||||||
|
},
|
||||||
|
"implementations": {}
|
||||||
|
}
|
||||||
|
|
||||||
for name, func in implementations.items():
|
for name, func in implementations.items():
|
||||||
# Warmup
|
# Warmup
|
||||||
@ -143,36 +140,38 @@ def benchmark_shape(
|
|||||||
"tflops": tflops,
|
"tflops": tflops,
|
||||||
"gb_s": gb_s,
|
"gb_s": gb_s,
|
||||||
"diff": {
|
"diff": {
|
||||||
"DeepGEMM": 0.0
|
"DeepGEMM":
|
||||||
if name == "DeepGEMM"
|
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
|
||||||
else calc_diff(func(), C_deepgemm),
|
"Reference":
|
||||||
"Reference": deepgemm_diff
|
deepgemm_diff if name == "DeepGEMM" else
|
||||||
if name == "DeepGEMM"
|
(vllm_triton_diff
|
||||||
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
|
if name == "vLLM Triton" else vllm_cutlass_diff)
|
||||||
},
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
|
print(
|
||||||
|
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
|
||||||
|
)
|
||||||
|
|
||||||
# Calculate speedups
|
# Calculate speedups
|
||||||
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
for name, data in benchmark_results["implementations"].items():
|
for name, data in benchmark_results["implementations"].items():
|
||||||
if name != "DeepGEMM":
|
if name != "DeepGEMM":
|
||||||
speedup = baseline / data["time_ms"]
|
speedup = baseline / data["time_ms"]
|
||||||
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
|
benchmark_results["implementations"][name][
|
||||||
|
"speedup_vs_deepgemm"] = speedup
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(f"DeepGEMM is {1/speedup:.2f}x "
|
||||||
f"DeepGEMM is {1 / speedup:.2f}x "
|
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
|
||||||
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
|
||||||
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
|
"time_ms"]
|
||||||
|
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||||
|
"time_ms"]
|
||||||
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
|
||||||
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
|
benchmark_results["implementations"]["vLLM CUTLASS"][
|
||||||
cutlass_vs_triton
|
"speedup_vs_triton"] = cutlass_vs_triton
|
||||||
)
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(
|
print(
|
||||||
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
|
||||||
@ -184,7 +183,8 @@ def benchmark_shape(
|
|||||||
|
|
||||||
def format_table_row(values, widths):
|
def format_table_row(values, widths):
|
||||||
"""Format a row with specified column widths."""
|
"""Format a row with specified column widths."""
|
||||||
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
|
return "| " + " | ".join(f"{val:{w}}"
|
||||||
|
for val, w in zip(values, widths)) + " |"
|
||||||
|
|
||||||
|
|
||||||
def print_table(headers, rows, title=None):
|
def print_table(headers, rows, title=None):
|
||||||
@ -292,50 +292,38 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["DeepGEMM"]
|
impl_data = result["implementations"]["DeepGEMM"]
|
||||||
deepgemm_rows.append(
|
deepgemm_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
|
||||||
shape["n"],
|
])
|
||||||
shape["k"],
|
|
||||||
f"{impl_data['time_us']:.1f}",
|
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
|
print_table(deepgemm_headers,
|
||||||
|
deepgemm_rows,
|
||||||
|
title="DeepGEMM Implementation:")
|
||||||
|
|
||||||
# Print vLLM Triton table
|
# Print vLLM Triton table
|
||||||
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
|
triton_headers = [
|
||||||
|
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
|
||||||
|
]
|
||||||
triton_rows = []
|
triton_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
shape = result["shape"]
|
shape = result["shape"]
|
||||||
impl_data = result["implementations"]["vLLM Triton"]
|
impl_data = result["implementations"]["vLLM Triton"]
|
||||||
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
triton_rows.append(
|
triton_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||||
shape["n"],
|
format_speedup(speedup)
|
||||||
shape["k"],
|
])
|
||||||
f"{impl_data['time_us']:.1f}",
|
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
format_speedup(speedup),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
|
print_table(triton_headers,
|
||||||
|
triton_rows,
|
||||||
|
title="vLLM Triton Implementation:")
|
||||||
|
|
||||||
# Print vLLM CUTLASS table
|
# Print vLLM CUTLASS table
|
||||||
cutlass_headers = [
|
cutlass_headers = [
|
||||||
"m",
|
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
|
||||||
"n",
|
"vs Triton"
|
||||||
"k",
|
|
||||||
"Time (μs)",
|
|
||||||
"TFLOPS",
|
|
||||||
"GB/s",
|
|
||||||
"vs DeepGEMM",
|
|
||||||
"vs Triton",
|
|
||||||
]
|
]
|
||||||
cutlass_rows = []
|
cutlass_rows = []
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -343,27 +331,28 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
impl_data = result["implementations"]["vLLM CUTLASS"]
|
impl_data = result["implementations"]["vLLM CUTLASS"]
|
||||||
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
|
||||||
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
|
||||||
cutlass_rows.append(
|
cutlass_rows.append([
|
||||||
[
|
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
|
||||||
shape["m"],
|
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
|
||||||
shape["n"],
|
format_speedup(vs_deepgemm),
|
||||||
shape["k"],
|
format_speedup(vs_triton)
|
||||||
f"{impl_data['time_us']:.1f}",
|
])
|
||||||
f"{impl_data['tflops']:.1f}",
|
|
||||||
f"{impl_data['gb_s']:.1f}",
|
|
||||||
format_speedup(vs_deepgemm),
|
|
||||||
format_speedup(vs_triton),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
|
print_table(cutlass_headers,
|
||||||
|
cutlass_rows,
|
||||||
|
title="vLLM CUTLASS Implementation:")
|
||||||
|
|
||||||
# Calculate and print averages
|
# Calculate and print averages
|
||||||
print("\n===== AVERAGE PERFORMANCE =====")
|
print("\n===== AVERAGE PERFORMANCE =====")
|
||||||
|
|
||||||
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
|
||||||
avg_metrics = {
|
avg_metrics = {
|
||||||
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
|
impl: {
|
||||||
|
"tflops": 0,
|
||||||
|
"gb_s": 0,
|
||||||
|
"time_ms": 0
|
||||||
|
}
|
||||||
|
for impl in implementations
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
@ -381,9 +370,9 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
|
||||||
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
|
||||||
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
|
||||||
avg_rows.append(
|
avg_rows.append([
|
||||||
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
|
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
|
||||||
)
|
])
|
||||||
|
|
||||||
print_table(avg_headers, avg_rows)
|
print_table(avg_headers, avg_rows)
|
||||||
|
|
||||||
@ -391,19 +380,21 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
avg_speedups = {
|
avg_speedups = {
|
||||||
"DeepGEMM vs vLLM Triton": 0,
|
"DeepGEMM vs vLLM Triton": 0,
|
||||||
"DeepGEMM vs vLLM CUTLASS": 0,
|
"DeepGEMM vs vLLM CUTLASS": 0,
|
||||||
"vLLM CUTLASS vs vLLM Triton": 0,
|
"vLLM CUTLASS vs vLLM Triton": 0
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
|
||||||
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
|
||||||
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
|
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
|
||||||
|
"time_ms"]
|
||||||
|
|
||||||
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
avg_speedups[
|
||||||
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
|
||||||
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
|
avg_speedups[
|
||||||
vllm_triton_time / vllm_cutlass_time
|
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
|
||||||
)
|
avg_speedups[
|
||||||
|
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
|
||||||
|
|
||||||
print("\n===== AVERAGE SPEEDUPS =====")
|
print("\n===== AVERAGE SPEEDUPS =====")
|
||||||
speedup_headers = ["Comparison", "Speedup"]
|
speedup_headers = ["Comparison", "Speedup"]
|
||||||
@ -421,7 +412,8 @@ def run_benchmarks(verbose: bool = False):
|
|||||||
|
|
||||||
for result in all_results:
|
for result in all_results:
|
||||||
for impl in implementations:
|
for impl in implementations:
|
||||||
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
|
avg_diff[impl] += result["implementations"][impl]["diff"][
|
||||||
|
"Reference"]
|
||||||
|
|
||||||
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
diff_headers = ["Implementation", "Avg Diff vs Reference"]
|
||||||
diff_rows = []
|
diff_rows = []
|
||||||
|
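The large hunk above (like the benchmark hunks before it) is behaviour-preserving: it only moves the file from yapf's parenthesis-aligned wrapping to ruff's 88-column style. The contrast, using an arbitrary call against a stub of the module's own `benchmark_shape` purely for illustration:

```python
def benchmark_shape(m, n, k, warmup=100, repeat=10000, verbose=False):
    ...  # stand-in for the real function in the file above

# yapf (old): continuation arguments aligned under the opening parenthesis.
results = benchmark_shape(m=1024, n=24576, k=1536,
                          warmup=100,
                          repeat=10000,
                          verbose=True)

# ruff format (new): a call that does not fit in 88 columns is exploded to one
# argument per line with a trailing comma, which then keeps it in this shape.
results = benchmark_shape(
    m=1024,
    n=24576,
    k=1536,
    warmup=100,
    repeat=10000,
    verbose=True,
)
```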
49
benchmarks/pyproject.toml
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||||
|
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||||
|
# following differences:
|
||||||
|
# - ruff line length is overridden to 88
|
||||||
|
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 88
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
"vllm/version.py" = ["F401"]
|
||||||
|
"vllm/_version.py" = ["ALL"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
# pycodestyle
|
||||||
|
"E",
|
||||||
|
# Pyflakes
|
||||||
|
"F",
|
||||||
|
# pyupgrade
|
||||||
|
"UP",
|
||||||
|
# flake8-bugbear
|
||||||
|
"B",
|
||||||
|
# flake8-simplify
|
||||||
|
"SIM",
|
||||||
|
# isort
|
||||||
|
"I",
|
||||||
|
# flake8-logging-format
|
||||||
|
"G",
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
# star imports
|
||||||
|
"F405", "F403",
|
||||||
|
# lambda expression assignment
|
||||||
|
"E731",
|
||||||
|
# Loop control variable not used within loop body
|
||||||
|
"B007",
|
||||||
|
# f-string format
|
||||||
|
"UP032",
|
||||||
|
# Can remove once 3.10+ is the minimum Python version
|
||||||
|
"UP007",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
known-first-party = ["vllm"]
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
docstring-code-format = true
|
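The isort section above (with known-first-party = ["vllm"]) is what drives the import reshuffling seen in the earlier benchmark hunks, where `from benchmark_utils import TimeCollector` moved relative to the third-party imports. Under these settings a module's imports end up grouped roughly as below; the specific names are just representative examples taken from the diffs above.

```python
# standard library
import time

# third-party, sorted alphabetically within the group
import numpy as np
from tabulate import tabulate

# first-party ("vllm" is declared as known-first-party above)
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
```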
@ -213,7 +213,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
endif()
|
endif()
|
||||||
set(ONEDNN_AARCH64_USE_ACL "ON")
|
set(ONEDNN_AARCH64_USE_ACL "ON")
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
|
||||||
add_compile_definitions(VLLM_USE_ACL)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
@ -227,7 +226,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
||||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
||||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
||||||
set(ONEDNN_VERBOSE "ON")
|
set(ONEDNN_VERBOSE "OFF")
|
||||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
FetchContent_MakeAvailable(oneDNN)
|
FetchContent_MakeAvailable(oneDNN)
|
||||||
|
@ -38,7 +38,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
|
GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
@ -16,7 +16,7 @@ import shutil
|
|||||||
|
|
||||||
from torch.utils.hipify.hipify_python import hipify
|
from torch.utils.hipify.hipify_python import hipify
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
# Project directory where all the source + include files live.
|
# Project directory where all the source + include files live.
|
||||||
@ -34,14 +34,15 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Source files to convert.
|
# Source files to convert.
|
||||||
parser.add_argument(
|
parser.add_argument("sources",
|
||||||
"sources", help="Source files to hipify.", nargs="*", default=[]
|
help="Source files to hipify.",
|
||||||
)
|
nargs="*",
|
||||||
|
default=[])
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Limit include scope to project_dir only
|
# Limit include scope to project_dir only
|
||||||
includes = [os.path.join(args.project_dir, "*")]
|
includes = [os.path.join(args.project_dir, '*')]
|
||||||
|
|
||||||
# Get absolute path for all source files.
|
# Get absolute path for all source files.
|
||||||
extra_files = [os.path.abspath(s) for s in args.sources]
|
extra_files = [os.path.abspath(s) for s in args.sources]
|
||||||
@ -50,31 +51,25 @@ if __name__ == "__main__":
|
|||||||
# The directory might already exist to hold object files so we ignore that.
|
# The directory might already exist to hold object files so we ignore that.
|
||||||
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
|
||||||
|
|
||||||
hipify_result = hipify(
|
hipify_result = hipify(project_directory=args.project_dir,
|
||||||
project_directory=args.project_dir,
|
output_directory=args.output_dir,
|
||||||
output_directory=args.output_dir,
|
header_include_dirs=[],
|
||||||
header_include_dirs=[],
|
includes=includes,
|
||||||
includes=includes,
|
extra_files=extra_files,
|
||||||
extra_files=extra_files,
|
show_detailed=True,
|
||||||
show_detailed=True,
|
is_pytorch_extension=True,
|
||||||
is_pytorch_extension=True,
|
hipify_extra_files_only=True)
|
||||||
hipify_extra_files_only=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
hipified_sources = []
|
hipified_sources = []
|
||||||
for source in args.sources:
|
for source in args.sources:
|
||||||
s_abs = os.path.abspath(source)
|
s_abs = os.path.abspath(source)
|
||||||
hipified_s_abs = (
|
hipified_s_abs = (hipify_result[s_abs].hipified_path if
|
||||||
hipify_result[s_abs].hipified_path
|
(s_abs in hipify_result
|
||||||
if (
|
and hipify_result[s_abs].hipified_path is not None)
|
||||||
s_abs in hipify_result
|
else s_abs)
|
||||||
and hipify_result[s_abs].hipified_path is not None
|
|
||||||
)
|
|
||||||
else s_abs
|
|
||||||
)
|
|
||||||
hipified_sources.append(hipified_s_abs)
|
hipified_sources.append(hipified_s_abs)
|
||||||
|
|
||||||
assert len(hipified_sources) == len(args.sources)
|
assert (len(hipified_sources) == len(args.sources))
|
||||||
|
|
||||||
# Print hipified source files.
|
# Print hipified source files.
|
||||||
print("\n".join(hipified_sources))
|
print("\n".join(hipified_sources))
|
||||||
|
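The tail of the script above resolves each input through `hipify_result`, falling back to the original path when hipify produced nothing for it. A toy rendering of that lookup — the `_Result` class and the paths are made up; the real mapping objects come from torch's hipify utility:

```python
class _Result:  # stand-in for the objects torch's hipify returns
    def __init__(self, hipified_path):
        self.hipified_path = hipified_path

hipify_result = {
    "/abs/csrc/cache_kernels.cu": _Result("/abs/out/cache_kernels.hip"),
    "/abs/csrc/ops.h": _Result(None),  # seen by hipify, but nothing was rewritten
}

def resolve(s_abs: str) -> str:
    if s_abs in hipify_result and hipify_result[s_abs].hipified_path is not None:
        return hipify_result[s_abs].hipified_path
    return s_abs  # untouched or unknown files pass through unchanged

assert resolve("/abs/csrc/cache_kernels.cu") == "/abs/out/cache_kernels.hip"
assert resolve("/abs/csrc/ops.h") == "/abs/csrc/ops.h"
assert resolve("/abs/csrc/missing.cu") == "/abs/csrc/missing.cu"
```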
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
|
|||||||
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
||||||
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
||||||
|
|
||||||
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
||||||
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
|
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
|
||||||
set(_CUDA_ARCHS)
|
set(_CUDA_ARCHS)
|
||||||
foreach(_arch ${_SRC_CUDA_ARCHS})
|
foreach(_arch ${_SRC_CUDA_ARCHS})
|
||||||
if(_arch MATCHES "[af]$")
|
if(_arch MATCHES "\\a$")
|
||||||
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
||||||
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
|
string(REPLACE "a" "" _base "${_arch}")
|
||||||
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
||||||
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
||||||
list(APPEND _CUDA_ARCHS "${_arch}")
|
list(APPEND _CUDA_ARCHS "${_arch}")
|
||||||
|
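The hunk above generalises the helper from handling only the `a` suffix to handling `a` or `f` (family) suffixes. In Python terms, the visible part of the loop behaves roughly as sketched below; the plain intersection at the end is an assumption about the rest of the function, which the hunk cuts off.

```python
def cuda_archs_loose_intersection(src_archs, tgt_archs):
    src, tgt, out = set(src_archs), set(tgt_archs), set()
    for arch in sorted(src):
        if arch.endswith(("a", "f")):      # e.g. "10.0a" or "12.0f"
            src.discard(arch)
            base = arch[:-1]
            if base in tgt:                # "10.0" requested -> keep "10.0a"/"10.0f"
                tgt.discard(base)
                out.add(arch)
    out |= src & tgt                       # assumed: remaining archs intersect normally
    return sorted(out)

print(cuda_archs_loose_intersection(["9.0a", "10.0f", "7.5"], ["7.5", "9.0", "10.0", "12.0"]))
# ['10.0f', '7.5', '9.0a']
```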
@ -16,7 +16,9 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cfloat>
|
#include <cfloat> // FLT_MIN
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include <hip/hip_bf16.h>
|
#include <hip/hip_bf16.h>
|
||||||
@ -208,20 +210,6 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Used to copy/convert one element
|
|
||||||
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
|
||||||
struct CopyWithScaleOp {
|
|
||||||
float scale;
|
|
||||||
|
|
||||||
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
|
||||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
|
||||||
dst = static_cast<OutT>(src);
|
|
||||||
} else {
|
|
||||||
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_kernel(
|
__global__ void reshape_and_cache_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
@ -237,51 +225,59 @@ __global__ void reshape_and_cache_kernel(
|
|||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
if (slot_idx < 0) {
|
if (slot_idx < 0) {
|
||||||
|
// Padding token that should be ignored.
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t block_idx = slot_idx / block_size;
|
const int64_t block_idx = slot_idx / block_size;
|
||||||
const int64_t block_offset = slot_idx % block_size;
|
const int64_t block_offset = slot_idx % block_size;
|
||||||
const int h_block_count = head_size / x; // head_size//x
|
|
||||||
|
|
||||||
const int h_block_idx = threadIdx.x;
|
const int n = num_heads * head_size;
|
||||||
if (h_block_idx >= num_heads * h_block_count) {
|
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
||||||
return;
|
const int64_t src_key_idx = token_idx * key_stride + i;
|
||||||
}
|
const int64_t src_value_idx = token_idx * value_stride + i;
|
||||||
|
|
||||||
const int head_idx = h_block_idx / h_block_count;
|
const int head_idx = i / head_size;
|
||||||
const int h_block = h_block_idx % h_block_count;
|
const int head_offset = i % head_size;
|
||||||
|
const int x_idx = head_offset / x;
|
||||||
|
const int x_offset = head_offset % x;
|
||||||
|
|
||||||
const scalar_t* __restrict__ key_src =
|
const int64_t tgt_key_idx =
|
||||||
key + token_idx * key_stride + head_idx * head_size + h_block * x;
|
block_idx * num_heads * (head_size / x) * block_size * x +
|
||||||
const int64_t src_value_start =
|
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
|
||||||
token_idx * value_stride + head_idx * head_size + h_block * x;
|
block_offset * x + x_offset;
|
||||||
|
const int64_t tgt_value_idx =
|
||||||
cache_t* __restrict__ key_dst =
|
block_idx * num_heads * head_size * block_size +
|
||||||
key_cache + block_idx * num_heads * h_block_count * block_size * x +
|
head_idx * head_size * block_size + head_offset * block_size +
|
||||||
head_idx * h_block_count * block_size * x + h_block * block_size * x +
|
block_offset;
|
||||||
block_offset * x;
|
scalar_t tgt_key = key[src_key_idx];
|
||||||
const int64_t tgt_value_start =
|
scalar_t tgt_value = value[src_value_idx];
|
||||||
block_idx * num_heads * h_block_count * x * block_size +
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
head_idx * h_block_count * x * block_size + h_block * x * block_size +
|
key_cache[tgt_key_idx] = tgt_key;
|
||||||
block_offset;
|
value_cache[tgt_value_idx] = tgt_value;
|
||||||
|
} else {
|
||||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
key_cache[tgt_key_idx] =
|
||||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
value_cache[tgt_value_idx] =
|
||||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
|
||||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
}
|
||||||
|
|
||||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op);
|
|
||||||
|
|
||||||
const scalar_t* __restrict__ value_src = value + src_value_start;
|
|
||||||
cache_t* __restrict__ value_dst = value_cache + tgt_value_start;
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < x; i++) {
|
|
||||||
v_op(value_dst[i * block_size], value_src[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Used by vectorization_utils to copy/convert one element
|
||||||
|
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
|
||||||
|
struct CopyWithScaleOp {
|
||||||
|
float scale;
|
||||||
|
|
||||||
|
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
|
||||||
|
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||||
|
dst = static_cast<OutT>(src);
|
||||||
|
} else {
|
||||||
|
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||||
__global__ void reshape_and_cache_flash_kernel(
|
__global__ void reshape_and_cache_flash_kernel(
|
||||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||||
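The rewritten reshape_and_cache_kernel above copies each token's key in chunks of x contiguous elements (one vectorized copy per chunk via `vectorize_with_alignment`) instead of element by element, but the destination layout is unchanged: keys live in a [num_blocks, num_heads, head_size/x, block_size, x] paged cache addressed through slot_mapping (values use a [num_blocks, num_heads, head_size, block_size] layout, not shown). A NumPy sketch of the key scatter, with made-up sizes, for readers who prefer not to decode the index arithmetic:

```python
import numpy as np

num_tokens, num_heads, head_size = 6, 2, 16
block_size, num_blocks, x = 4, 3, 8           # x = elements moved per vectorized copy

key = np.random.randn(num_tokens, num_heads, head_size).astype(np.float32)
slot_mapping = np.array([0, 1, 5, 7, 8, 11])  # one cache slot per token; < 0 means padding

# Paged key-cache layout used by the kernel: [num_blocks, num_heads, head_size/x, block_size, x]
key_cache = np.zeros((num_blocks, num_heads, head_size // x, block_size, x), np.float32)

for token_idx, slot_idx in enumerate(slot_mapping):
    if slot_idx < 0:                          # padding token, skipped by the kernel as well
        continue
    block_idx, block_offset = divmod(int(slot_idx), block_size)
    # Each head is split into head_size // x chunks of x contiguous values;
    # one chunk corresponds to one vectorized copy in the CUDA kernel.
    chunks = key[token_idx].reshape(num_heads, head_size // x, x)
    key_cache[block_idx, :, :, block_offset, :] = chunks
```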
@@ -428,81 +424,84 @@ __global__ void concat_and_cache_ds_mla_kernel(
   const int64_t dst_idx_start =
       block_idx * block_stride + block_offset * entry_stride;

-  // For the NoPE part, each tile of 128 elements is handled by half of one warp
-  // (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-  // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-  // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads).
-  // So in total, we use 3 warps (96 threads) per block.
+  // Create 4 tile scales in shared memory
+  __shared__ float smem[20];
+  float* shard_abs_max = smem;
+  float* tile_scales = smem + 16;
+
+  // For the NoPE part, each tile of 128 elements is handled by 4 warps
+  // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+  // The first thread of the first warp in each tile writes the scale
+  // value for the tile. The RoPE part (last 64 elements) is handled
+  // by another 2 warps (64 threads).
+  // So in total, we use 18 warps (576 threads) per block.

   // Cast kv_cache to 16_bit for RoPE values
   scalar_t* kv_cache_16bit =
       reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);

-  // The last warp handles the RoPE part
-  if (threadIdx.x >= 64) {
-    // Each thread handles two elements of RoPE
-    const int8_t pe_idx_start = (threadIdx.x - 64) * 2;
-    const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start;
-    // Vectorized load of two 16-bit values, performed as one 32-bit load
-    const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]);
+  // The last 64 threads handle the RoPE part
+  if (threadIdx.x >= kv_lora_rank) {
+    const int8_t pe_idx = threadIdx.x - kv_lora_rank;
+    const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
     // RoPE values start after the packed 8-bit NoPE values and the
     // 32-bit scales
-    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start;
-    // Vectorized store of two 16-bit values, performed as one 32-bit store
-    *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals;
+    const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
+    kv_cache_16bit[dst_idx] = k_pe[src_idx];
     return;
   }

-  // The first two warps handle the NoPE part
-  const int8_t warp_idx = threadIdx.x >> 5;
-  const int8_t lane_idx = threadIdx.x & 31;
-  const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4);
+  // Determine the scale for each chunk of NoPE
+  const int16_t tile_idx = threadIdx.x >> 7;
+  const int16_t warp_idx = (threadIdx.x & 127) >> 5;
+  const int16_t lane_idx = threadIdx.x & 31;

-  // Each thread handles 8 elements of NoPE
-  // Load the NoPE elements for this thread into registers
-  const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8);
-  // Vectorized load of eight 16-bit values, performed as an int4 load
-  const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]);
-  const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4);
+  // Load the NoPE element for this thread into registers
+  const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
+  const scalar_t src_val = kv_c[src_idx];

-  // Max absolute value of this thread's elements
-  float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])),
-                              fmaxf(fabsf(vals[2]), fabsf(vals[3]))),
-                        fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])),
-                              fmaxf(fabsf(vals[6]), fabsf(vals[7]))));
-
-  // Warp-level reduction to find the max absolute value in each half-warp
+  // Warp-level reduction to find the max absolute value in the warp
+  float max_abs = fabsf(src_val);
 #pragma unroll
-  for (int offset = 8; offset > 0; offset /= 2) {
-    max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16));
+  for (int offset = 16; offset > 0; offset /= 2) {
+#ifdef USE_ROCM
+    max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
+#else
+    max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
+#endif
   }

-  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
+  // The first lane of each warp in each tile writes the max_abs of this part
+  // of the tile to shared memory
+  if (lane_idx == 0) {
+    shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
+  }
+  __syncthreads();

-  // The first lane of each half-warp writes the scale to kv_cache
-  if ((lane_idx == 0) || (lane_idx == 16)) {
+  // The first lane of the first warp in each tile computes the scale for the
+  // tile and writes it to shared memory and to kv_cache
+  if (warp_idx == 0 && lane_idx == 0) {
+    float4 shard_abs_max_vec =
+        reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
+    float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
+                             fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
+                       448.f;
+
+    // Avoid division by zero in `scaled_convert`
+    tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
     float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
     const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
-    kv_cache_32bit[dst_idx] = tile_scale;
+    kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
   }

-  // Now all threads in the block scale and write their elements
-  // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes)
-  const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8);
-
-  uint8_t result[8];
-#pragma unroll
-  for (int i = 0; i < 8; i++) {
-    result[i] =
-        fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
-            vals[i], tile_scale);
-  }
-
-  // Store as aligned 64-bit writes
-  *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) =
-      *reinterpret_cast<const uint64_t*>(result);
+  __syncthreads();
+
+  // Now all threads in the block scale and write their element
+  const float scale_val = tile_scales[tile_idx];
+  const int64_t dst_idx = dst_idx_start + threadIdx.x;
+  kv_cache[dst_idx] =
+      fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
+          src_val, scale_val);
 }

 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
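Both versions of the kernel derive one FP8 scale per 128-element NoPE tile from a max-absolute-value reduction. A host-side sketch of just that scale arithmetic, assuming the E4M3 maximum of 448.f used by the kernel (this is an illustration, not vLLM's fp8::scaled_convert):

#include <cfloat>
#include <cmath>

// Per-tile scale = max|x| / 448, clamped so the later element / scale
// division inside scaled_convert can never divide by zero.
float compute_tile_scale(const float* tile, int n) {
  float max_abs = 0.f;
  for (int i = 0; i < n; ++i) {
    max_abs = fmaxf(max_abs, fabsf(tile[i]));
  }
  return fmaxf(max_abs / 448.f, FLT_MIN);
}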
@@ -607,10 +606,9 @@ void reshape_and_cache(

   int key_stride = key.stride(0);
   int value_stride = value.stride(0);
-  int head_div_x = head_size / x;

   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * head_div_x, 512));
+  dim3 block(std::min(num_heads * head_size, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

@@ -743,12 +741,13 @@ void concat_and_cache_mla(

   if (kv_cache_dtype == "fp8_ds_mla") {
     dim3 grid(num_tokens);
-    // For the NoPE part, each tile of 128 elements is handled by half of one
-    // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads).
-    // Lanes 0 and 16 of each warp write the scale values for that warp's tiles.
-    // The RoPE part (last 64 elements) is handled by another 1 warp (32
-    // threads). So in total, we use 3 warps (96 threads) per block.
-    dim3 block(96);
+    // For the NoPE part, each tile of 128 elements is handled by 4 warps
+    // (128 threads). There are 4 total tiles, so 16 warps (512 threads).
+    // The first thread of the first warp in each tile writes the scale
+    // value for the tile. The RoPE part (last 64 elements) is handled
+    // by another 2 warps (64 threads).
+    // So in total, we use 18 warps (576 threads) per block.
+    dim3 block(576);
     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
                                CALL_CONCAT_AND_CACHE_DS_MLA);
   } else {
@@ -1,16 +0,0 @@
-#pragma once
-
-#include <cstdlib>
-#include <string>
-#include <cctype>
-
-namespace vllm {
-
-// vllm_kernel_override_batch_invariant(); returns true
-// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
-inline bool vllm_kernel_override_batch_invariant() {
-  std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
-  const char* val = std::getenv(env_key.c_str());
-  return (val && std::atoi(val) != 0) ? 1 : 0;
-}
-
-}  // namespace vllm
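The removed header above is a plain environment-variable probe; the later layernorm and MoE hunks in this compare show how launchers consumed it. A hedged sketch of that gating pattern, with illustrative function names:

#include <cstdlib>

// True when VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1 is set in the environment.
inline bool batch_invariant_requested() {
  const char* val = std::getenv("VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT");
  return val != nullptr && std::atoi(val) != 0;
}

// Mirrors the launcher pattern further down: skip the vectorized fast path
// whenever batch-invariant numerics are requested.
inline bool use_vectorized_launch(bool ptrs_aligned, int hidden_size) {
  return ptrs_aligned && hidden_size % 8 == 0 && !batch_invariant_requested();
}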
@@ -137,8 +137,9 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
 }

 void DNNLMatMulPrimitiveHandler::prepack_weight(
-    void* original_b_ptr, dnnl::memory::desc original_b_md,
-    dnnl::memory::desc b_target_mem_desc) {
+    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
+  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+                                   {b_k_stride_, b_n_stride_});
   dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
   dnnl::memory packed_weight(b_target_mem_desc, default_engine());
   {
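The rest of prepack_weight is not shown in this hunk, but a prepack step like this typically amounts to a one-time oneDNN reorder from the caller's row-major layout into the blocked layout the matmul primitive requested. A hedged sketch using a local engine and stream rather than the handler's members:

#include <oneapi/dnnl/dnnl.hpp>

// Illustrative prepack: reorder `src` (user layout) into `packed_md`
// (the primitive's preferred layout) once, up front.
dnnl::memory prepack_weight_sketch(dnnl::memory& src,
                                   const dnnl::memory::desc& packed_md,
                                   dnnl::engine& eng, dnnl::stream& strm) {
  dnnl::memory packed(packed_md, eng);
  dnnl::reorder(src, packed).execute(strm, src, packed);
  strm.wait();
  return packed;
}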
@@ -249,9 +250,7 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
   if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
     assert(!use_azp_);
   };
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                  create_primitive_desc(
                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                    .use_bias = false,
@@ -413,25 +412,12 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
   assert(ab_type_ == dnnl::memory::data_type::f32 ||
          ab_type_ == dnnl::memory::data_type::bf16 ||
          ab_type_ == dnnl::memory::data_type::f16);
-  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
-                                   {b_k_stride_, b_n_stride_});
-
-  prepack_weight(args.b_ptr, original_b_md,
+  prepack_weight(args.b_ptr,
                  create_primitive_desc(
-                     MSizeCacheKey{
-#ifdef VLLM_USE_ACL
-                         // Arm Compute Library (ACL) backend for oneDNN does
-                         // not support runtime
-                         // dimensions, so we set M to a default value
-                         .a_m_size = 128,
-                         .a_m_stride = b_k_size_,
-#else
-                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
-                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
-#endif
-                         .use_bias = false,
-                         .bias_type = dnnl::memory::data_type::undef},
+                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+                                   .use_bias = false,
+                                   .bias_type = dnnl::memory::data_type::undef},
                      true)
                      .weights_desc());
   init_runtime_memory_cache(args);
@@ -457,30 +443,12 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
   c_storage->set_data_handle((void*)args.c_ptr);
   c_mem_desc->dims[0] = args.a_m_size;

-#ifndef VLLM_USE_ACL
-  // We do not support in ACL backend of oneDNN, we handle bias by:
-  // 1. copying it into the result tensor
-  // 2. attaching a fused-sum post-op to the matmul primitive
   if (args.use_bias) {
     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
     bias_storage->set_data_handle((void*)args.bias_ptr);
   }
-#endif
-  dnnl::matmul matmul = get_matmul_cache(args);

-  // With ACL backend of oneDNN, the required memory format might change when the
-  // source tensor dims change. This does not really happen in practice, so isn't
-  // a performance hit, but we need to support it because the API allows for it.
-#ifdef VLLM_USE_ACL
-  auto new_expected_wei_desc =
-      dnnl::matmul::primitive_desc(
-          const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc()))
-          .weights_desc();
-  if (new_expected_wei_desc != b_target_mem_desc_) {
-    prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(),
-                   b_target_mem_desc_, new_expected_wei_desc);
-  }
-#endif
+  dnnl::matmul matmul = get_matmul_cache(args);

   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
   scratchpad_storage->set_data_handle(
@@ -516,13 +484,7 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
   } else {
     a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                               {key.a_m_stride, 1});
-#ifdef VLLM_USE_ACL
-    // ACL's backend of oneDNN always expects the weight format to be "any"
-    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
-                              dnnl::memory::format_tag::any);
-#else
     b_md = b_target_mem_desc_;
-#endif
   }
   dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
                           dnnl::memory::format_tag::ab);
@@ -532,18 +494,8 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(

   if (key.use_bias) {
     dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
-    // Since ACL's matmuls don't support passing a bias_md, we apply the bias
-    // through a fused-sum post-op
-#ifdef VLLM_USE_ACL
-    dnnl::post_ops post_ops;
-    post_ops.append_sum();
-    attr.set_post_ops(post_ops);
-    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
-                                        attr);
-#else
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
                                         c_md, attr);
-#endif
   } else {
     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
                                         attr);
@@ -559,23 +511,13 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
                    default_engine(), nullptr);
   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());

-  // ACL matmuls don't support bias_md, so we don't need these
-#ifndef VLLM_USE_ACL
   memory_cache_[DNNL_ARG_BIAS] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
-#endif
   memory_cache_[DNNL_ARG_SCRATCHPAD] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
-
-bool is_onednn_acl_supported() {
-#ifdef VLLM_USE_ACL
-  return true;
-#else
-  return false;
-#endif
-}
@@ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler {
  protected:
  DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);

-  void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md,
+  void prepack_weight(void* original_b_ptr,
                       dnnl::memory::desc b_target_mem_desc);

   void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
@@ -527,42 +527,21 @@ void onednn_mm(torch::Tensor& c,  // [M, OC], row-major
   MatMulPrimitiveHandler* ptr =
       reinterpret_cast<MatMulPrimitiveHandler*>(handler);

-  // ACL matmuls expect contiguous source tensors
-#ifdef VLLM_USE_ACL
-  torch::Tensor a_contig = a.contiguous();
-#endif
-
   MatMulPrimitiveHandler::ExecArgs exec_args;

-#ifdef VLLM_USE_ACL
-  exec_args.a_m_size = a_contig.size(0);
-  exec_args.a_m_stride = a_contig.stride(0);
-#else
   exec_args.a_m_size = a.size(0);
   exec_args.a_m_stride = a.stride(0);
-#endif
   VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
     if (bias.has_value()) {
       exec_args.use_bias = true;
       exec_args.bias_type = get_dnnl_type<scalar_t>();
-#ifdef VLLM_USE_ACL
-      // ACL matmuls in oneDNN do not support a bias.
-      // We handle a matmul with bias by doing: c = bias; c += matmul(a, b)
-      c.copy_(bias.value());
-#else
       exec_args.bias_ptr = bias->data_ptr<scalar_t>();
-#endif
     } else {
       exec_args.use_bias = false;
       exec_args.bias_type = get_dnnl_type<void>();
       exec_args.bias_ptr = nullptr;
     }
-#ifdef VLLM_USE_ACL
-    exec_args.a_ptr = a_contig.data_ptr<scalar_t>();
-#else
     exec_args.a_ptr = a.data_ptr<scalar_t>();
-#endif
     exec_args.c_ptr = c.data_ptr<scalar_t>();

     ptr->execute(exec_args);
@@ -27,8 +27,6 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
                const std::optional<torch::Tensor>& bias, int64_t handler);

-bool is_onednn_acl_supported();
-
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                         torch::Tensor& kv_cache, double scale,
                         torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -183,9 +181,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "int handler) -> ()");
   ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

-  // Check if oneDNN was built with ACL backend
-  ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported);
-
   // Create oneDNN W8A8 handler
   ops.def(
       "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
@@ -27,7 +27,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
     **{
         VLLMDataType.u4b8: "u4b8",
         VLLMDataType.u8b128: "u8b128",
-    },
+    }
 }

 VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@@ -35,7 +35,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     **{
         VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
         VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    },
+    }
 }

 VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
@@ -43,7 +43,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
     **{
        VLLMDataType.u4b8: 4,
        VLLMDataType.u8b128: 8,
-    },
+    }
 }

 VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
@@ -67,13 +67,15 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: dict[
-    Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
-    **KernelScheduleTag,  # type: ignore
-    **{
-        MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501
-        MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501
-    },
-}
+VLLMKernelScheduleTag: dict[Union[
+    MixedInputKernelScheduleType, KernelScheduleType], str] = {
+        **KernelScheduleTag,  # type: ignore
+        **{
+            MixedInputKernelScheduleType.TmaWarpSpecialized:
+                "cutlass::gemm::KernelTmaWarpSpecialized",
+            MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
+                "cutlass::gemm::KernelTmaWarpSpecializedPingpong",
+            MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
+                "cutlass::gemm::KernelTmaWarpSpecializedCooperative",
+        }
+    }
@@ -8,37 +8,11 @@
   #define VLLM_LAUNCH_BLOCKS_CAP 4
 #endif

-// Compile-time estimate of max threads per SM for launch bounds.
-// Families: 1024, 1536, 2048 threads/SM.
+// compile-time estimate of max threads per SM for launch bounds.
 #ifndef VLLM_MAX_THREADS_PER_SM
-  #ifdef __CUDA_ARCH__
-
-    /* 1024 thr/SM: Turing (sm_75) */
-    #if (__CUDA_ARCH__ == 750)
-      #define VLLM_MAX_THREADS_PER_SM 1024
-
-    /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
-       GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
-    #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
-        (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
-        (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
-        (__CUDA_ARCH__ == 1210)
-      #define VLLM_MAX_THREADS_PER_SM 1536
-
-    /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
-       Hopper (sm_90), Blackwell (sm_100/103) */
-    #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
-        (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
-        (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
-      #define VLLM_MAX_THREADS_PER_SM 2048
-
-    /* Fallback: use 2048 for unknown future CCs */
-    #else
-      #define VLLM_MAX_THREADS_PER_SM 2048
-    #endif
-
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
+    #define VLLM_MAX_THREADS_PER_SM 1536
   #else
-    /* Host pass (no __CUDA_ARCH__): neutral default */
     #define VLLM_MAX_THREADS_PER_SM 2048
   #endif
 #endif
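Caps like VLLM_MAX_THREADS_PER_SM and VLLM_LAUNCH_BLOCKS_CAP exist to bound the second argument of __launch_bounds__. A sketch of that kind of use, with an illustrative kernel and block size (the names kNumThreads and my_kernel are not from this header):

constexpr int kNumThreads = 256;
constexpr int kBlocksPerSM =
    (VLLM_MAX_THREADS_PER_SM / kNumThreads) < VLLM_LAUNCH_BLOCKS_CAP
        ? (VLLM_MAX_THREADS_PER_SM / kNumThreads)
        : VLLM_LAUNCH_BLOCKS_CAP;

// Hint the compiler that at most kNumThreads run per block and that we want
// at least kBlocksPerSM resident blocks per SM.
__global__ void __launch_bounds__(kNumThreads, kBlocksPerSM)
    my_kernel(const float* in, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  out[i] = in[i];
}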
@@ -1,7 +1,6 @@
 #include "type_convert.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -414,9 +413,7 @@ void fused_add_rms_norm(torch::Tensor& input,  // [..., hidden_size]
       wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
@@ -462,8 +459,7 @@ void poly_norm(torch::Tensor& out,  // [..., hidden_size]
   auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
   auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
   bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
     LAUNCH_FUSED_POLY_NORM(8);
   } else {
     LAUNCH_FUSED_POLY_NORM(0);
@@ -9,7 +9,6 @@
 #include "quantization/fp8/common.cuh"
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
-#include "core/batch_invariant.hpp"

 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -241,9 +240,7 @@ void fused_add_rms_norm_static_fp8_quant(
   auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
   bool ptrs_are_aligned =
       inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
-  bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
-      !batch_invariant_launch) {
+  if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
     LAUNCH_FUSED_ADD_RMS_NORM(8);
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
@@ -17,30 +17,25 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()

-TEMPLATE = (
-    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
-    "{{s_type_id}}, "
-    "{{threads}}, "
-    "{{thread_m_blocks}}, "
-    "{{thread_n_blocks}}, "
-    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{stages}}, "
-    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
-    "( MARLIN_KERNEL_PARAMS );"
-)
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{s_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
+    "vllm::kFE2M1f"
 ]
 THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]

@@ -63,12 +58,11 @@ def generate_new_kernels():
     all_template_str_list = []

     for group_blocks, m_blocks, thread_configs in itertools.product(
-        GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-    ):
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
         # act order case only support gptq-int4 and gptq-int8
         if group_blocks == 0 and scalar_type not in [
-            "vllm::kU4B8",
-            "vllm::kU8B128",
+                "vllm::kU4B8", "vllm::kU8B128"
         ]:
             continue
         if thread_configs[2] == 256:
@@ -21,7 +21,6 @@
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"
-#include "../core/batch_invariant.hpp"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
   using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
   static constexpr int VPT = Constants::VPT;
   static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
-  const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
-  const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
   const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

   dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
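On the "-" side, the launcher pins num_warps to a constant when the batch-invariant override is on, so the launch geometry (and therefore the order of the per-warp work) stops depending on num_rows. A compact sketch of that decision, with illustrative parameters:

// warps_per_tb and the fixed warp count of 32 are illustrative values.
int num_blocks_for_launch(int num_rows, int rows_per_warp, int warps_per_tb,
                          bool batch_invariant) {
  const int num_warps =
      batch_invariant ? 32 : (num_rows + rows_per_warp - 1) / rows_per_warp;
  return (num_warps + warps_per_tb - 1) / warps_per_tb;
}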
@@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
   } else {
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
+        Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
         out, a, b, a_scales, b_scales);
   }
@@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
     // TMA epilogue isn't compatible with Swap A/B
     cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
         OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
-        Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
+        Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
         cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
         out, a, b, a_scales, b_scales);
   }
 }

 }  // namespace vllm
@@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
       int8_func(c, a, b, a_scales, b_scales, bias);
     } else {
-      int32_t version_num = get_sm_version_num();
-      TORCH_CHECK(
-          false, "Int8 not supported on SM", version_num,
-          ". Use FP8 quantization instead, or run on older arch (SM < 100).");
+      TORCH_CHECK(false, "Int8 not supported for this architecture");
     }
   }
   } else {
@@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
   }
 }

 }  // namespace vllm
@@ -67,9 +67,8 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
                              std::optional<torch::Tensor> const& bias);
 #endif

 #if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
-    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
-    defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
+    defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
 void get_cutlass_moe_mm_data_caller(
     const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -254,7 +253,7 @@ void cutlass_moe_mm(
     bool per_act_token, bool per_out_ch) {
   int32_t version_num = get_sm_version_num();
 #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100
-  if (version_num >= 100 && version_num < 110) {
+  if (version_num >= 100) {
     cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                          expert_offsets, problem_sizes, a_strides, b_strides,
                          c_strides, per_act_token, per_out_ch);
@@ -262,7 +261,7 @@ void cutlass_moe_mm(
   }
 #endif
 #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
-  if (version_num >= 90 && version_num < 100) {
+  if (version_num >= 90) {
     cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
                         expert_offsets, problem_sizes, a_strides, b_strides,
                         c_strides, per_act_token, per_out_ch);
@@ -14,8 +14,6 @@
  * limitations under the License.
  */

-#include "core/registration.h"
-
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

@@ -420,7 +418,3 @@ void cutlass_fp4_group_mm(
       "12.8 or above.");
 #endif
 }
-
-TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("cutlass_fp4_group_mm", &cutlass_fp4_group_mm);
-}
@@ -17,32 +17,28 @@ FILE_HEAD = """
 namespace MARLIN_NAMESPACE_NAME {
 """.strip()

-TEMPLATE = (
-    "template __global__ void Marlin<"
-    "{{scalar_t}}, "
-    "{{w_type_id}}, "
-    "{{s_type_id}}, "
-    "{{threads}}, "
-    "{{thread_m_blocks}}, "
-    "{{thread_n_blocks}}, "
-    "{{thread_k_blocks}}, "
-    "{{'true' if m_block_size_8 else 'false'}}, "
-    "{{stages}}, "
-    "{{group_blocks}}, "
-    "{{'true' if is_zp_float else 'false'}}>"
-    "( MARLIN_KERNEL_PARAMS );"
-)
+TEMPLATE = ("template __global__ void Marlin<"
+            "{{scalar_t}}, "
+            "{{w_type_id}}, "
+            "{{s_type_id}}, "
+            "{{threads}}, "
+            "{{thread_m_blocks}}, "
+            "{{thread_n_blocks}}, "
+            "{{thread_k_blocks}}, "
+            "{{'true' if m_block_size_8 else 'false'}}, "
+            "{{stages}}, "
+            "{{group_blocks}}, "
+            "{{'true' if is_zp_float else 'false'}}>"
+            "( MARLIN_KERNEL_PARAMS );")

 # int8 with zero point case (vllm::kU8) is also supported,
 # we don't add it to reduce wheel size.
 SCALAR_TYPES = [
-    "vllm::kU4",
-    "vllm::kU4B8",
-    "vllm::kU8B128",
-    "vllm::kFE4M3fn",
-    "vllm::kFE2M1f",
+    "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn",
+    "vllm::kFE2M1f"
 ]
-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128),
+                  (128, 64, 128)]

 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
 # group_blocks:
@@ -63,12 +59,11 @@ def generate_new_kernels():
     all_template_str_list = []

     for group_blocks, m_blocks, thread_configs in itertools.product(
-        GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
-    ):
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
         # act order case only support gptq-int4 and gptq-int8
         if group_blocks == 0 and scalar_type not in [
-            "vllm::kU4B8",
-            "vllm::kU8B128",
+                "vllm::kU4B8", "vllm::kU8B128"
         ]:
             continue
         if thread_configs[2] == 256:
@@ -98,7 +93,8 @@ def generate_new_kernels():
             c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"

             is_zp_float_list = [False]
-            if dtype == "fp16" and scalar_type == "vllm::kU4" and group_blocks == 4:
+            if dtype == "fp16" and scalar_type == "vllm::kU4" and \
+                    group_blocks == 4:
                 # HQQ (is_zp_float = true) only supports
                 # 4bit quantization and fp16
                 is_zp_float_list.append(True)
@@ -12,21 +12,20 @@ from functools import reduce
 from typing import Optional, Union

 import jinja2
-from vllm_cutlass_library_extension import (
-    DataType,
-    EpilogueScheduleTag,
-    EpilogueScheduleType,
-    MixedInputKernelScheduleType,
-    TileSchedulerTag,
-    TileSchedulerType,
-    VLLMDataType,
-    VLLMDataTypeNames,
-    VLLMDataTypeSize,
-    VLLMDataTypeTag,
-    VLLMDataTypeTorchDataTypeTag,
-    VLLMDataTypeVLLMScalarTypeTag,
-    VLLMKernelScheduleTag,
-)
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm_cutlass_library_extension import (DataType, EpilogueScheduleTag,
+                                            EpilogueScheduleType,
+                                            MixedInputKernelScheduleType,
+                                            TileSchedulerTag,
+                                            TileSchedulerType, VLLMDataType,
+                                            VLLMDataTypeNames,
+                                            VLLMDataTypeSize, VLLMDataTypeTag,
+                                            VLLMDataTypeTorchDataTypeTag,
+                                            VLLMDataTypeVLLMScalarTypeTag,
+                                            VLLMKernelScheduleTag)
+# yapf: enable

 #
 # Generator templating
@@ -287,23 +286,18 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
     tile_shape = (
         f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}"
     )
-    cluster_shape = (
-        f"{schedule_config.cluster_shape_mnk[0]}"
-        + f"x{schedule_config.cluster_shape_mnk[1]}"
-        + f"x{schedule_config.cluster_shape_mnk[2]}"
-    )
-    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule].split(
-        "::"
-    )[-1]
-    epilogue_schedule = EpilogueScheduleTag[schedule_config.epilogue_schedule].split(
-        "::"
-    )[-1]
-    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler].split("::")[-1]
+    cluster_shape = (f"{schedule_config.cluster_shape_mnk[0]}" +
+                     f"x{schedule_config.cluster_shape_mnk[1]}" +
+                     f"x{schedule_config.cluster_shape_mnk[2]}")
+    kernel_schedule = VLLMKernelScheduleTag[schedule_config.kernel_schedule]\
+        .split("::")[-1]
+    epilogue_schedule = EpilogueScheduleTag[
+        schedule_config.epilogue_schedule].split("::")[-1]
+    tile_scheduler = TileSchedulerTag[schedule_config.tile_scheduler]\
+        .split("::")[-1]

-    return (
-        f"{tile_shape}_{cluster_shape}_{kernel_schedule}"
-        + f"_{epilogue_schedule}_{tile_scheduler}"
-    )
+    return (f"{tile_shape}_{cluster_shape}_{kernel_schedule}" +
+            f"_{epilogue_schedule}_{tile_scheduler}")


 # mostly unique shorter sch_sig
@@ -322,24 +316,18 @@ def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:

 # unique type_name
 def generate_type_signature(kernel_types: TypeConfig):
-    return str(
-        "".join(
-            [
-                VLLMDataTypeNames[getattr(kernel_types, field.name)]
-                for field in fields(TypeConfig)
-            ]
-        )
-    )
+    return str("".join([
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
+        for field in fields(TypeConfig)
+    ]))


 def generate_type_option_name(kernel_types: TypeConfig):
-    return ", ".join(
-        [
-            f"{field.name.replace('b_', 'with_') + '_type'}="
-            + VLLMDataTypeNames[getattr(kernel_types, field.name)]
-            for field in fields(TypeConfig)
-        ]
-    )
+    return ", ".join([
+        f"{field.name.replace('b_', 'with_')+'_type'}=" +
+        VLLMDataTypeNames[getattr(kernel_types, field.name)]
+        for field in fields(TypeConfig)
+    ])


 def is_power_of_two(n):
@@ -347,6 +335,7 @@ def is_power_of_two(n):


 def to_cute_constant(value: list[int]):

     def _to_cute_constant(value: int):
         if is_power_of_two(value):
             return f"_{value}"
@@ -361,11 +350,11 @@ def to_cute_constant(value: list[int]):

 def unique_schedules(impl_configs: list[ImplConfig]):
     # Use dict over set for deterministic ordering
-    return list(
-        {
-            sch: None for impl_config in impl_configs for sch in impl_config.schedules
-        }.keys()
-    )
+    return list({
+        sch: None
+        for impl_config in impl_configs
+        for sch in impl_config.schedules
+    }.keys())


 def unsigned_type_with_bitwidth(num_bits):
@@ -391,7 +380,7 @@ template_globals = {
     "gen_type_sig": generate_type_signature,
     "unique_schedules": unique_schedules,
     "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth,
-    "gen_type_option_name": generate_type_option_name,
+    "gen_type_option_name": generate_type_option_name
 }

@@ -409,28 +398,23 @@ prepack_dispatch_template = create_template(PREPACK_TEMPLATE)
 def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
     sources = []

-    sources.append(
-        (
-            "machete_mm_dispatch",
-            mm_dispatch_template.render(impl_configs=impl_configs),
-        )
-    )
+    sources.append((
+        "machete_mm_dispatch",
+        mm_dispatch_template.render(impl_configs=impl_configs),
+    ))

     prepack_types = []
     for impl_config in impl_configs:
-        convert_type = (
-            impl_config.types.a
-            if impl_config.types.b_group_scale == DataType.void
-            else impl_config.types.b_group_scale
-        )
+        convert_type = impl_config.types.a \
+            if impl_config.types.b_group_scale == DataType.void \
+            else impl_config.types.b_group_scale
         prepack_types.append(
             PrepackTypeConfig(
                 a=impl_config.types.a,
                 b_num_bits=VLLMDataTypeSize[impl_config.types.b],
                 convert=convert_type,
                 accumulator=impl_config.types.accumulator,
-            )
-        )
+            ))

     def prepacked_type_key(prepack_type: PrepackTypeConfig):
         # For now, we can just use the first accumulator type seen since
@@ -446,14 +430,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
             unique_prepack_types.append(prepack_type)
             prepack_types_seen.add(key)

-    sources.append(
-        (
-            "machete_prepack",
-            prepack_dispatch_template.render(
-                types=unique_prepack_types,
-            ),
-        )
-    )
+    sources.append((
+        "machete_prepack",
+        prepack_dispatch_template.render(types=unique_prepack_types, ),
+    ))

     # Split up impls across files
     num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0)
@@ -486,12 +466,10 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
         curr_impl_in_file += len(files_impls[-1][-1].schedules)

     for part, file_impls in enumerate(files_impls):
-        sources.append(
-            (
-                f"machete_mm_impl_part{part + 1}",
-                mm_impl_template.render(impl_configs=file_impls),
-            )
-        )
+        sources.append((
+            f"machete_mm_impl_part{part+1}",
+            mm_impl_template.render(impl_configs=file_impls),
+        ))

     return sources

@@ -536,7 +514,8 @@ def generate():
     # For now we use the same heuristic for all types
     # Heuristic is currently tuned for H100s
     default_heuristic = [
-        (cond, ScheduleConfig(*tile_config, **sch_common_params))  # type: ignore
+        (cond, ScheduleConfig(*tile_config,
+                              **sch_common_params))  # type: ignore
         for cond, tile_config in default_tile_heuristic_config.items()
     ]

@@ -562,18 +541,14 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        )
-        for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for a in (DataType.f16, DataType.bf16)
-    )
+        ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
+        for a in (DataType.f16, DataType.bf16))

     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(
-            GPTQ_kernel_type_configs,
-            itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(default_heuristic),
-        )
+        for x in zip(GPTQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
     ]

     AWQ_kernel_type_configs = list(
@@ -586,18 +561,14 @@ def generate():
             a_token_scale=DataType.void,
             out=a,
             accumulator=DataType.f32,
-        )
-        for b in (DataType.u4, DataType.u8)
-        for a in (DataType.f16, DataType.bf16)
-    )
+        ) for b in (DataType.u4, DataType.u8)
+        for a in (DataType.f16, DataType.bf16))

     impl_configs += [
         ImplConfig(x[0], x[1], x[2])
-        for x in zip(
-            AWQ_kernel_type_configs,
-            itertools.repeat(get_unique_schedules(default_heuristic)),
-            itertools.repeat(default_heuristic),
-        )
+        for x in zip(AWQ_kernel_type_configs,
+                     itertools.repeat(get_unique_schedules(default_heuristic)),
+                     itertools.repeat(default_heuristic))
     ]

     # TODO: Support W4A8 when ready
@@ -40,8 +40,7 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
 #define __HIP__FP8MFMA__
 #endif

-#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
-                           defined(__gfx1150__) || defined(__gfx1151__))
+#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
 #define __HIP__GFX11__
 #endif

@ -397,7 +397,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
|
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
|
||||||
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
|
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
|
||||||
{stride_tag});
|
{stride_tag});
|
||||||
// conditionally compiled so impl registration is in source file
|
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
|
||||||
|
|
||||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
// quantization, as well as bias
|
// quantization, as well as bias
|
||||||
|
@ -381,28 +381,18 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
|||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
${FLASHINFER_GIT_REPO} flashinfer
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
|
||||||
else
|
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
|
||||||
pushd flashinfer
|
pushd flashinfer
|
||||||
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||||
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||||
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||||
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||||
# Download pre-compiled cubins
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
else
|
||||||
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
# CUDA 12.8+ supports 10.0a and 12.0
|
||||||
|
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||||
fi
|
fi
|
||||||
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
|
||||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
||||||
@ -446,7 +436,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
ARG DEEPGEMM_GIT_REF
|
ARG DEEPGEMM_GIT_REF
|
||||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||||
|
|
||||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||||
RUN set -eux; \
|
RUN set -eux; \
|
||||||
@ -464,12 +454,6 @@ ENV CUDA_HOME=/usr/local/cuda
|
|||||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
||||||
&& bash install_python_libraries.sh
|
&& bash install_python_libraries.sh
|
||||||
|
|
||||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
|
||||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
|
||||||
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
|
|
||||||
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
|
|
||||||
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
|
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
#################### TEST IMAGE ####################
|
#################### TEST IMAGE ####################
|
||||||
@ -542,7 +526,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
else \
|
else \
|
||||||
BITSANDBYTES_VERSION="0.46.1"; \
|
BITSANDBYTES_VERSION="0.46.1"; \
|
||||||
fi; \
|
fi; \
|
||||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
|
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
@ -555,5 +539,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
|||||||
|
|
||||||
FROM vllm-openai-base AS vllm-openai
|
FROM vllm-openai-base AS vllm-openai
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
#################### OPENAI API SERVER ####################
|
#################### OPENAI API SERVER ####################
|
||||||
|
@ -47,7 +47,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
|||||||
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
|
||||||
# Install Python dependencies
|
# Install Python dependencies
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||||
|
|
||||||
######################### TEST DEPS #########################
|
######################### TEST DEPS #########################
|
||||||
FROM base AS vllm-test-deps
|
FROM base AS vllm-test-deps
|
||||||
@ -117,7 +117,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
|||||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -r requirements/cpu-test.txt
|
uv pip install -r requirements/cpu-test.txt
|
||||||
|
|
||||||
######################### DEV IMAGE #########################
|
######################### DEV IMAGE #########################
|
||||||
FROM vllm-build AS vllm-dev
|
FROM vllm-build AS vllm-dev
|
||||||
@ -130,12 +130,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -e tests/vllm_test_utils
|
uv pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
--mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
||||||
|
|
||||||
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
|
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
|
||||||
|
|
||||||
@ -160,12 +160,11 @@ ADD ./benchmarks/ ./benchmarks/
|
|||||||
ADD ./vllm/collect_env.py .
|
ADD ./vllm/collect_env.py .
|
||||||
ADD ./.buildkite/ ./.buildkite/
|
ADD ./.buildkite/ ./.buildkite/
|
||||||
|
|
||||||
# Create symlink for vllm-workspace to maintain CI compatibility
|
|
||||||
RUN ln -sf /workspace /vllm-workspace
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install -e tests/vllm_test_utils
|
uv pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
ENTRYPOINT ["bash"]
|
||||||
|
|
||||||
######################### RELEASE IMAGE #########################
|
######################### RELEASE IMAGE #########################
|
||||||
FROM base AS vllm-openai
|
FROM base AS vllm-openai
|
||||||
@ -177,4 +176,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
||||||
uv pip install dist/*.whl
|
uv pip install dist/*.whl
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
@ -314,4 +314,4 @@ WORKDIR /workspace/
|
|||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
@ -15,7 +15,7 @@ FROM ${BASE_IMAGE} AS base
|
|||||||
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||||
ENV ROCM_PATH=/opt/rocm
|
ENV ROCM_PATH=/opt/rocm
|
||||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
|
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
|
||||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||||
|
|
||||||
@ -141,4 +141,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
|||||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
@ -309,4 +309,4 @@ USER 2000
|
|||||||
WORKDIR /home/vllm
|
WORKDIR /home/vllm
|
||||||
|
|
||||||
# Set the default entrypoint
|
# Set the default entrypoint
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
@ -69,4 +69,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
@ -1,2 +1,2 @@
|
|||||||
search:
|
search:
|
||||||
exclude: true
|
boost: 0.5
|
||||||
|
(9 image files removed; sizes: 627 KiB, 350 KiB, 814 KiB, 267 KiB, 354 KiB, 781 KiB, 51 KiB, 359 KiB, 82 KiB)
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
We host regular meetups in the San Francisco Bay Area every two months, where we share project updates from the vLLM team and invite guest speakers from industry to share their experience and insights. Please find the materials of our previous meetups below:
|
We host regular meetups in the San Francisco Bay Area every two months, where we share project updates from the vLLM team and invite guest speakers from industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||||
|
|
||||||
- [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
|
|
||||||
- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
|
- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
|
||||||
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
|
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
|
||||||
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
|
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
|
||||||
|
@ -53,7 +53,7 @@ llm = LLM(model="adept/fuyu-8b",
|
|||||||
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
|
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
CUDA graph capture increases GPU memory usage. Adjust capture sizes if you need to conserve memory.
|
CUDA graph capture takes up more memory in V1 than in V0.
|
||||||
|
|
||||||
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
||||||
|
|
||||||
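As a rough illustration of that trade-off, the sketch below limits CUDA graph capture to a few small batch sizes via `compilation_config`; the model name and capture sizes are example assumptions only, not tuned recommendations.

```python
# A minimal, hedged sketch using the offline LLM API: restrict CUDA graph
# capture to small batch sizes so less GPU memory is reserved for graphs.
from vllm import LLM

llm = LLM(
    model="adept/fuyu-8b",  # example model from this page
    compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8]},
)
```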
|
@ -33,7 +33,7 @@ In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as re
|
|||||||
|
|
||||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
||||||
|
|
||||||
In vLLM V1, **chunked prefill is always enabled by default** so that behavior is consistent across supported models.
|
In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
|
||||||
|
|
||||||
With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
|
With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
|
||||||
|
|
||||||
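To make the scheduling order above concrete, here is an illustrative, self-contained sketch of how a single step could spend the token budget; this is simplified pseudologic, not vLLM's actual scheduler code.

```python
# Illustrative only: decodes are batched first (one token each), then prefills
# consume the remaining budget, with the last prefill chunked if it doesn't fit.
def schedule_step(decode_reqs, prefill_reqs, max_num_batched_tokens):
    budget = max_num_batched_tokens
    scheduled = []
    for req in decode_reqs:              # decode requests are prioritized
        scheduled.append((req, 1))
        budget -= 1
    for req, remaining in prefill_reqs:  # then pending prefills, possibly chunked
        take = min(budget, remaining)
        if take <= 0:
            break
        scheduled.append((req, take))
        budget -= take
    return scheduled

# Example: 3 decodes plus prefills of 4000 and 3000 tokens under a 2048-token budget.
print(schedule_step(["d0", "d1", "d2"], [("p0", 4000), ("p1", 3000)], 2048))
```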
@ -49,7 +49,7 @@ You can tune the performance by adjusting `max_num_batched_tokens`:
|
|||||||
- Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes.
|
- Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes.
|
||||||
- Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch.
|
- Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch.
|
||||||
- For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs.
|
- For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs.
|
||||||
- If `max_num_batched_tokens` is the same as `max_model_len`, the scheduler behaves similarly to the legacy policy where large prefills ran without chunking (while still prioritizing decodes).
|
- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
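# (Hedged continuation of the truncated snippet above, not taken from the diff:
# the model name and the 16384 budget are example values only.)
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    max_num_batched_tokens=16384,  # a larger budget favors TTFT/throughput, at some ITL cost
)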
|
@ -661,7 +661,8 @@ Benchmark the performance of multi-modal requests in vLLM.
|
|||||||
Start vLLM:
|
Start vLLM:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||||
--dtype bfloat16 \
|
--dtype bfloat16 \
|
||||||
--limit-mm-per-prompt '{"image": 1}' \
|
--limit-mm-per-prompt '{"image": 1}' \
|
||||||
--allowed-local-media-path /path/to/sharegpt4v/images
|
--allowed-local-media-path /path/to/sharegpt4v/images
|
||||||
@ -687,7 +688,8 @@ vllm bench serve \
|
|||||||
Start vLLM:
|
Start vLLM:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
||||||
--dtype bfloat16 \
|
--dtype bfloat16 \
|
||||||
--limit-mm-per-prompt '{"video": 1}' \
|
--limit-mm-per-prompt '{"video": 1}' \
|
||||||
--allowed-local-media-path /path/to/sharegpt4video/videos
|
--allowed-local-media-path /path/to/sharegpt4video/videos
|
||||||
@ -821,30 +823,6 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo
|
|||||||
|
|
||||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||||
|
|
||||||
### Continuous Benchmarking
|
|
||||||
|
|
||||||
Continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify performance regressions or improvements.
|
|
||||||
|
|
||||||
#### How It Works
|
|
||||||
|
|
||||||
Continuous benchmarking is triggered via a [GitHub CI workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
|
|
||||||
|
|
||||||
- **Serving tests**: Measure request handling and API performance
|
|
||||||
- **Throughput tests**: Evaluate token generation rates
|
|
||||||
- **Latency tests**: Assess response time characteristics
|
|
||||||
|
|
||||||
#### Benchmark Configuration
|
|
||||||
|
|
||||||
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
|
|
||||||
|
|
||||||
1. Navigate to the appropriate GPU directory in the benchmarks configuration
|
|
||||||
2. Add your model specifications to the corresponding configuration files
|
|
||||||
3. The new models will be included in the next scheduled benchmark run
|
|
||||||
|
|
||||||
#### Viewing Results
|
|
||||||
|
|
||||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
|
||||||
|
|
||||||
[](){ #nightly-benchmarks }
|
[](){ #nightly-benchmarks }
|
||||||
|
|
||||||
## Nightly Benchmarks
|
## Nightly Benchmarks
|
||||||
|
@ -133,7 +133,8 @@ We consider 3 different scenarios:
|
|||||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||||
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||||
Please avoid reintroducing legacy cache managers such as `MambaCacheManager` or any previously removed code paths from older implementations.
|
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||||
|
V0-only classes and code will be removed in the very near future.
|
||||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
||||||
|
|
||||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||||
|
@ -66,12 +66,35 @@ Further update the model as follows:
|
|||||||
!!! important
|
!!! important
|
||||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
|
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
|
||||||
|
|
||||||
!!! note
|
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||||
By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in
|
|
||||||
[PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
|
|
||||||
This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings].
|
|
||||||
|
|
||||||
You may override this method if additional logic is required for your model when merging embeddings.
|
??? code
|
||||||
|
|
||||||
|
```python
|
||||||
|
from .utils import merge_multimodal_embeddings
|
||||||
|
|
||||||
|
class YourModelForImage2Seq(nn.Module):
|
||||||
|
...
|
||||||
|
|
||||||
|
def get_input_embeddings(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
|
||||||
|
# `get_input_embeddings` should already be implemented for the language
|
||||||
|
# model as one of the requirements of basic vLLM model implementation.
|
||||||
|
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||||
|
|
||||||
|
if multimodal_embeddings is not None:
|
||||||
|
inputs_embeds = merge_multimodal_embeddings(
|
||||||
|
input_ids=input_ids,
|
||||||
|
inputs_embeds=inputs_embeds,
|
||||||
|
multimodal_embeddings=multimodal_embeddings,
|
||||||
|
placeholder_token_id=self.config.image_token_index)
|
||||||
|
|
||||||
|
return inputs_embeds
|
||||||
|
```
|
||||||
|
|
||||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||||
|
|
||||||
@ -258,21 +281,17 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
|
|
||||||
) -> MultiModalDataDict:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
image_overrides = mm_options.get("image") if mm_options else None
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images,
|
num_images=num_images)
|
||||||
overrides=image_overrides)
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -442,20 +461,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
|
|
||||||
) -> MultiModalDataDict:
|
) -> MultiModalDataDict:
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
image_overrides = mm_options.get("image") if mm_options else None
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=target_width,
|
self._get_dummy_images(width=target_width,
|
||||||
height=target_height,
|
height=target_height,
|
||||||
num_images=num_images,
|
num_images=num_images)
|
||||||
overrides=image_overrides)
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -39,7 +39,8 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
|
||||||
vllm serve meta-llama/Meta-Llama-3-70B
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model meta-llama/Meta-Llama-3-70B
|
||||||
```
|
```
|
||||||
|
|
||||||
vllm bench command:
|
vllm bench command:
|
||||||
@ -159,22 +160,6 @@ GUI example:
|
|||||||
|
|
||||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||||
|
|
||||||
## Continuous Profiling
|
|
||||||
|
|
||||||
There is a [GitHub CI workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-profiling.yml) in the PyTorch infrastructure repository that provides continuous profiling for different models on vLLM. This automated profiling helps track performance characteristics over time and across different model configurations.
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
The workflow currently runs weekly profiling sessions for selected models, generating detailed performance traces that can be analyzed with different tools to identify performance regressions or optimization opportunities. It can also be triggered manually from the GitHub Actions UI.
|
|
||||||
|
|
||||||
### Adding New Models
|
|
||||||
|
|
||||||
To extend the continuous profiling to additional models, you can modify the [profiling-tests.json](https://github.com/pytorch/pytorch-integration-testing/blob/main/vllm-profiling/cuda/profiling-tests.json) configuration file in the PyTorch integration testing repository. Simply add your model specifications to this file to include them in the automated profiling runs.
|
|
||||||
|
|
||||||
### Viewing Profiling Results
|
|
||||||
|
|
||||||
The profiling traces generated by the continuous profiling workflow are publicly available on the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). Look for the **Profiling traces** table to access and download the traces for different models and runs.
|
|
||||||
|
|
||||||
## Profiling vLLM Python Code
|
## Profiling vLLM Python Code
|
||||||
|
|
||||||
The Python standard library includes
|
The Python standard library includes
|
||||||
@ -223,11 +208,3 @@ One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
|||||||
pip install snakeviz
|
pip install snakeviz
|
||||||
snakeviz expensive_function.prof
|
snakeviz expensive_function.prof
|
||||||
```
|
```
|
||||||
|
|
||||||
### Analyzing Garbage Collection Costs
|
|
||||||
|
|
||||||
Use the VLLM_GC_DEBUG environment variable to debug GC costs (see the sketch after this list).
|
|
||||||
|
|
||||||
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times
|
|
||||||
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
|
|
||||||
collected objects for each gc.collect
|
|
||||||
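The sketch below shows one way to turn this on from Python before the engine starts; because `VLLM_GC_DEBUG` is read from the environment, setting it in the shell works just as well, and the chosen value is only the example from the list above.

```python
# Minimal sketch: set the GC-debug flag before vLLM is imported/started.
import os

os.environ["VLLM_GC_DEBUG"] = '{"top_objects": 5}'  # log top 5 collected objects per gc.collect
# ...then launch the engine or `vllm serve` as usual.
```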
|
@ -19,7 +19,8 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
|
|||||||
1. Start the vLLM server with the supported chat completion model, e.g.
|
1. Start the vLLM server with the supported chat completion model, e.g.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve mistralai/Mistral-7B-Instruct-v0.2
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model mistralai/Mistral-7B-Instruct-v0.2
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Call it with AutoGen:
|
1. Call it with AutoGen:
|
||||||
|
@ -1,170 +0,0 @@
|
|||||||
# Hugging Face Inference Endpoints
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
Models compatible with vLLM can be deployed on Hugging Face Inference Endpoints, either starting from the [Hugging Face Hub](https://huggingface.co) or directly from the [Inference Endpoints](https://endpoints.huggingface.co/) interface. This allows you to serve models in a fully managed environment with GPU acceleration, auto-scaling, and monitoring, without managing the infrastructure manually.
|
|
||||||
|
|
||||||
For advanced details on vLLM integration and deployment options, see [Advanced Deployment Details](#advanced-deployment-details).
|
|
||||||
|
|
||||||
## Deployment Methods
|
|
||||||
|
|
||||||
- [**Method 1: Deploy from the Catalog.**](#method-1-deploy-from-the-catalog) One-click deploy models from the Hugging Face Hub with ready-made optimized configurations.
|
|
||||||
- [**Method 2: Guided Deployment (Transformers Models).**](#method-2-guided-deployment-transformers-models) Instantly deploy models tagged with `transformers` from the Hub UI using the **Deploy** button.
|
|
||||||
- [**Method 3: Manual Deployment (Advanced Models).**](#method-3-manual-deployment-advanced-models) For models that either use custom code with the `transformers` tag, or don’t run with standard `transformers` but are supported by vLLM. This method requires manual configuration.
|
|
||||||
|
|
||||||
### Method 1: Deploy from the Catalog
|
|
||||||
|
|
||||||
This is the easiest way to get started with vLLM on Hugging Face Inference Endpoints. You can browse a catalog of models with verified and optimized deployment configuration at [Inference Endpoints](https://endpoints.huggingface.co/catalog) to maximize performance.
|
|
||||||
|
|
||||||
1. Go to [Endpoints Catalog](https://endpoints.huggingface.co/catalog) and in the **Inference Server** options, select `vLLM`. This will display the current list of models with optimized, preconfigured options.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
1. Select the desired model and click **Create Endpoint**.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
1. Once the deployment is ready, you can use the endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console, remembering to append `/v1` as required.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# pip install openai
|
|
||||||
from openai import OpenAI
|
|
||||||
import os
|
|
||||||
|
|
||||||
client = OpenAI(
|
|
||||||
base_url = DEPLOYMENT_URL,
|
|
||||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
|
||||||
)
|
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
|
||||||
model = "HuggingFaceTB/SmolLM3-3B",
|
|
||||||
messages = [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "Give me a brief explanation of gravity in simple terms."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
stream = True
|
|
||||||
)
|
|
||||||
|
|
||||||
for message in chat_completion:
|
|
||||||
print(message.choices[0].delta.content, end = "")
|
|
||||||
```
|
|
||||||
|
|
||||||
!!! note
|
|
||||||
The catalog provides models optimized for vLLM, including GPU settings and inference engine configurations. You can monitor the endpoint and update the **container or its configuration** from the Inference Endpoints UI.
|
|
||||||
|
|
||||||
### Method 2: Guided Deployment (Transformers Models)
|
|
||||||
|
|
||||||
This method applies to models with the [`transformers` library tag](https://huggingface.co/models?library=transformers) in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.
|
|
||||||
|
|
||||||
1. Navigate to a model on [Hugging Face Hub](https://huggingface.co/models).
|
|
||||||
For this example we will use the [`ibm-granite/granite-docling-258M`](https://huggingface.co/ibm-granite/granite-docling-258M) model. You can verify that the model is compatible by checking the front matter in the [README](https://huggingface.co/ibm-granite/granite-docling-258M/blob/main/README.md), where the library is tagged as `library: transformers`.
|
|
||||||
|
|
||||||
2. Locate the **Deploy** button. The button appears for models tagged with `transformers` at the top right of the [model card](https://huggingface.co/ibm-granite/granite-docling-258M).
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
3. Click the **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
4. Select the Hardware (we choose AWS>GPU>T4 for the example) and Container Configuration. Choose `vLLM` as the container type and finalize the deployment by pressing **Create Endpoint**.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
5. Use the deployed endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console (remember to append `/v1` as needed). You can then use your endpoint programmatically or via the SDK.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# pip install openai
|
|
||||||
from openai import OpenAI
|
|
||||||
import os
|
|
||||||
|
|
||||||
client = OpenAI(
|
|
||||||
base_url = DEPLOYMENT_URL,
|
|
||||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
|
||||||
)
|
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
|
||||||
model = "ibm-granite/granite-docling-258M",
|
|
||||||
messages = [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "image_url",
|
|
||||||
"image_url": {
|
|
||||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "Convert this page to docling."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
stream = True
|
|
||||||
)
|
|
||||||
|
|
||||||
for message in chat_completion:
|
|
||||||
print(message.choices[0].delta.content, end = "")
|
|
||||||
```
|
|
||||||
|
|
||||||
!!! note
|
|
||||||
This method uses best-guess defaults. You may need to adjust the configuration to fit your specific requirements.
|
|
||||||
|
|
||||||
### Method 3: Manual Deployment (Advanced Models)
|
|
||||||
|
|
||||||
Some models require manual deployment because they:
|
|
||||||
|
|
||||||
- Use custom code with the `transformers` tag
|
|
||||||
- Don't run with standard `transformers` but are supported by `vLLM`
|
|
||||||
|
|
||||||
These models cannot be deployed using the **Deploy** button on the model card.
|
|
||||||
|
|
||||||
In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.ocr`](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).
|
|
||||||
|
|
||||||
1. Start a new deployment. Go to [Inference Endpoints](https://endpoints.huggingface.co/) and click `New`.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
2. Search the model in the Hub. In the dialog, switch to **Hub** and search for the desired model.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
3. Choose the infrastructure. On the configuration page, select the cloud provider and hardware from the available options.
|
|
||||||
For this demo, we choose AWS and L4 GPU. Adjust according to your hardware needs.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
4. Configure the container. Scroll to the **Container Configuration** and select `vLLM` as the container type.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
5. Create the endpoint. Click **Create Endpoint** to deploy the model.
|
|
||||||
|
|
||||||
Once the endpoint is ready, you can use it with the OpenAI Completion API, cURL, or other SDKs. Remember to append `/v1` to the deployment URL if needed.
|
|
||||||
|
|
||||||
!!! note
|
|
||||||
You can adjust the **container settings** (Container URI, Container Arguments) from the Inference Endpoints UI and press **Update Endpoint**. This redeploys the endpoint with the updated container configuration. Changes to the model itself require creating a new endpoint or redeploying with a different model. For example, for this demo, you may need to update the Container URI to the nightly image (`vllm/vllm-openai:nightly`) and add the `--trust-remote-code` flag in the container arguments.
|
|
||||||
|
|
||||||
## Advanced Deployment Details
|
|
||||||
|
|
||||||
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
|
||||||
|
|
||||||
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
|
|
||||||
|
|
||||||
The platform integrates seamlessly with the Hugging Face Hub, allowing you to deploy any vLLM- or `transformers`-compatible model, track usage, and update the inference engine directly. The vLLM engine comes preconfigured, enabling optimized inference and easy switching between models or engines without modifying your code. This setup simplifies production deployment: endpoints are ready in minutes, include monitoring and logging, and let you focus on serving models rather than maintaining infrastructure.
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
|
|
||||||
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
|
|
||||||
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
|
|
||||||
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
|
@ -20,7 +20,7 @@ To get started with Open WebUI using vLLM, follow these steps:
|
|||||||
For example:
|
For example:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
vllm serve <model> --host 0.0.0.0 --port 8000
|
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Start the Open WebUI Docker container:
|
3. Start the Open WebUI Docker container:
|
||||||
|
@ -32,7 +32,6 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
|
|||||||
ports: 8081 # Expose to internet traffic.
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
envs:
|
envs:
|
||||||
PYTHONUNBUFFERED: 1
|
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
|
|
||||||
@ -48,8 +47,9 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
|
|||||||
run: |
|
run: |
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
echo 'Starting vllm api server...'
|
echo 'Starting vllm api server...'
|
||||||
vllm serve $MODEL_NAME \
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
--port 8081 \
|
--port 8081 \
|
||||||
|
--model $MODEL_NAME \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
2>&1 | tee api_server.log &
|
2>&1 | tee api_server.log &
|
||||||
@ -131,7 +131,6 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
|
|||||||
ports: 8081 # Expose to internet traffic.
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
envs:
|
envs:
|
||||||
PYTHONUNBUFFERED: 1
|
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
|
|
||||||
@ -147,8 +146,9 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
|
|||||||
run: |
|
run: |
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
echo 'Starting vllm api server...'
|
echo 'Starting vllm api server...'
|
||||||
vllm serve $MODEL_NAME \
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
--port 8081 \
|
--port 8081 \
|
||||||
|
--model $MODEL_NAME \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
2>&1 | tee api_server.log
|
2>&1 | tee api_server.log
|
||||||
@ -243,7 +243,6 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
|
|||||||
ports: 8081 # Expose to internet traffic.
|
ports: 8081 # Expose to internet traffic.
|
||||||
|
|
||||||
envs:
|
envs:
|
||||||
PYTHONUNBUFFERED: 1
|
|
||||||
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
|
||||||
|
|
||||||
@ -259,8 +258,9 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
|
|||||||
run: |
|
run: |
|
||||||
conda activate vllm
|
conda activate vllm
|
||||||
echo 'Starting vllm api server...'
|
echo 'Starting vllm api server...'
|
||||||
vllm serve $MODEL_NAME \
|
python -u -m vllm.entrypoints.openai.api_server \
|
||||||
--port 8081 \
|
--port 8081 \
|
||||||
|
--model $MODEL_NAME \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
|
||||||
2>&1 | tee api_server.log
|
2>&1 | tee api_server.log
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
# KAITO
|
|
||||||
|
|
||||||
[KAITO](https://kaito-project.github.io/kaito/docs/) is a Kubernetes operator that supports deploying and serving LLMs with vLLM. It manages large models via container images, with built-in OpenAI-compatible inference, auto-provisioning of GPU nodes, and curated model presets.
|
|
||||||
|
|
||||||
Please refer to [quick start](https://kaito-project.github.io/kaito/docs/quick-start) for more details.
|
|
@ -55,7 +55,7 @@ sudo kubectl port-forward svc/vllm-router-service 30080:80
|
|||||||
And then you can send out a query to the OpenAI-compatible API to check the available models:
|
And then you can send out a query to the OpenAI-compatible API to check the available models:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -o- http://localhost:30080/v1/models
|
curl -o- http://localhost:30080/models
|
||||||
```
|
```
|
||||||
|
|
||||||
??? console "Output"
|
??? console "Output"
|
||||||
@ -78,7 +78,7 @@ curl -o- http://localhost:30080/v1/models
|
|||||||
To send an actual chat request, you can issue a curl request to the OpenAI `/completions` endpoint:
|
To send an actual chat request, you can issue a curl request to the OpenAI `/completions` endpoint:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X POST http://localhost:30080/v1/completions \
|
curl -X POST http://localhost:30080/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "facebook/opt-125m",
|
"model": "facebook/opt-125m",
|
||||||
|
@ -12,7 +12,6 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
|||||||
|
|
||||||
- [Helm](frameworks/helm.md)
|
- [Helm](frameworks/helm.md)
|
||||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||||
- [KAITO](integrations/kaito.md)
|
|
||||||
- [KServe](integrations/kserve.md)
|
- [KServe](integrations/kserve.md)
|
||||||
- [KubeRay](integrations/kuberay.md)
|
- [KubeRay](integrations/kuberay.md)
|
||||||
- [kubernetes-sigs/lws](frameworks/lws.md)
|
- [kubernetes-sigs/lws](frameworks/lws.md)
|
||||||
|
@ -69,11 +69,6 @@ Sometimes you may see the API server entrypoint used directly instead of via the
|
|||||||
python -m vllm.entrypoints.openai.api_server --model <model>
|
python -m vllm.entrypoints.openai.api_server --model <model>
|
||||||
```
|
```
|
||||||
|
|
||||||
!!! warning
|
|
||||||
|
|
||||||
`python -m vllm.entrypoints.openai.api_server` is deprecated
|
|
||||||
and may become unsupported in a future release.
|
|
||||||
|
|
||||||
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||||
|
|
||||||
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
|
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
|
||||||
|
@ -242,8 +242,30 @@ Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kern
|
|||||||
|
|
||||||
## FusedMoEPrepareAndFinalize Implementations
|
## FusedMoEPrepareAndFinalize Implementations
|
||||||
|
|
||||||
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-modular-all2all-backends) for a list of all the available modular prepare and finalize subclasses.
|
The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing,
|
||||||
|
|
||||||
|
| Implementation | Type | Comments |
|
||||||
|
| :--- | :--- | :--- |
|
||||||
|
| DeepEPHTPrepareAndFinalize | Contiguous / Non-Batched | Uses the DeepEP High-Throughput all2all kernels. |
|
||||||
|
| DeepEPLLPrepareAndFinalize | Batched | Uses the DeepEP Low-Latency all2all kernels. |
|
||||||
|
| PplxPrepareAndFinalize | Batched | Uses the Perplexity all2all kernels. |
|
||||||
|
| FlashInferCutlassMoEPrepareAndFinalize | Contiguous | |
|
||||||
|
| MoEPrepareAndFinalizeNoEP | Contiguous | This implementation is used when there is no EP. i.e. no all2all kernels are invoked. |
|
||||||
|
| BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. (Doesn’t use any all2all kernels. This is primarily used in unit testing) |
|
||||||
|
|
||||||
## FusedMoEPermuteExpertsUnpermute
|
## FusedMoEPermuteExpertsUnpermute
|
||||||
|
|
||||||
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-experts-kernels) for a list of all the available modular experts.
|
The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing,
|
||||||
|
|
||||||
|
| Implementation | Type | Comment |
|
||||||
|
| :--- | :--- | :--- |
|
||||||
|
| BatchedDeepGemmExperts | Batched | Uses the DeepGemm’s Masked Grouped Gemm kernels for the fused_moe operation. |
|
||||||
|
| BatchedTritonExperts | Batched | Uses a Triton Kernel for the Batched matmuls. |
|
||||||
|
| BatchedTritonOrDeepGemmExperts | Batched | Chooses either the `BatchedDeepGemmExperts` or `BatchedTritonExperts` based on environment settings. |
|
||||||
|
| DeepGemmExperts | Contiguous / Non-Batched | Uses DeepGemm’s Grouped Gemm kernels for fused_moe operation. |
|
||||||
|
| TritonExperts | Contiguous / Non-Batched | Uses a Triton Kernel for fused_moe matmuls. |
|
||||||
|
| TritonOrDeepGemmExperts | Contiguous / Non-Batched | Chooses either the `DeepGemmExperts` or `TritonExperts` based on fused_moe inputs. |
|
||||||
|
| CutlassExpertsFP8 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp8 matmuls. |
|
||||||
|
| CutlassExpertsFP4 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp4 matmuls. |
|
||||||
|
| FlashInferExperts | Contiguous | Uses fused_moe operation from FlashInfer |
|
||||||
|
| NaiveBatchedExperts | Batched | Reference Batched Experts implementation. Primarily used in unit tests. |
|
||||||
|
@ -1,12 +1,12 @@

# Metrics

vLLM exposes a rich set of metrics to support observability and capacity planning for the V1 engine.

## Objectives

- Provide comprehensive coverage of engine and request level metrics to aid production monitoring.
- Prioritize Prometheus integrations, as this is what we expect to be used in production environments.
- Offer logging support (i.e. printing metrics to the info log) for ad-hoc testing, debugging, development, and exploratory use cases.

## Background

@ -17,9 +17,9 @@ Metrics in vLLM can be categorized as follows:

The mental model is that server-level metrics help explain the values of request-level metrics.

### Metrics Overview

The following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix and are documented under [Inferencing and Serving -> Production Metrics](../usage/metrics.md):

- `vllm:num_requests_running` (Gauge)
- `vllm:num_requests_swapped` (Gauge)
@ -57,6 +57,8 @@ The following metrics are exposed via a Prometheus-compatible `/metrics` endpoin

- `vllm:spec_decode_num_draft_tokens_total` (Counter)
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)

### Grafana Dashboard

vLLM also provides [a reference example](../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
@ -84,7 +86,7 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b

Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.

During those migrations we briefly lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):

```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
http_request_duration_highr_seconds_count 201.0
http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
```
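The same check can be done from Python. The snippet below is a small illustration that assumes a server is already listening on localhost:8000 and simply prints the `vllm:`-prefixed samples:

```python
# Print only the vllm:-prefixed metric lines from a running server's /metrics endpoint.
# Assumes a vLLM OpenAI-compatible server is already listening on localhost:8000.
from urllib.request import urlopen

with urlopen("http://localhost:8000/metrics") as resp:
    text = resp.read().decode()

for line in text.splitlines():
    if line.startswith("vllm:"):
        print(line)
```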
### Multi-process Mode

In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.

### Built in Python/Process Metrics

The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:
@ -110,7 +116,22 @@ The following metrics are supported by default by `prometheus_client`, but they

- `process_open_fds`
- `process_max_fds`

This is relevant because if we move away from multiprocess mode we get these back. However, it's questionable how relevant these are if they don't aggregate these stats for all processes that make up a vLLM instance.
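For readers unfamiliar with `prometheus_client`'s multiprocess mode, the following is a generic sketch of the pattern (a shared directory plus a `MultiProcessCollector`), not vLLM's actual wiring; it also shows why the default Python/process collectors drop out in this mode.

```python
# Generic prometheus_client multiprocess-mode sketch (not vLLM's actual wiring).
import os

os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/prom_mp")
os.makedirs(os.environ["PROMETHEUS_MULTIPROC_DIR"], exist_ok=True)

# Import after the env var is set so metrics are written to the shared directory.
from prometheus_client import CollectorRegistry, Counter, generate_latest, multiprocess

# Metrics written by any process land in PROMETHEUS_MULTIPROC_DIR as mmap'd files.
requests_total = Counter("app_requests_total", "Requests handled")
requests_total.inc()

# The serving process aggregates them with a MultiProcessCollector; note that the
# built-in Python/process collectors (process_open_fds, etc.) are not included.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(generate_latest(registry).decode())
```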
### v0 PRs and Issues

For background, these are some of the relevant PRs which added the v0 metrics:

- <gh-pr:1890>
- <gh-pr:2316>
- <gh-pr:2730>
- <gh-pr:4464>
- <gh-pr:7279>

Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).

## v1 Design
@ -375,8 +396,9 @@ recent metric is used, but only from currently running processes.

This was added in <gh-pr:9477> and there is
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
If we revisit this design and deprecate the old metric, we should
coordinate with downstream users so they can migrate before the removal.

### Prefix Cache metrics
@ -469,7 +491,7 @@ if seq_group.is_finished():

This seems duplicative, and one of them should be removed. The latter
is used by the Grafana dashboard, so we should deprecate or remove the
former.

### Prefix Cache Hit Rate
@ -478,7 +500,7 @@ See above - we now expose 'queries' and 'hits' counters rather than a

### KV Cache Offloading

Two legacy metrics relate to a "swapped" preemption mode that is no
longer relevant in v1:

- `vllm:num_requests_swapped`
@ -489,7 +511,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU

memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.

Historically, [vLLM has long supported beam search](gh-issue:6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
sharing between requests, and copy-on-write to do branching. CPU
@ -502,7 +524,7 @@ and the part of the prompt that was evicted can be recomputed.

SequenceGroup was removed in V1, although a replacement will be
required for "parallel sampling" (`n>1`).
[Beam search was moved out of the core](gh-issue:8306). There was a
lot of complex code for a very uncommon feature.

In V1, with prefix caching being better (zero overhead) and therefore
@ -513,7 +535,7 @@ better.

### Parallel Sampling

Some legacy metrics are only relevant in the context of "parallel
sampling". This is where the `n` parameter in a request is used to
request multiple completions from the same prompt.
@ -532,7 +554,7 @@ also add these metrics.

### Speculative Decoding

Some legacy metrics are specific to "speculative decoding". This is where
we generate candidate tokens using a faster, approximate method or
model and then validate those tokens with the larger model.
@ -544,7 +566,7 @@ model and then validate those tokens with the larger model.

There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
speculative decoding to v1. Other techniques will follow. We should
revisit these metrics in this context.

!!! note
    We should probably expose acceptance rate as separate accepted
@ -617,7 +639,7 @@ metrics are often relatively straightforward to add:

   metrics are usually of very limited use unless they can be enabled
   by default and in production.
3. They have an impact on development and maintenance of the
   project. Every metric added over time has made this effort more
   time-consuming, and perhaps not all metrics justify this ongoing
   investment in their maintenance.
@ -628,7 +650,7 @@ performance and health. Tracing, on the other hand, tracks individual

requests as they move through different services and components. Both
fall under the more general heading of "Observability".

vLLM has support for OpenTelemetry tracing:

- Added by <gh-pr:4687>
- Configured with `--otlp-traces-endpoint` and `--collect-detailed-traces`
@ -641,11 +663,11 @@ OpenTelemetry has a

[Gen AI Working Group](https://github.com/open-telemetry/community/blob/main/projects/gen-ai.md).

Since metrics is a big enough topic on its own, we are going to tackle
the topic of tracing separately.

### OpenTelemetry Model Forward vs Execute Time

The current implementation exposes the following two metrics:

- `vllm:model_forward_time_milliseconds` (Histogram) - The time spent
  in the model forward pass when this request was in the batch.
@ -1,121 +0,0 @@

# Fused MoE Kernel features

The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels.

## Fused MoE Modular All2All backends

There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend.

The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support.

The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass; the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.

The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format; any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16.

Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step).

Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass; for non-modular kernels, it is up to the experts function to deal with this flag.

Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP.

<style>
td {
  padding: 0.5rem !important;
  white-space: nowrap;
}

th {
  padding: 0.5rem !important;
  min-width: 0 !important;
}
</style>

| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Sub-class |
|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|-----------|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
| flashinfer<sup>4</sup> | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
| MoEPrepareAndFinalizeNoEP<sup>5</sup> | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
| BatchedPrepareAndFinalize<sup>5</sup> | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |

!!! info "Table key"
    1. All types: mxfp4, nvfp4, int4, int8, fp8
    2. A,T quantization occurs after dispatch.
    3. All quantization happens after dispatch.
    4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency")
    5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally used for testing or adapting an expert subclass to the `fused_experts` API.
    6. This depends on the experts implementation.

---

- G - Grouped
- G(N) - Grouped w/block size N
- A - Per activation token
- T - Per tensor

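For example, a backend can be selected through this environment variable before the engine is created. The snippet below is a sketch only; it assumes a multi-GPU, EP-capable deployment, and the argument names and backend strings should be checked against your vLLM version.

```python
# Sketch: select an all2all backend for expert parallelism via VLLM_ALL2ALL_BACKEND.
# Assumes a multi-GPU, EP-capable setup; argument/backend names may differ by version.
import os

os.environ["VLLM_ALL2ALL_BACKEND"] = "deepep_low_latency"  # batched activation format

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # any MoE model; used here only as an example
    tensor_parallel_size=1,
    data_parallel_size=2,
    enable_expert_parallel=True,
)
print(llm.generate("Hello"))
```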
Modular kernels are supported by the following `FusedMoEMethodBase` classes.

- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

## Fused MoE Experts Kernels

There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.

Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.

Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.

Each experts kernel supports one or more activation functions, e.g. silu and gelu, that are applied to the intermediate results.

As with the backends, some experts support applying topk weights on the input activations. The entries in that column of this table only apply to the non-modular experts.

Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`.

To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats.

| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|--------|
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_moe_impl] |
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |

!!! info "Table key"
    1. All types: mxfp4, nvfp4, int4, int8, fp8
    2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params
    3. uint4, uint8, fp8, fp4
    4. This is a naive implementation of experts that supports batched format. Mainly used for testing.
    5. The `activation` parameter is ignored and SwiGlu is used by default instead.
    6. Only handled by or supported when used with modular kernels.

## Modular Kernel "families"

The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.

| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`,</br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |

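Conceptually, the pairing rules in this table amount to a compatibility lookup. The snippet below simply mirrors the table for illustration; it is not vLLM code.

```python
# Illustrative compatibility lookup mirroring the "families" table above (not vLLM code).
FAMILIES = {
    "deepep_high_throughput": {"DeepGemmExperts", "TritonExperts", "TritonOrDeepGemmExperts",
                               "CutlassExpertsFp8", "MarlinExperts"},
    "deepep_low_latency": {"BatchedDeepGemmExperts", "BatchedTritonExperts",
                           "BatchedTritonOrDeepGemmExperts", "CutlassBatchedExpertsFp8"},
    "pplx": {"BatchedDeepGemmExperts", "BatchedTritonExperts",
             "BatchedTritonOrDeepGemmExperts", "CutlassBatchedExpertsFp8"},
    "flashinfer": {"FlashInferExperts"},
}


def is_tested_pairing(backend: str, experts: str) -> bool:
    """Return True if the backend/experts combination appears in the table above."""
    return experts in FAMILIES.get(backend, set())


print(is_tested_pairing("deepep_low_latency", "BatchedTritonExperts"))  # True
```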
@ -60,6 +60,30 @@ Multiple vLLM dependencies indicate either a preference or requirement for using

It is perhaps more accurate to say that there are known problems with using
`fork` after initializing these dependencies.

## Current State (v0)

The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control which method is used by vLLM. The current default is `fork`.

- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
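As a quick illustration of this knob (a sketch; whether `spawn` is appropriate depends on your deployment and vLLM version):

```python
# Force vLLM's worker processes to use 'spawn' instead of the default 'fork'.
# Must be set before vLLM creates any worker processes.
import os

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM  # imported after setting the env var

llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
print(llm.generate("Hello")[0].outputs[0].text)
```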
When we know we own the process because the `vllm` command was used, we use
`spawn` because it's the most widely compatible.

- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>

The `multiproc_xpu_executor` forces the use of `spawn`.

- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/executor/multiproc_xpu_executor.py#L14-L18>

There are other miscellaneous places hard-coding the use of `spawn`:

- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/distributed/device_communicators/all_reduce_utils.py#L135>
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/entrypoints/openai/api_server.py#L184>

Related PRs:

- <gh-pr:8823>

## Prior State in v1

There was an environment variable to control whether multiprocessing is used in
@ -49,7 +49,7 @@ Every plugin has three parts:

- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name.
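For instance, a platform plugin package might expose its registration function as follows. This is a sketch with hypothetical package, module, and class names; only the entry point group name comes from the description above.

```python
# my_vllm_plugin/__init__.py -- hypothetical out-of-tree platform plugin.
# The entry point group "vllm.platform_plugins" must point at this function,
# e.g. in setup.py:
#   entry_points={"vllm.platform_plugins": ["my_platform = my_vllm_plugin:register"]}

def register() -> str | None:
    """Return the platform class's fully qualified name, or None if unsupported."""
    try:
        import my_accelerator_sdk  # hypothetical dependency proving the hardware is present
    except ImportError:
        return None
    return "my_vllm_plugin.platform.MyPlatform"
```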
## Guidelines for Writing Plugins
@ -94,6 +94,9 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache

With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.

!!! note
    Cache isolation is not supported in engine V0.
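For example, a client can opt into a shared prefix-cache namespace by sending a common salt with each request. The snippet below is a sketch; it assumes an OpenAI-compatible vLLM server that accepts a `cache_salt` field, and the model name is only an example.

```python
# Sketch: requests that share a salt can reuse each other's cached prefixes,
# while requests with a different (or no) salt stay isolated.
# Assumes an OpenAI-compatible vLLM server on localhost:8000 that supports `cache_salt`.
import requests

payload = {
    "model": "Qwen/Qwen3-0.6B",           # example model name
    "messages": [{"role": "user", "content": "Summarize our internal handbook."}],
    "cache_salt": "team-a-secret-salt",    # only requests with this salt share the prefix cache
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["message"]["content"])
```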
## Data Structure

The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
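The class itself is not reproduced in this excerpt. As a rough illustration only, a simplified block record looks something like the sketch below; the field names are assumptions, not the actual vLLM definitions.

```python
# Illustrative sketch of a simplified cache block record (field names are assumptions,
# not the actual vLLM definitions).
from dataclasses import dataclass
from typing import Optional


@dataclass
class Block:
    block_id: int                      # index into the physical KV cache pool
    ref_cnt: int = 0                   # how many running requests currently use this block
    block_hash: Optional[int] = None   # set only when the block is full and cacheable
```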
@ -186,7 +189,7 @@ Time 1:

Cache Blocks: 0, 1, 3
```

As can be seen, block 3 is a new full block and is cached. However, it is redundant as block 1, meaning that we cached the same block twice. Because the block table in vLLM v1 is append-only, changing the block table from `[0, 3]` to `[0, 1]` is not allowed. As a result, we will have duplicated blocks for the hash key E-H. This duplication will be eliminated when the request is freed.

### Free
@ -16,7 +16,7 @@ vLLM will take all the available factors into consideration, and decide a direct

The factors considered include:

- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
- The model's forward function and the relevant functions called by the forward function (see below)
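A rough sketch of the idea, not vLLM's actual hashing code: each factor contributes to a combined hash, and any change in any factor yields a different cache directory.

```python
# Rough sketch of how a compilation cache key could be derived from several factors.
# This is illustrative only, not vLLM's actual hashing code.
import hashlib


def combined_cache_key(config_hashes: list[str], torch_version: str, model_forward_source: str) -> str:
    h = hashlib.sha256()
    for factor in (*config_hashes, torch_version, model_forward_source):
        h.update(factor.encode())
    return h.hexdigest()[:10]  # short digest used as the cache directory name


print(combined_cache_key(["cfg-abc123"], "2.8.0", "def forward(self, x): ..."))
```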
@ -52,7 +52,7 @@ th:not(:first-child) {

| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |

\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
@ -166,7 +166,7 @@ The `DummyLogitsProcessor.update_state()` implementation maintains a "sparse" re

### Wrapping an Existing Request-Level Logits Processor

Although the vLLM engine applies logits processors at batch granularity, some users may want to use vLLM with a "request-level" logits processor implementation - an implementation which operates on individual requests. Earlier request-level processors were implemented as `Callable` objects conforming to the following type annotation:

``` python
RequestLogitsProcessor = Union[
```
@ -431,7 +431,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions

If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.

### Image Inputs
@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \

- Connection info is passed via KVTransferParams from prefiller to decoder for handshake

- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
    - Default: 480
    - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.

## Multi-Instance Setup
@ -6,11 +6,7 @@ This quantization method is particularly useful for reducing model size while ma

Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).

!!! note
    INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).

!!! warning
    **Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
    Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.

## Prerequisites
@ -64,7 +64,8 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V

When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.

```bash
VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
    --enable-sleep-mode \
    --port 8000
```
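Once the server is up, the development endpoints can be exercised from a small script like the one below. This is a sketch; the endpoint paths (`/sleep`, `/wake_up`, `/is_sleeping`) and their parameters should be verified against your vLLM version.

```python
# Sketch: toggle sleep mode through the development endpoints.
# Assumes the server above is running on localhost:8000 with VLLM_SERVER_DEV_MODE=1;
# endpoint paths should be checked against your vLLM version.
import requests

base = "http://localhost:8000"

requests.post(f"{base}/sleep", params={"level": "1"}).raise_for_status()   # offload/discard weights
print(requests.get(f"{base}/is_sleeping").json())                          # e.g. {"is_sleeping": true}
requests.post(f"{base}/wake_up").raise_for_status()                        # reload weights
```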
@ -16,8 +16,8 @@ Speculative decoding is a technique which improves inter-token latency in memory

The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

!!! warning
    In vLLM v0.10.0, speculative decoding with a draft model is not supported.
    If you use the following code, you will get a `NotImplementedError`.

??? code
@ -48,9 +48,10 @@ The following code configures vLLM in an offline mode to use speculative decodin

To perform the same with an online mode launch the server:

```bash
vllm serve facebook/opt-6.7b \
    --host 0.0.0.0 \
    --port 8000 \
    --seed 42 \
    -tp 1 \
    --gpu_memory_utilization 0.8 \
```
@ -191,14 +191,10 @@ VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pyth

For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.

### IBM Granite

Supported models:

* `ibm-granite/granite-4.0-h-small` and other Granite 4.0 models

    Recommended flags: `--tool-call-parser hermes`

* `ibm-granite/granite-3.0-8b-instruct`

    Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
@ -327,10 +323,8 @@ Flags: `--tool-call-parser longcat`

Supported models:

* `zai-org/GLM-4.5`
* `zai-org/GLM-4.5-Air`
* `zai-org/GLM-4.6`
* `zai-org/GLM-4.6-Air`

Flags: `--tool-call-parser glm45`
@ -25,4 +25,3 @@ The backends below live **outside** the main `vllm` repository and follow the

| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source.

Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.

```bash
sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
uv pip install -v \
    --torch-backend auto \
    -r requirements/build.txt \
    -r requirements/cpu.txt \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
uv pip install dist/*.whl
```

??? console "pip"
    ```bash
    sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
    pip install -v \
        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
        -r requirements/build.txt \
        -r requirements/cpu.txt \
    VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
    pip install dist/*.whl
    ```
@ -67,7 +67,8 @@ docker run -it \

XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on a single node with mp as the backend. A reference execution looks like the following:

```bash
vllm serve facebook/opt-13b \
    --dtype=bfloat16 \
    --max_model_len=1024 \
    --distributed-executor-backend=mp \
```
@ -33,11 +33,8 @@ def auto_mock(module, attr, max_mocks=50):

    try:
        # First treat attr as an attr, then as a submodule
        with patch("importlib.metadata.version", return_value="0.0.0"):
            return getattr(
                importlib.import_module(module),
                attr,
                importlib.import_module(f"{module}.{attr}"),
            )
    except importlib.metadata.PackageNotFoundError as e:
        raise e
    except ModuleNotFoundError as e:
@ -45,8 +42,7 @@ def auto_mock(module, attr, max_mocks=50):

        sys.modules[e.name] = PydanticMagicMock()

    raise ImportError(
        f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
    )


latency = auto_mock("vllm.benchmarks", "latency")
@ -65,7 +61,9 @@ class MarkdownFormatter(HelpFormatter):

    """Custom formatter that generates markdown for argument groups."""

    def __init__(self, prog, starting_heading_level=3):
        super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
        self._section_heading_prefix = "#" * starting_heading_level
        self._argument_heading_prefix = "#" * (starting_heading_level + 1)
        self._markdown_output = []
@ -87,19 +85,23 @@ class MarkdownFormatter(HelpFormatter):

    def add_arguments(self, actions):
        for action in actions:
            if len(action.option_strings) == 0 or "--help" in action.option_strings:
                continue

            option_strings = f"`{'`, `'.join(action.option_strings)}`"
            heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
            self._markdown_output.append(heading_md)

            if choices := action.choices:
                choices = f"`{'`, `'.join(str(c) for c in choices)}`"
                self._markdown_output.append(f"Possible choices: {choices}\n\n")
            elif (metavar := action.metavar) and isinstance(metavar, (list, tuple)):
                metavar = f"`{'`, `'.join(str(m) for m in metavar)}`"
                self._markdown_output.append(f"Possible choices: {metavar}\n\n")

            if action.help:
                self._markdown_output.append(f"{action.help}\n\n")
@ -114,7 +116,7 @@ class MarkdownFormatter(HelpFormatter):

def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
    """Create a parser for the given class with markdown formatting.

    Args:
        cls: The class to create a parser for
        **kwargs: Additional keyword arguments to pass to `cls.add_cli_args`.
@ -141,17 +143,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):

    # Create parsers to document
    parsers = {
        "engine_args": create_parser(EngineArgs.add_cli_args),
        "async_engine_args": create_parser(
            AsyncEngineArgs.add_cli_args, async_args_only=True
        ),
        "serve": create_parser(cli_args.make_arg_parser),
        "chat": create_parser(ChatCommand.add_cli_args),
        "complete": create_parser(CompleteCommand.add_cli_args),
        "bench_latency": create_parser(latency.add_cli_args),
        "bench_throughput": create_parser(throughput.add_cli_args),
        "bench_serve": create_parser(serve.add_cli_args),
        "run-batch": create_parser(run_batch.make_arg_parser),
    }

    # Generate documentation for each parser