Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-21 07:13:52 +08:00)

Compare commits: disable-sd ... benchmark- (8 commits)
Commits: a7b809e0f0, 7efc568418, 9ec11b459c, 244d5cc749, 816693fd00, 7c16128106, 7bb88b2edc, ae4f3e2aeb
@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
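Each hunk above trims one line from the header of an lm-eval-harness config; only the first few lines of each file appear in the diff. For reference, a complete config of this kind typically also pins the expected score and evaluation limits. The sketch below is an illustration only — the `metrics`, `value`, `limit`, and `num_fewshot` entries are assumptions inferred from the `-l`/`-f` flags in the commented commands, not lines taken from the diff:

```yaml
# Hypothetical complete config; only model_name, tasks, and the gsm8k task name
# appear in the hunks above. Everything else is an assumed illustration.
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.60        # assumed reference accuracy, not from the diff
limit: 1000                # mirrors the -l flag in the commented command
num_fewshot: 5             # mirrors the -f flag
```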
@@ -5,12 +5,7 @@
set -ex

# Setup cleanup
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
    podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
trap remove_docker_container EXIT
remove_docker_container

@@ -18,17 +13,17 @@ remove_docker_container
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc

function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
  podman exec cpu-test-ubi9-ppc bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
  podman exec cpu-test-ubi9-ppc bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator

@@ -38,8 +33,6 @@ function cpu_tests() {
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
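The change above swaps a cleanup keyed on a captured `$container_id` for one keyed on a fixed container name. A minimal, self-contained sketch of the named-container pattern (the image and container names below are placeholders, not values from the diff):

```bash
#!/bin/bash
set -ex

CONTAINER_NAME=cpu-test-sketch   # placeholder name

# Remove the container on exit, whether or not the tests pass.
cleanup() {
  podman rm -f "$CONTAINER_NAME" || true
  podman system prune -f
}
trap cleanup EXIT
cleanup  # also clear anything left over from a previous run

# Start a long-lived container by name, then exec test steps into it.
podman run -itd --entrypoint /bin/bash --name "$CONTAINER_NAME" registry.example.com/some-image:latest
podman exec "$CONTAINER_NAME" bash -c "echo 'running tests inside the container'"
```

Keying the cleanup on a name rather than a captured id means the trap works even if the variable capturing the id was never set.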
@@ -19,7 +19,6 @@ docker run --privileged --net host --shm-size=16G -it \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \

@@ -299,7 +299,6 @@ steps:
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_sequence_parallelism.py

- label: PyTorch Fullgraph Smoke Test # 9min
  source_file_dependencies:

@@ -584,8 +583,6 @@ steps:
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
.github/CODEOWNERS (vendored; 1 changed line)
@@ -12,7 +12,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth

# vLLM V1
.github/mergify.yml (vendored; 22 changed lines)
@@ -126,28 +126,6 @@ pull_request_rules:
      remove:
        - tpu

- name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/source/features/tool_calling.md
      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
      - files=docs/source/getting_started/examples/chat_with_tools.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - conflict
.gitignore (vendored; 1 changed line)
@@ -3,6 +3,7 @@

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -251,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
  # Please keep this in sync with FetchContent_Declare line below.
  set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -269,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
        GIT_TAG v3.9.0
        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE

        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.

@@ -290,8 +290,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
    "csrc/attention/mla/cutlass_mla_entry.cu")
    "csrc/cutlass_extensions/common.cpp")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"

@@ -464,26 +463,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(FP4_ARCHS)
  endif()

  # CUTLASS MLA Archs and flags
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/cutlass_mla_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
    # Add MLA-specific include directories only to MLA source files
    set_source_files_properties(${SRCS}
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
  else()
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
    # clear MLA_ARCHS
    set(MLA_ARCHS)
  endif()

  #
  # CUTLASS MoE kernels

  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
@@ -63,16 +63,14 @@ class Request:
    output_len: int


def sample_tokens(tokenizer: PreTrainedTokenizerBase,
                  length: int) -> list[int]:
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
    vocab = tokenizer.get_vocab()
    all_special_ids = set(tokenizer.all_special_ids)

    # Remove the special tokens.
    return random.choices(
        [v for k, v in vocab.items() if k not in all_special_ids],
        k=length,
    )
    vocab = {
        k: v
        for k, v in vocab.items() if k not in tokenizer.all_special_ids
    }
    return random.choices(list(vocab.values()), k=length)


def sample_requests_from_dataset(
@@ -713,7 +713,7 @@ def main(args: argparse.Namespace):
        ))

    # Save config and results to json
    if args.save_result or args.append_result:
    if args.save_result:
        result_json: dict[str, Any] = {}

        # Setup

@@ -734,14 +734,6 @@ def main(args: argparse.Namespace):
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        if not args.save_detailed:
            # Remove fields with too many data points

@@ -751,6 +743,17 @@ def main(args: argparse.Namespace):
            ]:
                if field in result_json:
                    del result_json[field]
                if field in benchmark_result:
                    del benchmark_result[field]

        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]

@@ -761,11 +764,9 @@ def main(args: argparse.Namespace):
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name,
                  mode="a+" if args.append_result else "w",
                  encoding='utf-8') as outfile:
        with open(file_name, mode="a+", encoding='utf-8') as outfile:
            # Append a newline.
            if args.append_result and outfile.tell() != 0:
            if outfile.tell() != 0:
                outfile.write("\n")
            json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)

@@ -898,11 +899,6 @@ if __name__ == "__main__":
        help="When saving the results, whether to include per request "
        "information such as response, error, ttfs, tpots, etc.",
    )
    parser.add_argument(
        "--append-result",
        action="store_true",
        help="Append the benchmark result to the existing json file.",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
    )
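These hunks all concern how `benchmark_serving.py` writes its result file once `--save-result` and the new `--append-result` flag are involved. A hedged invocation sketch (the model, request rate, prompt count, and file name are illustrative placeholders; only the flags come from this page):

```console
python3 vllm/benchmarks/benchmark_serving.py \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name random \
    --request-rate 10 \
    --num-prompts 1200 \
    --save-result \
    --append-result \
    --result-filename results.json
```

With `--append-result`, each run adds one JSON object on its own line to `results.json` instead of overwriting the file, matching the `mode="a+"` branch shown above.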
@@ -150,17 +150,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,

    elif args.dataset == "grammar":
        schema = """
            root ::= select_statement
            ?start: select_statement

            select_statement ::= "SELECT " column " from " table " where " condition
            ?select_statement: "SELECT " column_list " FROM " table_name

            column ::= "col_1 " | "col_2 "
            ?column_list: column_name ("," column_name)*

            table ::= "table_1 " | "table_2 "
            ?table_name: identifier

            condition ::= column "= " number
            ?column_name: identifier

            number ::= "1 " | "2 "
            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
        """
        prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."
benchmarks/convert_to_csv.py (new file, 13 lines)
@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import ArgumentParser

import pandas as pd

parser = ArgumentParser()
parser.add_argument("--input-path", type=str, required=True)
parser.add_argument("--output-path", type=str, required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    df = pd.read_json(args.input_path, lines=True)
    df.to_csv(args.output_path)
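Because the script reads its input with `pd.read_json(..., lines=True)`, it expects newline-delimited JSON — the shape produced when benchmark results are appended one object per line. A likely invocation (file names are placeholders):

```console
python benchmarks/convert_to_csv.py --input-path results.json --output-path results.csv
```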
@@ -17,14 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES

from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
                                          lora_shrink)
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)

from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -553,8 +553,9 @@ def main(args: argparse.Namespace):
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Support for llama4
        config = config.get_text_config()
        if not hasattr(config, "hidden_size"):
            # Support for llama4
            config = config.text_config
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
benchmarks/sweep.sh (new file, 27 lines)
@@ -0,0 +1,27 @@
MODEL=meta-llama/Llama-3.1-8B-Instruct
REQUEST_RATES=(1 10 15 20)
INPUT_LEN=1000
OUTPUT_LEN=100
TOTAL_SECONDS=120

for REQUEST_RATE in "${REQUEST_RATES[@]}";
do
    NUM_PROMPTS=$(($TOTAL_SECONDS * $REQUEST_RATE))

    echo ""
    echo "===== RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS ====="
    echo ""

    python3 vllm/benchmarks/benchmark_serving.py \
        --model $MODEL \
        --dataset-name random \
        --random-input-len $INPUT_LEN \
        --random-output-len $OUTPUT_LEN \
        --request-rate $REQUEST_RATE \
        --num-prompts $NUM_PROMPTS \
        --seed $REQUEST_RATE \
        --ignore-eos \
        --result-filename "results.json" \
        --save-result

done
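As written, the sweep writes every run to the same `results.json` via `--save-result`; pairing it with the `--append-result` flag introduced above would keep one line per request rate (an assumption about intended usage, not something the diff states), after which the CSV converter shown earlier can flatten the file for plotting:

```console
bash benchmarks/sweep.sh   # assumes a vLLM server is already serving $MODEL
```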
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
    GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -1,38 +0,0 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <torch/all.h>

#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
                               torch::Tensor const& q_nope,
                               torch::Tensor const& q_pe,
                               torch::Tensor const& kv_c_and_k_pe_cache,
                               torch::Tensor const& seq_lens,
                               torch::Tensor const& page_table, double scale);
#endif

void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                        torch::Tensor const& q_pe,
                        torch::Tensor const& kv_c_and_k_pe_cache,
                        torch::Tensor const& seq_lens,
                        torch::Tensor const& page_table, double scale) {
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
  return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
                                   seq_lens, page_table, scale);
#endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
}
@@ -1,225 +0,0 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include "cute/tensor.hpp"

#include "cutlass/cutlass.h"
#include "cutlass/kernel_hardware_info.h"

#include "cutlass_extensions/common.hpp"

#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"

using namespace cute;
using namespace cutlass::fmha::kernel;

template <typename T, bool PersistenceOption = true>
struct MlaSm100 {
  using Element = T;
  using ElementAcc = float;
  using ElementOut = T;

  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
  using TileShapeH = cute::tuple_element_t<0, TileShape>;
  using TileShapeD = cute::tuple_element_t<2, TileShape>;

  // H K (D_latent D_rope) B
  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;

  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
  using StrideO = StrideK;                            // H D B
  using StrideLSE = cute::tuple<_1, int>;             // H B

  using TileScheduler =
      std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
                         Sm100MlaIndividualTileScheduler>;

  using FmhaKernel =
      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
          TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
          /*kIsCpAsync=*/true>;
  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};

template <typename T>
typename T::Fmha::Arguments args_from_options(
    at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
    at::Tensor const& page_table, double scale) {
  cutlass::KernelHardwareInfo hw_info;
  hw_info.device_id = q_nope.device().index();
  hw_info.sm_count =
      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
          hw_info.device_id);

  int batches = q_nope.sizes()[0];
  int page_count_per_seq = page_table.sizes()[1];
  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
  int page_size = kv_c_and_k_pe_cache.sizes()[1];
  int max_seq_len = page_size * page_count_per_seq;
  using TileShapeH = typename T::TileShapeH;
  using TileShapeD = typename T::TileShapeD;
  auto problem_shape =
      cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);

  auto [H, K, D, B] = problem_shape;
  auto [D_latent, D_rope] = D;

  using StrideQ = typename T::StrideQ;
  using StrideK = typename T::StrideK;
  using StrideO = typename T::StrideO;
  using StrideLSE = typename T::StrideLSE;

  StrideQ stride_Q_latent = cute::make_tuple(
      static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
  StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
                                           static_cast<int64_t>(H * D_rope));
  StrideK stride_C =
      cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
                       static_cast<int64_t>(page_size * (D_latent + D_rope)));
  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
  StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
                                      static_cast<int64_t>(H * D_latent));

  using Element = typename T::Element;
  using ElementOut = typename T::ElementOut;
  using ElementAcc = typename T::ElementAcc;
  auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
  auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
  auto scale_f = static_cast<float>(scale);
  typename T::Fmha::Arguments arguments{
      problem_shape,
      {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
       stride_C, C_ptr + D_latent, stride_C,
       static_cast<int*>(seq_lens.data_ptr()),
       static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
       page_size},
      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
       static_cast<ElementAcc*>(nullptr), stride_LSE},
      hw_info,
      -1,       // split_kv
      nullptr,  // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
  // split_kv automatically based on batch size and sequence length to balance
  // workload across available SMs. Consider using var_split_kv for manual
  // control if needed.
  T::Fmha::set_split_kv(arguments);
  return arguments;
}

template <typename Element>
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
            at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
            at::Tensor const& seq_lens, at::Tensor const& page_table,
            float scale, cudaStream_t stream) {
  using MlaSm100Type = MlaSm100<Element>;
  typename MlaSm100Type::Fmha fmha;
  auto arguments = args_from_options<MlaSm100Type>(
      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
  auto workspace = torch::empty(workspace_size, workspace_options);

  CUTLASS_CHECK(fmha.can_implement(arguments));

  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));

  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
}

void cutlass_mla_decode_sm100a(torch::Tensor const& out,
                               torch::Tensor const& q_nope,
                               torch::Tensor const& q_pe,
                               torch::Tensor const& kv_c_and_k_pe_cache,
                               torch::Tensor const& seq_lens,
                               torch::Tensor const& page_table, double scale) {
  TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
  TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
  TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
  TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
              "kv_c_and_k_pe_cache must be a 3D tensor");
  TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
  TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
  TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");

  auto B_q_nope = q_nope.size(0);
  auto H_q_nope = q_nope.size(1);
  auto D_q_nope = q_nope.size(2);
  auto B_q_pe = q_pe.size(0);
  auto H_q_pe = q_pe.size(1);
  auto D_q_pe = q_pe.size(2);
  auto B_pt = page_table.size(0);
  auto PAGE_NUM = page_table.size(1);
  auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
  auto D_ckv = kv_c_and_k_pe_cache.size(2);
  auto B_o = out.size(0);
  auto H_o = out.size(1);
  auto D_o = out.size(2);

  TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
  TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
  TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
  TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
              "H_q_nope, H_q_pe, and H_o must be equal to 128");
  TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
              "PAGE_SIZE must be a power of 2");
  TORCH_CHECK(
      B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
      "Batch dims must be same for page_table, q_nope and q_pe, and out");
  TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
              "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
  TORCH_CHECK(D_o == 512, "D_o must be equal to 512");

  TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
                  q_nope.dtype() == at::ScalarType::BFloat16 ||
                  q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
              "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
  TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
                  q_nope.dtype() == q_pe.dtype(),
              "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
  TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
              "seq_lens must be a 32-bit integer tensor");
  TORCH_CHECK(page_table.dtype() == torch::kInt32,
              "page_table must be a 32-bit integer tensor");

  auto in_dtype = q_nope.dtype();
  at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(q_nope.get_device());
  if (in_dtype == at::ScalarType::Half) {
    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
                            page_table, scale, stream);
  } else if (in_dtype == at::ScalarType::BFloat16) {
    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
                                seq_lens, page_table, scale, stream);
  } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
                                  seq_lens, page_table, scale, stream);
  } else {
    TORCH_CHECK(false, "Unsupported input data type of MLA");
  }
}
@@ -270,10 +270,9 @@ __global__ void reshape_and_cache_flash_kernel(
    cache_t* __restrict__ value_cache,  // [num_blocks, block_size, num_heads,
                                        //  head_size]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int64_t block_stride, const int64_t page_stride,
    const int64_t head_stride, const int64_t key_stride,
    const int64_t value_stride, const int num_heads, const int head_size,
    const int block_size, const float* k_scale, const float* v_scale) {
    const int block_stride, const int key_stride, const int value_stride,
    const int num_heads, const int head_size, const int block_size,
    const float* k_scale, const float* v_scale) {
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  // NOTE: slot_idx can be -1 if the token is padded

@@ -289,8 +288,8 @@ __global__ void reshape_and_cache_flash_kernel(
    const int head_idx = i / head_size;
    const int head_offset = i % head_size;
    const int64_t tgt_key_value_idx = block_idx * block_stride +
                                      block_offset * page_stride +
                                      head_idx * head_stride + head_offset;
                                      block_offset * num_heads * head_size +
                                      head_idx * head_size + head_offset;
    scalar_t tgt_key = key[src_key_idx];
    scalar_t tgt_value = value[src_value_idx];
    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {

@@ -397,16 +396,16 @@ void reshape_and_cache(
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>            \
      <<<grid, block, 0, stream>>>(                                        \
          reinterpret_cast<KV_T*>(key.data_ptr()),                         \
          reinterpret_cast<KV_T*>(value.data_ptr()),                       \
          reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),                \
          reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),              \
          slot_mapping.data_ptr<int64_t>(), block_stride, page_stride,     \
          head_stride, key_stride, value_stride, num_heads, head_size,     \
          block_size, reinterpret_cast<const float*>(k_scale.data_ptr()),  \
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>            \
      <<<grid, block, 0, stream>>>(                                        \
          reinterpret_cast<KV_T*>(key.data_ptr()),                         \
          reinterpret_cast<KV_T*>(value.data_ptr()),                       \
          reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),                \
          reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),              \
          slot_mapping.data_ptr<int64_t>(), block_stride, key_stride,      \
          value_stride, num_heads, head_size, block_size,                  \
          reinterpret_cast<const float*>(k_scale.data_ptr()),              \
          reinterpret_cast<const float*>(v_scale.data_ptr()));

void reshape_and_cache_flash(

@@ -433,11 +432,9 @@ void reshape_and_cache_flash(
  int head_size = key.size(2);
  int block_size = key_cache.size(1);

  int64_t key_stride = key.stride(0);
  int64_t value_stride = value.stride(0);
  int64_t block_stride = key_cache.stride(0);
  int64_t page_stride = key_cache.stride(1);
  int64_t head_stride = key_cache.stride(2);
  int key_stride = key.stride(0);
  int value_stride = value.stride(0);
  int block_stride = key_cache.stride(0);
  TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));

  dim3 grid(num_tokens);
|
||||
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
|
||||
|
||||
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
torch::Tensor const& seq_lens,
|
||||
torch::Tensor const& page_table, double scale);
|
||||
|
||||
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
@@ -336,7 +336,7 @@ inline void cutlass_gemm_sm89_fp8_dispatch(torch::Tensor& out,

  uint32_t const m = a.size(0);
  uint32_t const mp2 =
      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2

  if (mp2 <= 16) {
    // M in [1, 16]

@@ -321,7 +321,7 @@ inline void cutlass_gemm_sm89_int8_dispatch(torch::Tensor& out,

  uint32_t const m = a.size(0);
  uint32_t const mp2 =
      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2

  if (mp2 <= 16) {
    // M in [1, 16]
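The only difference in these two dispatchers is the lower clamp applied to the rounded-up batch dimension M. The toy sketch below (self-contained, not vLLM code) shows why the clamp matters: with a floor of 32, the `mp2 <= 16` bucket can never be selected, so small-M inputs fall through to the next kernel configuration.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Round m up to the next power of two (illustrative helper).
static uint32_t next_pow_2(uint32_t m) {
  uint32_t p = 1;
  while (p < m) p <<= 1;
  return p;
}

static const char* pick_bucket(uint32_t m, uint32_t floor_val) {
  uint32_t mp2 = std::max(floor_val, next_pow_2(m));
  if (mp2 <= 16) return "M in [1, 16] config";
  if (mp2 <= 32) return "M in (16, 32] config";
  return "larger-M config";
}

int main() {
  // With floor 16 a batch of 8 uses the small-M config; with floor 32 it does not.
  std::printf("m=8, floor=16 -> %s\n", pick_bucket(8, 16));
  std::printf("m=8, floor=32 -> %s\n", pick_bucket(8, 32));
  return 0;
}
```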
@@ -134,7 +134,7 @@ typename T::Gemm::Arguments args_from_options(
  using StrideB = typename T::StrideB;
  using StrideD = typename T::StrideD;
  using Sm100BlkScaledConfig =
      typename T::Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
      typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;

  int m = static_cast<int>(M);
  int n = static_cast<int>(N);
@@ -150,7 +150,7 @@ __global__ void LLGemm1_kernel(const scalar_t* in_a, const scalar_t* in_b,
  colB_elem4w = bf4[threadid * 4 + 3];

  scalar2_t Af2;
  [[maybe_unused]] scalar2_t Bf2;
  scalar2_t Bf2;
  float2 S;

  auto Ah2ptr = reinterpret_cast<scalar2_t*>(&rowA_elem4);

@@ -1597,4 +1597,4 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
}
});
});
}
}
@@ -130,13 +130,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

  // Compute MLA decode using cutlass.
  ops.def(
      "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
      " Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
      " Tensor page_table, float scale) -> ()");
  ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@@ -162,9 +162,6 @@ ENV UV_HTTP_TIMEOUT=500
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt
#################### DEV IMAGE ####################

@@ -268,9 +265,6 @@ ADD . /vllm-workspace/
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt

@@ -297,7 +291,6 @@ RUN mv vllm test_docs/
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
Binary files not shown (4 images removed; previous sizes: 118 KiB, 136 KiB, 110 KiB, 111 KiB).
@@ -177,11 +177,6 @@ def linkcode_resolve(domain, info):
        for part in info['fullname'].split('.'):
            obj = getattr(obj, part)

        # Skip decorator wrappers by checking if the object is a function
        # and has a __wrapped__ attribute (which decorators typically set)
        while hasattr(obj, '__wrapped__'):
            obj = obj.__wrapped__

        if not (inspect.isclass(obj) or inspect.isfunction(obj)
                or inspect.ismethod(obj)):
            obj = obj.__class__  # Get the class of the instance
@@ -1,47 +0,0 @@
(deployment-anything-llm)=

# Anything LLM

[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.

It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.

## Prerequisites

- Setup vLLM environment

## Deploy

- Start the vLLM server with the supported chat completion model, e.g.

```console
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
```

- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).

- On the bottom left of open settings, AI Prooviders --> LLM:
  - LLM Provider: Generic OpenAI
  - Base URL: http://{vllm server host}:{vllm server port}/v1
  - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`

:::{image} /assets/deployment/anything-llm-provider.png
:::

- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:

:::{image} /assets/deployment/anything-llm-chat-without-doc.png
:::

- Click the upload button:
  - upload the doc
  - select the doc and move to the workspace
  - save and embed

:::{image} /assets/deployment/anything-llm-upload-doc.png
:::

- Chat again:

:::{image} /assets/deployment/anything-llm-chat-with-doc.png
:::
@@ -3,7 +3,6 @@
:::{toctree}
:maxdepth: 1

anything-llm
bentoml
cerebrium
dstack
@@ -1,58 +0,0 @@
# Security Guide

## Inter-Node Communication

All communications between nodes in a multi-node vLLM deployment are **insecure by default** and must be protected by placing the nodes on an isolated network. This includes:

1. PyTorch Distributed communications
2. KV cache transfer communications
3. Tensor, Pipeline, and Data parallel communications

### Configuration Options for Inter-Node Communications

The following options control inter-node communications in vLLM:

1. **Environment Variables:**
   - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on

2. **KV Cache Transfer Configuration:**
   - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
   - `--kv-port`: The port for KV cache transfer communications (default: 14579)

3. **Data Parallel Configuration:**
   - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
   - `data_parallel_master_port`: Port of the data parallel master (default: 29500)

### Notes on PyTorch Distributed

vLLM uses PyTorch's distributed features for some inter-node communication. For detailed information about PyTorch Distributed security considerations, please refer to the [PyTorch Security Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).

Key points from the PyTorch security guide:
- PyTorch Distributed features are intended for internal communication only
- They are not built for use in untrusted environments or networks
- No authorization protocol is included for performance reasons
- Messages are sent unencrypted
- Connections are accepted from anywhere without checks

### Security Recommendations

1. **Network Isolation:**
   - Deploy vLLM nodes on a dedicated, isolated network
   - Use network segmentation to prevent unauthorized access
   - Implement appropriate firewall rules

2. **Configuration Best Practices:**
   - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
   - Configure firewalls to only allow necessary ports between nodes

3. **Access Control:**
   - Restrict physical and network access to the deployment environment
   - Implement proper authentication and authorization for management interfaces
   - Follow the principle of least privilege for all system components

## Reporting Security Vulnerabilities

If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
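For context on how the options in this (now removed) guide are applied in practice, a minimal sketch — the address is a placeholder on a private subnet, and the exact flag spelling for KV transfer and data-parallel settings depends on the vLLM version:

```console
# Bind vLLM's inter-node traffic to the isolated network interface.
export VLLM_HOST_IP=10.0.0.2
# KV cache transfer (--kv-ip/--kv-port) and data_parallel_master_ip/port should
# point at the same private network rather than the 127.0.0.1 defaults.
```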
@@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput.

## Usage example

Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.

## Benchmarks

Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks.

## Development
@@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage.

You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).

```console
pip install autoawq
```

After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation <https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization>`_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:

```python
from awq import AutoAWQForCausalLM
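The hunk above ends inside the AutoAWQ quantization example, so only the import is visible here. On the vLLM side of the workflow, loading an already-quantized AWQ checkpoint is short; the sketch below is a generic illustration (the model path is a placeholder, and passing `quantization="awq"` explicitly is the conservative choice — vLLM can typically also pick the method up from the checkpoint config):

```python
from vllm import LLM, SamplingParams

# Placeholder path or HF repo id of an AWQ-quantized checkpoint.
llm = LLM(model="path/to/mistral-7b-instruct-v0.2-awq", quantization="awq")

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What does AWQ quantization change about a model?"],
                       sampling_params)
print(outputs[0].outputs[0].text)
```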
@@ -20,8 +20,8 @@ vLLM reads the model's config file and supports pre-quantized checkpoints.

You can find pre-quantized models on:

- [Hugging Face (BitBLAS)](https://huggingface.co/models?search=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?search=gptq)
- [Hugging Face (BitBLAS)](https://huggingface.co/models?other=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?other=gptq)

Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section.
@@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>.
You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
And usually, these repositories have a config.json file that includes a quantization_config section.

## Read quantized checkpoint
@@ -16,16 +16,12 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna
is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override)
for more details on this and other advanced features.

## Installation

You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq).

```console
pip install -U gptqmodel --no-build-isolation -v
```

## Quantizing a model

After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details.

Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

@@ -53,16 +49,12 @@ model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path)
```

## Running a quantized model with vLLM

To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:

```console
python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```

## Using GPTQModel with vLLM's Python API

GPTQModel quantized models are also supported directly through the LLM entrypoint:

```python

@@ -75,22 +67,17 @@ prompts = [
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

# Create an LLM.
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
print("-"*50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-"*50)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
@@ -30,4 +30,5 @@ tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```

Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI.
See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization
@ -2,11 +2,8 @@
|
||||
|
||||
# Structured Outputs
|
||||
|
||||
vLLM supports the generation of structured outputs using
|
||||
[xgrammar](https://github.com/mlc-ai/xgrammar) or
|
||||
[guidance](https://github.com/guidance-ai/llguidance) as backends.
|
||||
This document shows you some examples of the different options that are
|
||||
available to generate structured outputs.
|
||||
vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
|
||||
This document shows you some examples of the different options that are available to generate structured outputs.
|
||||
|
||||
## Online Serving (OpenAI API)
|
||||
|
||||
@ -18,17 +15,10 @@ The following parameters are supported, which must be added as extra parameters:
|
||||
- `guided_regex`: the output will follow the regex pattern.
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
|
||||
- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
|
||||
- `guided_decoding_backend`: used to select the guided decoding backend to use. Additional backend-specific options can be supplied in a comma separated list following a colon after the backend name. For example `"xgrammar:no-fallback"` will not allow vLLM to fallback to a different backend on error.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page.
|
||||
|
||||
Structured outputs are supported by default in the OpenAI-Compatible Server. You
|
||||
may choose to specify the backend to use by setting the
|
||||
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
which will try to choose an appropriate backend based on the details of the
|
||||
request. You may also choose a specific backend, along with
|
||||
some options. A full set of options is available in the `vllm serve --help`
|
||||
text.
|
||||
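For example, a minimal sketch of pinning the backend at startup (the model name here is illustrative):

```console
vllm serve Qwen/Qwen2.5-3B-Instruct --guided-decoding-backend xgrammar
```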
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page.
|
||||
|
||||
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
|
||||
|
||||
@ -60,7 +50,7 @@ completion = client.chat.completions.create(
|
||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
@ -106,29 +96,26 @@ print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
While not strictly necessary, normally it's better to indicate in the prompt the
|
||||
JSON schema and how the fields should be populated. This can improve the
|
||||
results notably in most cases.
|
||||
While not strictly necessary, normally it's better to indicate in the prompt that a JSON needs to be generated, which fields it should contain, and how the LLM should fill them.
|
||||
This can improve the results notably in most cases.
|
||||
:::
|
||||
|
||||
Finally we have the `guided_grammar` option, which is probably the most
|
||||
difficult to use, but it's really powerful. It allows us to define complete
|
||||
languages like SQL queries. It works by using a context free EBNF grammar.
|
||||
As an example, we can use it to define a specific format of simplified SQL queries:
|
||||
Finally we have `guided_grammar`, which is probably the most difficult one to use, but it's really powerful, as it allows us to define complete languages like SQL queries.
|
||||
It works by using a context-free EBNF grammar, which we can use, for example, to define a specific format of simplified SQL queries, as in the example below:
|
||||
|
||||
```python
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
?start: select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
|
||||
column ::= "col_1 " | "col_2 "
|
||||
?column_list: column_name ("," column_name)*
|
||||
|
||||
table ::= "table_1 " | "table_2 "
|
||||
?table_name: identifier
|
||||
|
||||
condition ::= column "= " number
|
||||
?column_name: identifier
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
"""
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
@ -239,8 +226,6 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
|
||||
Answer: x = -29/8
|
||||
```
|
||||
|
||||
An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
|
||||
|
||||
## Offline Inference
|
||||
|
||||
Offline inference allows for the same types of guided decoding.
|
||||
@ -251,11 +236,11 @@ The main available options inside `GuidedDecodingParams` are:
|
||||
- `regex`
|
||||
- `choice`
|
||||
- `grammar`
|
||||
- `structural_tag`
|
||||
- `backend`
|
||||
- `whitespace_pattern`
|
||||
|
||||
These parameters can be used in the same way as the parameters from the Online
|
||||
Serving examples above. One example for the usage of the `choice` parameter is
|
||||
shown below:
|
||||
These parameters can be used in the same way as the parameters from the Online Serving examples above.
|
||||
One example for the usage of the `choice` parameter is shown below:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
@ -152,11 +152,10 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha
|
||||
|
||||
Supported models:
|
||||
|
||||
All Llama 3.1, 3.2 and 4 models should be supported.
|
||||
All Llama 3.1 and 3.2 models should be supported.
|
||||
|
||||
* `meta-llama/Llama-3.1-*`
|
||||
* `meta-llama/Llama-3.2-*`
|
||||
* `meta-llama/Llama-4-*`
|
||||
|
||||
The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below.
|
||||
|
||||
@ -177,12 +176,6 @@ images.
|
||||
|
||||
Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
|
||||
|
||||
vLLM also provides a JSON-based chat template for Llama 4:
|
||||
* `examples/tool_chat_template_llama4_json.jinja` - this is based on the "official" chat template for the Llama 4
|
||||
models, but tweaked so that it works better with vLLM.
|
||||
|
||||
For Llama 4, use `--tool-call-parser llama4_json --chat-template examples/tool_chat_template_llama4_json.jinja`.
|
||||
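For instance, a sketch of a full serve command for a Llama 4 model (the model name and the `--enable-auto-tool-choice` flag are assumptions, not taken from this section):

```console
vllm serve meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --enable-auto-tool-choice \
  --tool-call-parser llama4_json \
  --chat-template examples/tool_chat_template_llama4_json.jinja
```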
|
||||
#### IBM Granite
|
||||
|
||||
Supported models:
|
||||
|
@ -44,7 +44,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu
|
||||
|
||||
You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
|
||||
or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
|
||||
API (preferred). This section shows how to create TPUs using the queued resource API. For
|
||||
API. This section shows how to create TPUs using the queued resource API. For
|
||||
more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
|
||||
Queued resources enable you to request Cloud TPU resources in a queued manner.
|
||||
When you request queued resources, the request is added to a queue maintained by
|
||||
@ -97,10 +97,10 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
|
||||
`TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
|
||||
- * ACCELERATOR_TYPE
|
||||
* The TPU version you want to use. Specify the TPU version, for example
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information,
|
||||
see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions).
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
|
||||
see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
|
||||
- * RUNTIME_VERSION
|
||||
* The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes).
|
||||
* The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
|
||||
- * SERVICE_ACCOUNT
|
||||
* The email address for your service account. You can find it in the IAM
|
||||
Cloud Console under *Service Accounts*. For example:
|
||||
|
@ -153,7 +153,7 @@ git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
python use_existing_torch.py
|
||||
pip install -r requirements/build.txt
|
||||
pip install --no-build-isolation -e .
|
||||
pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
##### Use the local cutlass for compilation
|
||||
|
@ -44,8 +44,8 @@ This living user guide outlines a few known **important changes and limitations*
|
||||
|-----------------|-----------------------------------------------------------------------------------|
|
||||
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
|
||||
| **LoRA** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
|
||||
| **LoRA** | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))</nobr>|
|
||||
| **Spec Decode** | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
|
||||
@ -121,6 +121,11 @@ Although we have re-implemented and partially optimized many features and models
|
||||
These features are already supported in vLLM V1, but their optimization is still
|
||||
in progress.
|
||||
|
||||
- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
|
||||
inferior to that of V0. The team is actively working on improving its
|
||||
performance
|
||||
(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
|
||||
|
||||
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
|
||||
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize support for Eagle and MTP over draft-model-based spec decode.
|
||||
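As a sketch of what enabling the currently supported ngram method can look like offline (the exact `speculative_config` keys are an assumption, modeled on the EAGLE example later in this document):

```python
from vllm import LLM

# Hypothetical sketch: ngram-based speculative decoding on V1.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 4,
        "prompt_lookup_max": 4,  # assumed option name for the ngram lookup window
    },
)
```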
|
||||
|
@ -132,7 +132,6 @@ serving/integrations/index
|
||||
:caption: Deployment
|
||||
:maxdepth: 1
|
||||
|
||||
deployment/security
|
||||
deployment/docker
|
||||
deployment/k8s
|
||||
deployment/nginx
|
||||
|
@ -51,29 +51,3 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer -
|
||||
:::{note}
|
||||
For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
|
||||
:::
|
||||
|
||||
## Sharded Model Loading
|
||||
|
||||
vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded
|
||||
```
|
||||
|
||||
The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
|
||||
```
|
||||
|
||||
To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
|
||||
|
||||
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
|
||||
```
|
||||
|
||||
:::{note}
|
||||
The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint.
|
||||
:::
|
||||
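For example, a sketch combining the sharded loader with tensor parallelism (`--tensor-parallel-size` is a general vLLM flag, not specific to this loader):

```console
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --tensor-parallel-size 8
```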
|
@ -59,7 +59,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/basi
|
||||
|
||||
### `LLM.beam_search`
|
||||
|
||||
The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of {class}`~vllm.LLM.generate`.
|
||||
The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {class}`~vllm.LLM.generate`.
|
||||
For example, to search using 5 beams and output at most 50 tokens:
|
||||
|
||||
```python
|
||||
|
@ -159,14 +159,14 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model
|
||||
|
||||
### Manually enable Matryoshka Embeddings
|
||||
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, the output can be changed to arbitrary dimensions. Setting `matryoshka_dimensions` controls the allowed output dimensions.
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`.
|
||||
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf_overrides '{"is_matryoshka": true}'` or `--hf_overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online).
|
||||
|
||||
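As a minimal offline sketch (the model name and dimension value are illustrative, and the embedding API shown is an assumption):

```python
from vllm import LLM

# Mark the model as Matryoshka-capable and restrict the allowed output dimensions.
llm = LLM(
    model="Snowflake/snowflake-arctic-embed-m-v1.5",
    task="embed",
    hf_overrides={"matryoshka_dimensions": [256]},
)

outputs = llm.embed(["Follow the white rabbit."])
print(len(outputs[0].outputs.embedding))
```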
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
```text
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}'
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}'
|
||||
```
|
||||
|
||||
### Offline Inference
|
||||
@ -204,14 +204,14 @@ curl http://127.0.0.1:8000/v1/embeddings \
|
||||
"input": "Follow the white rabbit.",
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"encoding_format": "float",
|
||||
"dimensions": 32
|
||||
"dimensions": 1
|
||||
}'
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
|
||||
|
@ -40,37 +40,33 @@ You can force the use of `TransformersForCausalLM` by setting `model_impl="trans
|
||||
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
|
||||
:::
|
||||
|
||||
#### Custom models
|
||||
#### Supported features
|
||||
|
||||
If a model is supported natively by neither vLLM nor Transformers, it can still be used in vLLM!
|
||||
The Transformers modeling backend explicitly supports the following features:
|
||||
|
||||
For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
- <project:#quantization-index> (except GGUF)
|
||||
- <project:#lora-adapter>
|
||||
- <project:#distributed-serving>
|
||||
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
* The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
* `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see <project:#writing-custom-models>):
|
||||
* Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
#### Remote Code
|
||||
|
||||
If the compatible model is:
|
||||
If your model is supported natively by neither vLLM nor Transformers, you can still run it in vLLM!
|
||||
|
||||
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for <project:#offline-inference> or `--trust-remote-code` for the <project:#openai-compatible-server>.
|
||||
- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for <project:#offline-inference> or `vllm serve <MODEL_DIR>` for the <project:#openai-compatible-server>.
|
||||
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
|
||||
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
|
||||
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
:::{tip}
|
||||
If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models).
|
||||
:::
|
||||
|
||||
(writing-custom-models)=
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the modifications needed to make a Transformers-compatible custom model work with the Transformers backend for vLLM. (We assume that a Transformers-compatible custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models).)
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(model.__class__))
|
||||
```
|
||||
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
```{code-block} python
|
||||
:caption: modeling_my_model.py
|
||||
|
||||
@ -79,7 +75,7 @@ from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
def forward(self, hidden_states, **kwargs): # <- kwargs are required
|
||||
...
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attn_output, attn_weights = attention_interface(
|
||||
@ -95,11 +91,11 @@ class MyModel(PreTrainedModel):
|
||||
_supports_attention_backend = True
|
||||
```
|
||||
|
||||
Here is what happens in the background when this model is loaded:
|
||||
Here is what happens in the background:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
1. The config is loaded
|
||||
2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model sets `_supports_attention_backend`.
|
||||
3. The `TransformersForCausalLM` backend is used. See <gh-file:vllm/model_executor/models/transformers.py>, which leverages `self.config._attn_implementation = "vllm"`, hence the need to use `ALL_ATTENTION_FUNCTIONS`.
|
||||
|
||||
That's it!
|
||||
|
||||
@ -221,16 +217,6 @@ output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
(feature-status-legend)=
|
||||
|
||||
## Feature Status Legend
|
||||
|
||||
- ✅︎ indicates that the feature is supported for the model.
|
||||
|
||||
- 🚧 indicates that the feature is planned but not yet supported for the model.
|
||||
|
||||
- ⚠️ indicates that the feature is available but may have known issues or limitations.
|
||||
|
||||
(supported-text-models)=
|
||||
|
||||
## List of Text-only Language Models
|
||||
@ -332,7 +318,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
- * `GemmaForCausalLM`
|
||||
* Gemma
|
||||
* `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.
|
||||
* `google/gemma-2b`, `google/gemma-7b`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Gemma2ForCausalLM`
|
||||
@ -793,8 +779,6 @@ or `--limit-mm-per-prompt` (online serving). For example, to enable passing up t
|
||||
Offline inference:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2-VL-7B-Instruct",
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
@ -895,13 +879,6 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `GraniteSpeechForConditionalGeneration`
|
||||
* Granite Speech
|
||||
* T + A
|
||||
* `ibm-granite/granite-speech-3.3-8b`
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `H2OVLChatModel`
|
||||
* H2OVL
|
||||
* T + I<sup>E+</sup>
|
||||
@ -1101,7 +1078,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
|
||||
:::{important}
|
||||
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`.
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
|
||||
:::
|
||||
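For offline inference, the equivalent is a sketch like the following (the model name is an assumption; per the note above, this currently applies to V0):

```python
from vllm import LLM

# Enable pan-and-scan pre-processing via the multimodal processor kwargs.
llm = LLM(
    model="google/gemma-3-4b-it",
    mm_processor_kwargs={"do_pan_and_scan": True},
)
```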
|
||||
:::{warning}
|
||||
@ -1116,7 +1093,7 @@ V0 correctly implements the model's attention pattern:
|
||||
|
||||
V1 currently uses a simplified attention pattern:
|
||||
- Uses causal attention for all tokens, including image tokens
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
|
||||
- Will be updated in the future to support the correct behavior
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
@ -1130,36 +1107,6 @@ This limitation exists because the model's mixed attention pattern (bidirectiona
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
:::
|
||||
|
||||
:::{warning}
|
||||
The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
|
||||
|
||||
For the best results, we recommend using the following dependency versions (tested on A10 and L40):
|
||||
|
||||
```text
|
||||
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
|
||||
torch==2.5.1
|
||||
torchvision==0.20.1
|
||||
transformers==4.48.1
|
||||
tokenizers==0.21.0
|
||||
tiktoken==0.7.0
|
||||
vllm==0.7.0
|
||||
|
||||
# Optional but recommended for improved performance and stability
|
||||
triton==3.1.0
|
||||
xformers==0.0.28.post3
|
||||
uvloop==0.21.0
|
||||
protobuf==5.29.3
|
||||
openai==1.60.2
|
||||
opencv-python-headless==4.11.0.86
|
||||
pillow==10.4.0
|
||||
|
||||
# Installed FlashAttention (for float16 only)
|
||||
flash-attn>=2.5.6 # Not used in float32, but should be documented
|
||||
```
|
||||
|
||||
**Note:** Make sure you understand the security implications of using outdated packages.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||
@ -1174,7 +1121,7 @@ To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from
|
||||
`pip install git+https://github.com/huggingface/transformers.git`.
|
||||
|
||||
Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
|
||||
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
|
||||
`--mm-processor-kwargs '{"use_audio_in_video": True}'`.
|
||||
:::
|
||||
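For example, a sketch of enabling this when serving (the model name is an assumption):

```console
vllm serve Qwen/Qwen2.5-Omni-7B --mm-processor-kwargs '{"use_audio_in_video": true}'
```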
|
||||
### Pooling Models
|
||||
|
@ -77,10 +77,6 @@ bash run_cluster.sh \
|
||||
|
||||
Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses.
|
||||
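One way to do this is to export the variable on each node before starting the cluster script, for example (the address is illustrative and must be unique per node):

```bash
# Run on each worker node; every node gets its own address on the private segment.
export VLLM_HOST_IP=10.0.0.11
bash run_cluster.sh ...  # remaining arguments as shown above
```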
|
||||
:::{warning}
|
||||
It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties.
|
||||
:::
|
||||
|
||||
:::{warning}
|
||||
Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`.
|
||||
:::
|
||||
|
@ -16,7 +16,6 @@ Below, you can find an explanation of every engine argument:
|
||||
:func: _engine_args_parser
|
||||
:prog: vllm serve
|
||||
:nodefaultconst:
|
||||
:markdownhelp:
|
||||
```
|
||||
|
||||
## Async Engine Arguments
|
||||
@ -30,5 +29,4 @@ Additional arguments are available to the asynchronous engine which is used for
|
||||
:func: _async_engine_args_parser
|
||||
:prog: vllm serve
|
||||
:nodefaultconst:
|
||||
:markdownhelp:
|
||||
```
|
||||
|
@ -1,56 +0,0 @@
|
||||
# LMCache Examples
|
||||
|
||||
This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing.
|
||||
|
||||
## 1. Disaggregated Prefill in vLLM v1
|
||||
|
||||
This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Install [LMCache](https://github.com/LMCache/LMCache). You can simply run `pip install lmcache`.
|
||||
- Install [NIXL](https://github.com/ai-dynamo/nixl).
|
||||
- At least 2 GPUs
|
||||
- Valid Hugging Face token (HF_TOKEN) for Llama 3.1 8B Instruct.
|
||||
|
||||
### Usage
|
||||
|
||||
Run
|
||||
`cd disagg_prefill_lmcache_v1`
|
||||
to get into the `disagg_prefill_lmcache_v1` folder, and then run
|
||||
|
||||
```bash
|
||||
bash disagg_example_nixl.sh
|
||||
```
|
||||
|
||||
to run disaggregated prefill and benchmark the performance.
|
||||
|
||||
### Components
|
||||
|
||||
#### Server Scripts
|
||||
- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches individual vLLM servers for prefill/decode, and also launches the proxy server.
|
||||
- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between prefiller and decoder
|
||||
- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example
|
||||
|
||||
#### Configuration
|
||||
- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for prefiller server
|
||||
- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for decoder server
|
||||
|
||||
#### Log Files
|
||||
The main script generates several log files:
|
||||
- `prefiller.log` - Logs from the prefill server
|
||||
- `decoder.log` - Logs from the decode server
|
||||
- `proxy.log` - Logs from the proxy server
|
||||
|
||||
## 2. CPU Offload Examples
|
||||
|
||||
- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
|
||||
- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
|
||||
|
||||
## 3. KV Cache Sharing
|
||||
|
||||
The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.
|
||||
|
||||
## 4. Disaggregated Prefill in vLLM v0
|
||||
|
||||
The `disaggregated_prefill_lmcache_v0.py` provides an example of how to run disaggregated prefill in vLLM v0.
|
@ -1,57 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of cpu offloading
|
||||
with LMCache in vLLM v1.
|
||||
|
||||
Note that lmcache needs to be installed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Enable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "True"
|
||||
# Set local CPU memory limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
|
||||
# This example script runs two requests with a shared prefix.
|
||||
shared_prompt = "Hello, how are you?" * 1000
|
||||
first_prompt = [
|
||||
shared_prompt + "Hello, my name is",
|
||||
]
|
||||
second_prompt = [
|
||||
shared_prompt + "Tell me a very long story",
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
# Note that LMCache is not compatible with chunked prefill for now.
|
||||
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8)
|
||||
|
||||
# Should be able to see logs like the following:
|
||||
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
|
||||
# This indicates that the KV cache has been stored in LMCache.
|
||||
outputs = llm.generate(first_prompt, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
@ -1,13 +0,0 @@
|
||||
local_cpu: False
|
||||
max_local_cpu_size: 0
|
||||
#local_disk:
|
||||
max_local_disk_size: 0
|
||||
remote_serde: NULL
|
||||
|
||||
enable_nixl: True
|
||||
nixl_role: "receiver"
|
||||
nixl_peer_host: "localhost"
|
||||
nixl_peer_port: 55555
|
||||
nixl_buffer_size: 1073741824 # 1GB
|
||||
nixl_buffer_device: "cuda"
|
||||
nixl_enable_gc: True
|
@ -1,13 +0,0 @@
|
||||
local_cpu: False
|
||||
max_local_cpu_size: 0
|
||||
#local_disk:
|
||||
max_local_disk_size: 0
|
||||
remote_serde: NULL
|
||||
|
||||
enable_nixl: True
|
||||
nixl_role: "sender"
|
||||
nixl_peer_host: "localhost"
|
||||
nixl_peer_port: 55555
|
||||
nixl_buffer_size: 1073741824 # 1GB
|
||||
nixl_buffer_device: "cuda"
|
||||
nixl_enable_gc: True
|
@ -1,136 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."
|
||||
|
||||
|
||||
PIDS=()
|
||||
|
||||
# Switch to the directory of the current script
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
check_hf_token() {
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
if [[ "$HF_TOKEN" != hf_* ]]; then
|
||||
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
echo "HF_TOKEN is set and valid."
|
||||
}
|
||||
|
||||
check_num_gpus() {
|
||||
# can you check if the number of GPUs are >=2 via nvidia-smi?
|
||||
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
if [ "$num_gpus" -lt 2 ]; then
|
||||
echo "You need at least 2 GPUs to run disaggregated prefill."
|
||||
exit 1
|
||||
else
|
||||
echo "Found $num_gpus GPUs."
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_python_library_installed() {
|
||||
echo "Checking if $1 is installed..."
|
||||
python -c "import $1" > /dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
if [ "$1" == "nixl" ]; then
|
||||
echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
|
||||
else
|
||||
echo "$1 is not installed. Please install it via pip install $1."
|
||||
fi
|
||||
exit 1
|
||||
else
|
||||
echo "$1 is installed."
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM # prevent re-entrancy
|
||||
kill -- -$$ # negative PID == “this whole process-group”
|
||||
wait # reap children so we don't leave zombies
|
||||
exit 0
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
local timeout_seconds=1200
|
||||
local start_time=$(date +%s)
|
||||
|
||||
echo "Waiting for server on port $port..."
|
||||
|
||||
while true; do
|
||||
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
local now=$(date +%s)
|
||||
if (( now - start_time >= timeout_seconds )); then
|
||||
echo "Timeout waiting for server"
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
check_hf_token
|
||||
check_num_gpus
|
||||
ensure_python_library_installed lmcache
|
||||
ensure_python_library_installed nixl
|
||||
ensure_python_library_installed pandas
|
||||
ensure_python_library_installed datasets
|
||||
ensure_python_library_installed vllm
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
echo "Launching prefiller, decoder and proxy..."
|
||||
echo "Please check prefiller.log, decoder.log and proxy.log for logs."
|
||||
|
||||
bash disagg_vllm_launcher.sh prefiller \
|
||||
> >(tee prefiller.log) 2>&1 &
|
||||
prefiller_pid=$!
|
||||
PIDS+=($prefiller_pid)
|
||||
|
||||
bash disagg_vllm_launcher.sh decoder \
|
||||
> >(tee decoder.log) 2>&1 &
|
||||
decoder_pid=$!
|
||||
PIDS+=($decoder_pid)
|
||||
|
||||
python3 disagg_proxy_server.py \
|
||||
--host localhost \
|
||||
--port 9000 \
|
||||
--prefiller-host localhost \
|
||||
--prefiller-port 8100 \
|
||||
--decoder-host localhost \
|
||||
--decoder-port 8200 \
|
||||
> >(tee proxy.log) 2>&1 &
|
||||
proxy_pid=$!
|
||||
PIDS+=($proxy_pid)
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
wait_for_server 9000
|
||||
|
||||
echo "All servers are up. Starting benchmark..."
|
||||
|
||||
# begin benchmark
|
||||
cd ../../../benchmarks/
|
||||
python benchmark_serving.py --port 9000 --seed $(date +%s) \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
|
||||
|
||||
echo "Benchmarking done. Cleaning up..."
|
||||
|
||||
cleanup
|
||||
|
||||
}
|
||||
|
||||
main
|
@ -1,193 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Lifespan context manager to handle startup and shutdown events.
|
||||
"""
|
||||
# Startup: Initialize clients
|
||||
prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1'
|
||||
decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1'
|
||||
|
||||
app.state.prefill_client = httpx.AsyncClient(timeout=None,
|
||||
base_url=prefiller_base_url)
|
||||
app.state.decode_client = httpx.AsyncClient(timeout=None,
|
||||
base_url=decoder_base_url)
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown: Close clients
|
||||
await app.state.prefill_client.aclose()
|
||||
await app.state.decode_client.aclose()
|
||||
|
||||
|
||||
# Update FastAPI app initialization to use lifespan
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
class StatsCalculator:
|
||||
|
||||
def __init__(self):
|
||||
self._stats = []
|
||||
self._last_log_time = time.time()
|
||||
|
||||
def add(self, value):
|
||||
self._stats.append(value)
|
||||
if time.time() - self._last_log_time > 5:
|
||||
self._log_stats()
|
||||
self._last_log_time = time.time()
|
||||
|
||||
def _log_stats(self):
|
||||
# Print average, median, and 99th percentile
|
||||
np_arr = np.array(self._stats)
|
||||
output_str = f"\nNum requests: {len(self._stats)}" + \
|
||||
"\nPrefill node TTFT stats:" + \
|
||||
f"\n - Average (ms): {np.mean(np_arr)}" + \
|
||||
f"\n - Median (ms): {np.median(np_arr)}" + \
|
||||
f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
|
||||
print("===============================", output_str,
|
||||
"===============================")
|
||||
|
||||
|
||||
stats_calculator = StatsCalculator()
|
||||
counter = 0
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--prefiller-host", type=str, default="localhost")
|
||||
parser.add_argument("--prefiller-port", type=int, default=8100)
|
||||
parser.add_argument("--decoder-host", type=str, default="localhost")
|
||||
parser.add_argument("--decoder-port", type=int, default=8200)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
# Initialize variables to hold the persistent clients
|
||||
app.state.prefill_client = None
|
||||
app.state.decode_client = None
|
||||
|
||||
|
||||
async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
|
||||
req_data: dict):
|
||||
"""
|
||||
Send a request to a service using a persistent client.
|
||||
"""
|
||||
req_data = req_data.copy()
|
||||
req_data['max_tokens'] = 1
|
||||
if 'max_completion_tokens' in req_data:
|
||||
req_data['max_completion_tokens'] = 1
|
||||
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
response = await client.post(endpoint, json=req_data, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
|
||||
|
||||
async def stream_service_response(client: httpx.AsyncClient, endpoint: str,
|
||||
req_data: dict):
|
||||
"""
|
||||
Asynchronously stream the response from a service using a persistent client.
|
||||
"""
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
async with client.stream("POST", endpoint, json=req_data,
|
||||
headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
async for chunk in response.aiter_bytes():
|
||||
yield chunk
|
||||
|
||||
|
||||
@app.post("/v1/completions")
|
||||
async def handle_completions(request: Request):
|
||||
global counter, stats_calculator
|
||||
counter += 1
|
||||
|
||||
st = time.time()
|
||||
try:
|
||||
req_data = await request.json()
|
||||
|
||||
# Send request to prefill service, ignore the response
|
||||
await send_request_to_service(app.state.prefill_client, "/completions",
|
||||
req_data)
|
||||
|
||||
et = time.time()
|
||||
stats_calculator.add(et - st)
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(app.state.decode_client,
|
||||
"/completions",
|
||||
req_data):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(),
|
||||
media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
exc_info = sys.exc_info()
|
||||
print("Error occurred in disagg prefill proxy server"
|
||||
" - completions endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
async def handle_chat_completions(request: Request):
|
||||
global counter, stats_calculator
|
||||
counter += 1
|
||||
|
||||
st = time.time()
|
||||
try:
|
||||
req_data = await request.json()
|
||||
|
||||
# Send request to prefill service, ignore the response
|
||||
await send_request_to_service(app.state.prefill_client,
|
||||
"/chat/completions", req_data)
|
||||
|
||||
et = time.time()
|
||||
stats_calculator.add(et - st)
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(app.state.decode_client,
|
||||
"/chat/completions",
|
||||
req_data):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(),
|
||||
media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
exc_info = sys.exc_info()
|
||||
print("Error occurred in disagg prefill proxy server "
|
||||
" - chat completions endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
global global_args
|
||||
global_args = parse_args()
|
||||
|
||||
import uvicorn
|
||||
uvicorn.run(app, host=global_args.host, port=global_args.port)
|
@ -1,59 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
if [[ $# -lt 1 ]]; then
|
||||
echo "Usage: $0 <prefiller | decoder> [model]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ $# -eq 1 ]]; then
|
||||
echo "Using default model: meta-llama/Llama-3.1-8B-Instruct"
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
else
|
||||
echo "Using model: $2"
|
||||
MODEL=$2
|
||||
fi
|
||||
|
||||
|
||||
if [[ $1 == "prefiller" ]]; then
|
||||
# Prefiller listens on port 8100
|
||||
prefill_config_file=$SCRIPT_DIR/configs/lmcache-prefiller-config.yaml
|
||||
|
||||
UCX_TLS=cuda_ipc,cuda_copy,tcp \
|
||||
LMCACHE_CONFIG_FILE=$prefill_config_file \
|
||||
LMCACHE_USE_EXPERIMENTAL=True \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
vllm serve $MODEL \
|
||||
--port 8100 \
|
||||
--disable-log-requests \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
|
||||
|
||||
|
||||
elif [[ $1 == "decoder" ]]; then
|
||||
# Decoder listens on port 8200
|
||||
decode_config_file=$SCRIPT_DIR/configs/lmcache-decoder-config.yaml
|
||||
|
||||
UCX_TLS=cuda_ipc,cuda_copy,tcp \
|
||||
LMCACHE_CONFIG_FILE=$decode_config_file \
|
||||
LMCACHE_USE_EXPERIMENTAL=True \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
CUDA_VISIBLE_DEVICES=1 \
|
||||
vllm serve $MODEL \
|
||||
--port 8200 \
|
||||
--disable-log-requests \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
|
||||
|
||||
|
||||
else
|
||||
echo "Invalid role: $1"
|
||||
echo "Should be either prefill, decode"
|
||||
exit 1
|
||||
fi
|
@ -1,130 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of remote KV cache sharing
|
||||
with LMCache.
|
||||
We will launch 2 vllm instances, and launch an additional LMCache server.
|
||||
KV cache is transferred in the following manner:
|
||||
(1) vLLM instance 1 -> LMCache server (KV cache store).
|
||||
(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve).
|
||||
|
||||
Note that lmcache needs to be installed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from multiprocessing import Event, Process
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# The port to start LMCache server
|
||||
port = 8100
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Disable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "False"
|
||||
# Set local CPU memory buffer limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
# Set the remote URL for LMCache server
|
||||
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
|
||||
# Set the serializer/deserializer between vllm and LMCache server
|
||||
# `naive` indicates using raw bytes of the tensor without any compression
|
||||
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"
|
||||
|
||||
prompts = [
|
||||
"Hello, how are you?" * 1000,
|
||||
]
|
||||
|
||||
|
||||
def run_store(store_done, prompts):
|
||||
# We use GPU 0 for KV cache store process.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("KV cache store is finished.")
|
||||
store_done.set()
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_retrieve(store_done, prompts, timeout=1):
|
||||
# We use GPU 1 for KV cache retrieve process.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# of memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
print("Waiting for KV cache store to finish...")
|
||||
store_done.wait()
|
||||
time.sleep(timeout)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_lmcache_server(port):
|
||||
server_proc = subprocess.Popen([
|
||||
"python", "-m", "lmcache.experimental.server", "localhost",
|
||||
str(port)
|
||||
])
|
||||
return server_proc
|
||||
|
||||
|
||||
def main():
|
||||
store_done = Event()
|
||||
store_process = Process(target=run_store, args=(store_done, prompts))
|
||||
retrieve_process = Process(target=run_retrieve, args=(store_done, prompts))
|
||||
lmcache_server_process = run_lmcache_server(port)
|
||||
|
||||
# Start KV cache store process
|
||||
store_process.start()
|
||||
|
||||
# Start KV cache retrieve process
|
||||
retrieve_process.start()
|
||||
|
||||
# Clean up the processes
|
||||
store_process.join()
|
||||
retrieve_process.terminate()
|
||||
lmcache_server_process.terminate()
|
||||
lmcache_server_process.wait()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -38,37 +38,6 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the settings in this example are somewhat different from what is
|
||||
# optimal for granite speech, and it is generally recommended to use beam
|
||||
# search. Check the model README for suggested settings.
|
||||
# https://huggingface.co/ibm-granite/granite-speech-3.3-8b
|
||||
model_name = "ibm-granite/granite-speech-3.3-8b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
# The model has an audio-specific lora directly in its model dir;
|
||||
# it should be enabled whenever you pass audio inputs to the model.
|
||||
speech_lora_path = model_name
|
||||
audio_placeholder = "<|audio|>" * audio_count
|
||||
prompts = f"<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>{audio_placeholder}{question}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompts,
|
||||
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# MiniCPM-O
|
||||
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "openbmb/MiniCPM-o-2_6"
|
||||
@ -240,7 +209,6 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
"qwen2_audio": run_qwen2_audio,
|
||||
|
@ -52,8 +52,8 @@ def main():
|
||||
|
||||
args = parse_args()
|
||||
|
||||
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
|
||||
|
||||
max_model_len = 2048
|
||||
|
||||
@ -81,7 +81,7 @@ def main():
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config={
|
||||
"method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
|
||||
"method": "eagle",
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
@ -95,9 +95,6 @@ def main():
|
||||
outputs = llm.generate(prompt_token_ids=prompt_ids,
|
||||
sampling_params=sampling_params)
|
||||
|
||||
if not hasattr(outputs, "metrics") or outputs.metrics is None:
|
||||
return
|
||||
|
||||
# calculate the average number of accepted tokens per forward pass, +1 is
|
||||
# to account for the token from the target model that's always going to be
|
||||
# accepted
|
||||
@ -112,11 +109,6 @@ def main():
|
||||
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
|
||||
print("-" * 50)
|
||||
|
||||
# print acceptance at each token position
|
||||
for i in range(len(acceptance_counts)):
|
||||
print(f"acceptance at token {i}:"
|
||||
f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -791,9 +791,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--num-images",
|
||||
"-n",
|
||||
type=int,
|
||||
choices=list(range(1,
|
||||
len(IMAGE_URLS) + 1)), # the max number of images
|
||||
choices=list(range(1, 13)), # 12 is the max number of images
|
||||
default=2,
|
||||
help="Number of images to use for the demo.")
|
||||
return parser.parse_args()
|
||||
|
@ -1,49 +1,43 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
To run this example, you need to start the vLLM server:
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from openai import BadRequestError, OpenAI
|
||||
from pydantic import BaseModel
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="-",
|
||||
)
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
def guided_choice_completion(client: OpenAI, model: str):
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Classify this sentiment: vLLM is wonderful!"
|
||||
}],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="Qwen/Qwen2.5-3B-Instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Classify this sentiment: vLLM is wonderful!"
|
||||
}],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
|
||||
# Guided decoding by Regex
|
||||
def guided_regex_completion(client: OpenAI, model: str):
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": r"\w+@\w+\.com\n",
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
completion = client.chat.completions.create(
|
||||
model="Qwen/Qwen2.5-3B-Instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": "\w+@\w+\.com\n",
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
@ -60,100 +54,66 @@ class CarDescription(BaseModel):
car_type: CarType


def guided_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()

prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
return completion.choices[0].message.content
json_schema = CarDescription.model_json_schema()

prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)

# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
simplified_sql_grammar = """
?start: select_statement

select_statement ::= "SELECT " column " from " table " where " condition
?select_statement: "SELECT " column_list " FROM " table_name

column ::= "col_1 " | "col_2 "
?column_list: column_name ("," column_name)*

table ::= "table_1 " | "table_2 "
?table_name: identifier

condition ::= column "= " number
?column_name: identifier

number ::= "1 " | "2 "
"""
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""

prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)

# Extra backend options
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")

try:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
completion = client.chat.completions.create(
model=model,
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
extra_body={
"guided_regex": "\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback"
},
)
return completion.choices[0].message.content


# Extra backend options
def extra_backend_options_completion(client: OpenAI, model: str):
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")

try:
# The no-fallback option forces vLLM to use xgrammar, so when it fails
# you get a 400 with the reason why
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={
"guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback"
},
)
return completion.choices[0].message.content
except BadRequestError as e:
print("This error is expected:", e)


def main():
client: OpenAI = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)

model = "Qwen/Qwen2.5-3B-Instruct"

print("Guided Choice Completion:")
print(guided_choice_completion(client, model))

print("\nGuided Regex Completion:")
print(guided_regex_completion(client, model))

print("\nGuided JSON Completion:")
print(guided_json_completion(client, model))

print("\nGuided Grammar Completion:")
print(guided_grammar_completion(client, model))

print("\nExtra Backend Options Completion:")
print(extra_backend_options_completion(client, model))


if __name__ == "__main__":
main()
except BadRequestError as e:
print("This error is expected:", e)

@ -1,85 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI

# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.


def main():
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)

messages = [{
"role":
"user",
"content":
"""
You have access to the following function to retrieve the weather in a city:

{
"name": "get_weather",
"parameters": {
"city": {
"param_type": "string",
"description": "The city to get the weather for",
"required": True
}
}
}

If you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""
}]

response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=messages,
response_format={
"type":
"structural_tag",
"structures": [{
"begin": "<function=get_weather>",
"schema": {
"type": "object",
"properties": {
"city": {
"type": "string"
}
}
},
"end": "</function>"
}],
"triggers": ["<function="]
})
print(response)


if __name__ == "__main__":
main()

@ -25,28 +25,29 @@ from pydantic import BaseModel
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

def print_completion_details(completion):
print("reasoning_content: ",
completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)

models = client.models.list()
model = models.data[0].id

# Guided decoding by Regex
def guided_regex_completion(client: OpenAI, model: str):
prompt = ("What is the capital of France?")
prompt = ("What is the capital of France?")

completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={
"guided_regex": "(Paris|London)",
},
)
print_completion_details(completion)
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={
"guided_regex": "(Paris|London)",
},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)


class People(BaseModel):
@ -54,19 +55,19 @@ class People(BaseModel):
age: int


def guided_json_completion(client: OpenAI, model: str):
json_schema = People.model_json_schema()
json_schema = People.model_json_schema()

prompt = ("Generate a JSON with the name and age of one random person.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print_completion_details(completion)
prompt = ("Generate a JSON with the name and age of one random person.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)


# Guided decoding by JSON using Pydantic schema
@ -83,73 +84,46 @@ class CarDescription(BaseModel):
car_type: CarType


def guided_car_json_completion(client: OpenAI, model: str):
json_schema = CarDescription.model_json_schema()

prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print_completion_details(completion)
json_schema = CarDescription.model_json_schema()

prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)

# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
simplified_sql_grammar = """
root ::= select_statement
simplified_sql_grammar = """
?start: select_statement

select_statement ::= "SELECT " column " from " table " where " condition
?select_statement: "SELECT " column_list " FROM " table_name

column ::= "col_1 " | "col_2 "
?column_list: column_name ("," column_name)*

table ::= "table_1 " | "table_2 "
?table_name: identifier

condition ::= column "= " number
?column_name: identifier

number ::= "1 " | "2 "
"""
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""

# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print_completion_details(completion)


def main():
client: OpenAI = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model: str = models.data[0].id

print("Guided Regex Completion:")
guided_regex_completion(client, model)

print("\nGuided JSON Completion (People):")
guided_json_completion(client, model)

print("\nGuided JSON Completion (CarDescription):")
guided_car_json_completion(client, model)

print("\nGuided Grammar Completion:")
guided_grammar_completion(client, model)


if __name__ == "__main__":
main()
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model=model,
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)

@ -25,11 +25,11 @@ def main():
responses = client.embeddings.create(
input=["Follow the white rabbit."],
model=model,
dimensions=32,
dimensions=1,
)

for data in responses.data:
print(data.embedding) # List of float of len 32
print(data.embedding) # List of float of len 1


if __name__ == "__main__":

@ -1,48 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html

Run `python3 ray_serve_deepseek.py` to deploy the model.
"""

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
model_loading_config={
"model_id": "deepseek",
# Since DeepSeek model is huge, it is recommended to pre-download
# the model to local disk, say /path/to/the/model and specify:
# model_source="/path/to/the/model"
"model_source": "deepseek-ai/DeepSeek-R1",
},
deployment_config={
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 1,
}
},
# Change to the accelerator type of the node
accelerator_type="H100",
runtime_env={"env_vars": {
"VLLM_USE_V1": "1"
}},
# Customize engine arguments as needed (e.g. vLLM engine kwargs)
engine_kwargs={
"tensor_parallel_size": 8,
"pipeline_parallel_size": 2,
"gpu_memory_utilization": 0.92,
"dtype": "auto",
"max_num_seqs": 40,
"max_model_len": 16384,
"enable_chunked_prefill": True,
"enable_prefix_caching": True,
"trust_remote_code": True,
},
)

# Deploy the application
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)

@ -1,116 +0,0 @@
{%- macro is_array_of_type_objects(var) -%}
{%- if var is iterable and var is not string -%}
{%- set valid = true -%}
{%- for item in var -%}
{%- if 'type' not in item -%}
{%- set valid = false -%}
{%- break -%}
{%- endif -%}
{%- endfor -%}
{{ valid }}
{%- else -%}
{{ false }}
{%- endif -%}
{%- endmacro %}

{%- macro render_message(message) %}
{%- if message['content'] is string %}
{{- message['content']|trim }}
{%- elif is_array_of_type_objects(data) == 'True' %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text']|trim }}
{%- endif %}
{%- endfor %}
{%- else %}
{{- message['content']|tojson }}
{%- endif %}
{%- endmacro %}

{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question."}) %}
{%- endif %}

{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
'or more function/tool calls to fulfill the task. \n'
'If none are needed, then proceed to the response.\n\n'
'Tool Call Syntax: You can call tools using the following syntax:\n'
'{"name": function name, "parameters": dictionary of argument name and its value}.\n'
'Separate multiple function calls by "; ". Do not use variables.\n'
'Do not include anything else when calling the tools with the syntax above.\n\n'
'Here is a list of functions in JSON format that you can invoke.\n' %}

{{- "<|header_start|>system<|header_end|>\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- render_message(system_message) }}
{{ "<|eot|>\n" }}

{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' }}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- render_message(first_user_message) + "\n<|eot|>"}}
{%- endif %}

{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{{- render_message(message) }}
{{- "\n<|eot|>" }}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
{{- render_message(message) }}
{%- for tool_call in message.tool_calls %}
{{- '{"name": "' + tool_call.function.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.function.arguments | tojson }}
{{- "}" }}
{%- endfor %}
{{- "\n<|eot|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
{{- render_message(message) }}
{{- "\n<|eom|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}

@ -46,7 +46,8 @@ vllm = "vllm.entrypoints.cli.main:main"

[tool.setuptools.packages.find]
where = ["."]
include = ["vllm*"]
exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"]
namespaces = false

[tool.yapfignore]
ignore_patterns = [
@ -58,8 +59,7 @@ ignore_patterns = [
line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
"examples/other/fp8/quantizer/quantize.py"
]

[tool.ruff.lint.per-file-ignores]

@ -7,7 +7,6 @@ sphinx-togglebutton==0.3.2
myst-parser==3.0.1
msgspec
cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:

# packages to install to build the documentation
cachetools

@ -9,4 +9,4 @@ numpy==1.26.4
tabulate
setuptools>=61
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768

@ -34,7 +34,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.3
transformers==4.51.1
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.

@ -737,7 +737,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.51.3
transformers==4.51.1
# via
# -r requirements/test.in
# genai-perf

setup.py
@ -269,17 +269,15 @@ class cmake_build_ext(build_ext):
# First, run the standard build_ext command to compile the extensions
super().run()

# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# copy vllm/vllm_flash_attn/*.py from self.build_lib to current
# directory so that they can be included in the editable build
import glob
files = glob.glob(os.path.join(self.build_lib, "vllm",
"vllm_flash_attn", "**", "*.py"),
recursive=True)
files = glob.glob(
os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn",
file.split("vllm/vllm_flash_attn/")[-1])
os.path.basename(file))
print(f"Copying {file} to {dst_file}")
os.makedirs(os.path.dirname(dst_file), exist_ok=True)
self.copy_file(file, dst_file)


@ -379,22 +377,13 @@ class repackage_wheel(build_ext):
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]

file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))

# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import re
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
file_members = filter(lambda x: x.filename in files_to_copy,
wheel.filelist)

for file in file_members:
print(f"Extracting and including {file.filename} "

@ -20,11 +20,15 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]

Some files were not shown because too many files have changed in this diff.