Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.9.1rc1...fix-precom (172 commits)
Commits (SHA1):
e17250f0d2, 4959915089, 8d1e89d946, 36239f79dd, dfada85eee, ed33349738, d49adea1f9, 14fdd21d39,
04fefe7c9a, 3b523e38d9, 16c16301c8, 9206d0ff01, a89209b78d, ffacb222cb, 12575cfa7a, 8b6e1d639c,
735a9de71f, 257ab95439, cca91a7a10, f04d604567, 19a53b2783, eccdc8318c, 5f52a84685, d4629dc43f,
6e9cc73f67, c53711bd63, dac8cc49f4, a44b1c951d, b447624ee3, cda92307c1, bf57ccc5c2, ffb2cd6b54,
ca94d7fa00, 5a1c2e15d8, 4c8f64faa7, 93aee29fdb, 154d063b9f, ccd7c05089, c48c6c4008, aed8468642,
5c76b9cdaf, ddfed314f9, 5b3ad5ecf2, ede5c4ebdf, 07334959d8, 119f683949, 0860087aff, 6bc7b57315,
90f9c2eb5c, 387bdf0ab9, 5e5baa91aa, 836d4ce140, c3fec47bb7, 1173804dca, 4d5424029b, 3e7506975c,
ee35e96ac3, dec66d253b, 8d120701fd, f40f763f12, 26bc46ef89, a77aea59fd, b692e9cd07, 367871a469,
92183b41f3, c6703d1e0d, a5e7242d5f, 91b2c17a55, 055915e6ce, 3d330c4c09, 0b73736a0d, ee1531bc38,
e13945f9dd, 08500011d3, 861a0a0a39, bc956b38d0, 294fc1e2c9, 2db9044ab6, 6fa718a460, 06be858828,
d1e34cc9ac, bd517eb9fe, d65668b4e8, aafbbd981f, 0f0874515a, 3597b06a4f, 1015296b79, ce9dc02c93,
a24cb91600, 7e8d97dd3f, d70bc7c029, ce688ad46e, cefdb9962d, ace5cdaff0, 6458721108, bb4a0decef,
c707cfc12e, 7b3c9ff91d, c68698b326, e3b12667d4, e6aab5de29, c57bb199b3, dba68f9159, a3319f4f04,
9d880f594d, 017ef648e9, 4b25ab14e2, f98548b9da, 96846bb360, b6efafd9e4, 1129e2b1ab, c742438f8b,
73e2e0118f, c9280e6346, af09b3f0a0, 4f6c42fa0a, dff680001d, 2e090bd5df, 1b0b065eb5, d5bdf899e4,
7e3e74c97c, 3f6341bf7f, e5d35d62f5, 2f1c19b245, 42f52cc95b, 97a9465bbc, c7ea0b56cd, 29fa5cac1c,
b2d9be6f7d, 04a55612dd, 89b0f84e17, 497a91e9f7, 943ffa5703, 5c8d34a42c, 3c8694eabe, 7484e1fce2,
a2142f0196, 871d6b7c74, 29a38f0352, a5115f4ff5, 68b4a26149, b8e809a057, 5039ec2336, 7c644ab6d5,
2d40665fe8, 96ada386b7, 1e473b3010, 2b1e2111b0, a45b979d9f, 3952731e8f, 77f0d465d0, 22c3c0aa4a,
33f8dba7c6, 5241ca50d6, da9b523ce1, b6553be1bc, 64a9af5afa, e4248849ec, 467bef18a3, 5f1ac1e1d1,
9368cc90b2, 32b3946bb4, 6b1391ca7e, a3f66e75d1, 319cb1e351, 1efef71645, 646d62f636, 6cd4ae8acd,
c016047ed7, 9af6d22e4c, 4589b94032, cc867be19c
@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

function cpu_tests() {
set -e
export NUMA_NODE=$2

# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"

docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"

# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
@ -43,7 +52,10 @@ function cpu_tests() {
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"

# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
@ -69,7 +81,7 @@ function cpu_tests() {
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
@ -177,6 +177,11 @@ steps:
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing

##### fast check tests #####
@ -305,6 +310,7 @@ steps:
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
@ -669,7 +675,7 @@ steps:
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
10 .github/ISSUE_TEMPLATE/400-bug-report.yml (vendored)
@ -8,6 +8,16 @@ body:
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data

Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
attributes:
label: Your current environment
35 .github/mergify.yml (vendored)
@ -65,6 +65,41 @@ pull_request_rules:
add:
- multi-modality

- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
- body~=(?i)Qwen
actions:
label:
add:
- qwen

- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm

- name: label-structured-output
description: Automatically apply structured-output label
conditions:
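Note: the `files~=` conditions in the new label rules above are regular expressions applied to the paths of changed files (per Mergify's condition syntax, with `or:` meaning any single match is enough). A minimal local sanity check of that idea, using plain Python `re` rather than Mergify itself; the patterns are copied from the label-rocm rule and the sample paths are made up:

```python
import re

# Patterns copied from the label-rocm rule above.
ROCM_PATTERNS = [
    r"^csrc/rocm/",
    r"^requirements/rocm.*\.txt",
    r"^vllm/attention/backends/rocm.*\.py",
]


def would_label_rocm(path: str) -> bool:
    # The rule uses an `or:` block, so any single pattern match triggers the label.
    return any(re.search(pattern, path) for pattern in ROCM_PATTERNS)


assert would_label_rocm("csrc/rocm/attention_kernels.cu")
assert would_label_rocm("vllm/attention/backends/rocm_flash_attn.py")
assert not would_label_rocm("vllm/platforms/cuda.py")
```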
2 .gitignore (vendored)
@ -200,5 +200,5 @@ benchmarks/**/*.json
actionlint
shellcheck*/

# Ingore moe/marlin_moe gen code
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
@ -20,12 +20,10 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
@ -145,6 +143,13 @@ repos:
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -542,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# CUTLASS MoE kernels

# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@ -156,7 +156,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
@ -123,7 +123,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
@ -171,6 +171,12 @@ if __name__ == "__main__":
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
@ -142,7 +142,7 @@ def main(args):
)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or "
"without automatic prefix caching."
@ -192,5 +192,11 @@ if __name__ == "__main__":
)

parser = EngineArgs.add_cli_args(parser)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
@ -218,7 +218,7 @@ def main(args):
)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or without "
"automatic prefix caching."
@ -268,5 +268,11 @@ if __name__ == "__main__":
)

parser = EngineArgs.add_cli_args(parser)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
@ -161,7 +161,7 @@ def main(args: argparse.Namespace):
json.dump(results, f, indent=4)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -204,6 +204,12 @@ if __name__ == "__main__":
)

parser = EngineArgs.add_cli_args(parser)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
@ -875,7 +875,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, result_json, file_name)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -1225,6 +1225,10 @@ if __name__ == "__main__":
"script chooses a LoRA module at random.",
)

args = parser.parse_args()
return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
json.dump(results, outfile, indent=4)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -1034,5 +1034,10 @@ if __name__ == "__main__":
help="Ratio of Structured Outputs requests",
)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
@ -595,7 +595,7 @@ def validate_args(args):
)


if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend",
@ -717,6 +717,12 @@ if __name__ == "__main__":
)

parser = AsyncEngineArgs.add_cli_args(parser)

return parser


if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
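Note: the benchmark script changes above all apply the same refactor: the parser construction that used to live under `if __name__ == "__main__":` moves into a `create_argument_parser()` helper that returns the parser, and the main guard only builds it, parses, and calls `main`. A minimal sketch of the resulting shape (illustrative only; the real scripts use `FlexibleArgumentParser` plus `EngineArgs.add_cli_args`/`AsyncEngineArgs.add_cli_args` as shown in the diffs):

```python
import argparse


def create_argument_parser() -> argparse.ArgumentParser:
    # Building (not parsing) the parser here keeps it importable and reusable
    # outside the __main__ guard.
    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
    )
    return parser


def main(args: argparse.Namespace) -> None:
    print(f"benchmarking backend={args.backend}")


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
```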
@ -1,5 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
@ -11,6 +10,80 @@ from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"fp8-tensor-w-token-a": dict(
|
||||
w="tensor", a="token", no_a_quant=False, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-tensor-a": dict(
|
||||
w="tensor", a="tensor", no_a_quant=False, enabled=True
|
||||
),
|
||||
"fp8-channel-w-token-a": dict(
|
||||
w="channel", a="token", no_a_quant=False, enabled=True
|
||||
),
|
||||
"fp8-channel-w-tensor-a": dict(
|
||||
w="channel", a="tensor", no_a_quant=False, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-token-a-noquant": dict(
|
||||
w="tensor", a="token", no_a_quant=True, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-tensor-a-noquant": dict(
|
||||
w="tensor", a="tensor", no_a_quant=True, enabled=True
|
||||
),
|
||||
"fp8-channel-w-token-a-noquant": dict(
|
||||
w="channel", a="token", no_a_quant=True, enabled=True
|
||||
),
|
||||
"fp8-channel-w-tensor-a-noquant": dict(
|
||||
w="channel", a="tensor", no_a_quant=True, enabled=False
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
|
||||
if w_type == "tensor":
|
||||
scale_b = torch.ones(1, device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
else:
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
|
||||
return b_fp8.t(), scale_b_fp8
|
||||
|
||||
|
||||
def build_fp8_runner(cfg, a, b, dtype, device):
|
||||
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
|
||||
|
||||
scale_a_const = (
|
||||
torch.ones(1, device=device, dtype=torch.float32)
|
||||
if cfg["a"] == "tensor"
|
||||
else None
|
||||
)
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
if cfg["a"] == "tensor":
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
|
||||
else:
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
|
||||
|
||||
def run():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
return run
|
||||
|
||||
if cfg["a"] == "tensor":
|
||||
|
||||
def run():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
else:
|
||||
|
||||
def run():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
@ -18,28 +91,8 @@ from vllm.triton_utils import triton
|
||||
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=[
|
||||
"torch-bf16",
|
||||
# "fp8-tensor-w-token-a",
|
||||
"fp8-tensor-w-tensor-a",
|
||||
"fp8-channel-w-token-a",
|
||||
# "fp8-channel-w-tensor-a",
|
||||
# "fp8-tensor-w-token-a-noquant",
|
||||
"fp8-tensor-w-tensor-a-noquant",
|
||||
"fp8-channel-w-token-a-noquant",
|
||||
# "fp8-channel-w-tensor-a-noquant",
|
||||
],
|
||||
line_names=[
|
||||
"torch-bf16",
|
||||
# "fp8-tensor-w-token-a",
|
||||
"fp8-tensor-w-tensor-a",
|
||||
"fp8-channel-w-token-a",
|
||||
# "fp8-channel-w-tensor-a",
|
||||
# "fp8-tensor-w-token-a-noquant",
|
||||
"fp8-tensor-w-tensor-a-noquant",
|
||||
"fp8-channel-w-token-a-noquant",
|
||||
# "fp8-channel-w-tensor-a-noquant",
|
||||
],
|
||||
line_vals=_enabled,
|
||||
line_names=_enabled,
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs FP8 GEMMs",
|
||||
args={},
|
||||
@ -50,144 +103,34 @@ def benchmark(batch_size, provider, N, K):
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
# Create input tensors
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if "torch-bf16" in provider:
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||
)
|
||||
|
||||
elif "fp8" in provider:
|
||||
# Weights are always quantized ahead of time
|
||||
if "noquant" in provider:
|
||||
# For no quantization, we just measure the GEMM
|
||||
if "tensor-w-token-a" in provider:
|
||||
# Dynamic per-token quant for A, per-tensor quant for B
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "tensor-w-tensor-a" in provider:
|
||||
# Static per-tensor quantization with fixed scales
|
||||
# for both A and B
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-token-a" in provider:
|
||||
# Static per-channel quantization for weights, per-token
|
||||
# quant for A
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-tensor-a" in provider:
|
||||
# Static per-channel quantization for weights, per-tensor
|
||||
# quant for A
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
else:
|
||||
# In these cases, we quantize the activations during the GEMM call
|
||||
if "tensor-w-token-a" in provider:
|
||||
# Dynamic per-token quant for A, per-tensor quant for B
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "tensor-w-tensor-a" in provider:
|
||||
# Static per-tensor quantization with fixed scales
|
||||
# for both A and B
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
assert scale_b_fp8.numel() == 1
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-token-a" in provider:
|
||||
# Static per-channel quantization for weights, per-token
|
||||
# quant for A
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
|
||||
a, use_per_token_if_dynamic=True
|
||||
)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
elif "channel-w-tensor-a" in provider:
|
||||
# Static per-channel quantization for weights, per-tensor
|
||||
# quant for A
|
||||
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
|
||||
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
|
||||
assert scale_b_fp8.numel() == N
|
||||
|
||||
def run_quant():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
b_fp8 = b_fp8.t()
|
||||
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_fp8_runner(cfg, a, b, dtype, device)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), quantiles=quantiles
|
||||
)
|
||||
|
||||
# Calculate TFLOP/s, two flops per multiply-add
|
||||
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
|
||||
return tflops(ms), tflops(max_ms), tflops(min_ms)
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
KN_model_names = []
|
||||
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||
for model, tp_size in models_tps:
|
||||
assert model in WEIGHT_SHAPES
|
||||
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||
out = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
KN_model_names.append(KN)
|
||||
return KN_model_names
|
||||
out.append(KN)
|
||||
return out
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -197,21 +140,13 @@ if __name__ == "__main__":
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
choices=[*WEIGHT_SHAPES.keys()],
|
||||
help="List of models to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[1],
|
||||
help="List of tensor parallel sizes",
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
args = parser.parse_args()
|
||||
|
||||
KN_model_names = prepare_shapes(args)
|
||||
for K, N, model_name in KN_model_names:
|
||||
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
|
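Note: the FP8 GEMM benchmark above is rewritten around a configuration table: `PROVIDER_CFGS` describes each quantization variant, `_enabled` selects the ones to plot, and `build_fp8_runner` returns a closure that either pre-quantizes the activation (`no_a_quant=True`, so only the GEMM is timed) or quantizes it inside the timed call. A stripped-down sketch of that structure with the vLLM quantization and GEMM ops replaced by stand-in functions (names like `fake_quant`/`fake_gemm` are placeholders, not vLLM APIs):

```python
PROVIDER_CFGS = {
    "fp8-tensor-w-tensor-a": dict(w="tensor", a="tensor", no_a_quant=False, enabled=True),
    "fp8-channel-w-token-a-noquant": dict(w="channel", a="token", no_a_quant=True, enabled=True),
    "fp8-channel-w-tensor-a": dict(w="channel", a="tensor", no_a_quant=False, enabled=False),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def fake_quant(x, mode):
    # Stand-in for vllm_scaled_fp8_quant: returns "quantized" data plus a scale.
    return x, f"scale[{mode}]"


def fake_gemm(a_q, b_q, scale_a, scale_b):
    # Stand-in for vllm_scaled_mm.
    return (a_q, b_q, scale_a, scale_b)


def build_runner(cfg, a, b):
    # Weights are always quantized ahead of time.
    b_q, scale_b = fake_quant(b, cfg["w"])

    if cfg["no_a_quant"]:
        # Pre-quantize the activation too, so the timed region is GEMM only.
        a_q, scale_a = fake_quant(a, cfg["a"])

        def run():
            return fake_gemm(a_q, b_q, scale_a, scale_b)

        return run

    def run():
        # Dynamic path: activation quantization is part of the timed region.
        a_q, scale_a = fake_quant(a, cfg["a"])
        return fake_gemm(a_q, b_q, scale_a, scale_b)

    return run


for name in _enabled:
    runner = build_runner(PROVIDER_CFGS[name], a="A", b="B")
    print(name, runner())
```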
169 benchmarks/kernels/bench_int8_gemm.py (Normal file)
@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"int8-tensor-w-token-a": dict(
|
||||
w="tensor", a="token", no_a_quant=False, enabled=False
|
||||
),
|
||||
"int8-tensor-w-tensor-a": dict(
|
||||
w="tensor", a="tensor", no_a_quant=False, enabled=True
|
||||
),
|
||||
"int8-channel-w-token-a": dict(
|
||||
w="channel", a="token", no_a_quant=False, enabled=True
|
||||
),
|
||||
"int8-channel-w-tensor-a": dict(
|
||||
w="channel", a="tensor", no_a_quant=False, enabled=False
|
||||
),
|
||||
"int8-tensor-w-token-a-noquant": dict(
|
||||
w="tensor", a="token", no_a_quant=True, enabled=False
|
||||
),
|
||||
"int8-tensor-w-tensor-a-noquant": dict(
|
||||
w="tensor", a="tensor", no_a_quant=True, enabled=True
|
||||
),
|
||||
"int8-channel-w-token-a-noquant": dict(
|
||||
w="channel", a="token", no_a_quant=True, enabled=True
|
||||
),
|
||||
"int8-channel-w-tensor-a-noquant": dict(
|
||||
w="channel", a="tensor", no_a_quant=True, enabled=False
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _quant_weight(b, w_type, device):
|
||||
if w_type == "tensor":
|
||||
scale_b = torch.ones(1, device=device, dtype=torch.float32)
|
||||
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
|
||||
assert scale_b_int8.numel() == 1
|
||||
else: # channel
|
||||
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
|
||||
assert scale_b_int8.numel() == b.shape[0]
|
||||
return b_int8.t(), scale_b_int8
|
||||
|
||||
|
||||
def build_int8_runner(cfg, a, b, dtype, device):
|
||||
# quant before running the kernel
|
||||
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
|
||||
|
||||
scale_a_const = None
|
||||
if cfg["a"] == "tensor":
|
||||
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
|
||||
|
||||
# no quant, create activation ahead
|
||||
if cfg["no_a_quant"]:
|
||||
if cfg["a"] == "tensor":
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
|
||||
else: # token
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
return run_quant
|
||||
|
||||
# dynamic quant, create activation inside
|
||||
if cfg["a"] == "tensor":
|
||||
|
||||
def run_quant():
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
else: # token
|
||||
|
||||
def run_quant():
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
return run_quant
|
||||
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
line_names=[k for k in _enabled],
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs INT8 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||
)
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_int8_runner(cfg, a, b, dtype, device)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), quantiles=quantiles
|
||||
)
|
||||
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
KN_model_names = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
KN_model_names.append(KN)
|
||||
return KN_model_names
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
help="List of models to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[1],
|
||||
help="List of tensor parallel sizes",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_int8_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
|
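Note: both GEMM benchmarks report results through the same conversion: an (M, K) x (K, N) GEMM performs 2·M·N·K floating-point operations (one multiply and one add per accumulation), so a latency of `t_ms` milliseconds becomes `(2 * M * N * K) * 1e-12 / (t_ms * 1e-3)` TFLOP/s, which is exactly the `to_tflops` lambda used above. A quick numeric check (the shapes below are arbitrary examples, not values from the scripts):

```python
def to_tflops(t_ms: float, M: int, N: int, K: int) -> float:
    # 2*M*N*K FLOPs, ms -> s, FLOP/s -> TFLOP/s.
    return (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)


# M=4096, N=K=8192 at 1.2 ms per GEMM:
# 2 * 4096 * 8192 * 8192 ≈ 5.5e11 FLOPs -> ≈ 458 TFLOP/s.
print(round(to_tflops(1.2, 4096, 8192, 8192), 1))  # 458.1
```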
@ -7,7 +7,6 @@ import time
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from itertools import product
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, TypedDict
|
||||
|
||||
import ray
|
||||
@ -43,7 +42,7 @@ def benchmark_config(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
block_quant_shape: List[int] = None,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> float:
|
||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
@ -400,7 +399,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int] = None,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
@ -532,7 +531,7 @@ def save_configs(
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int],
|
||||
block_quant_shape: list[int],
|
||||
) -> None:
|
||||
dtype_str = get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
@ -563,7 +562,6 @@ def main(args: argparse.Namespace):
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
config = SimpleNamespace(**config)
|
||||
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
@ -595,11 +593,7 @@ def main(args: argparse.Namespace):
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = (
|
||||
torch.float16
|
||||
if current_platform.is_rocm()
|
||||
else getattr(torch, config.torch_dtype)
|
||||
)
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
|
159 benchmarks/kernels/benchmark_moe_align_block_size.py (Normal file)
@ -0,0 +1,159 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||
moe_align_block_size_triton,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
|
||||
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
return torch.stack(
|
||||
[
|
||||
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
|
||||
for _ in range(num_tokens)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
|
||||
"""
|
||||
Verifies vllm vs. Triton
|
||||
"""
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
# 1. malloc space for triton and vllm
|
||||
# malloc enough space (max_num_tokens_padded) for the sorted ids
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids_triton = torch.empty(
|
||||
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
|
||||
expert_ids_triton = torch.zeros(
|
||||
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
|
||||
sorted_ids_vllm.fill_(topk_ids.numel())
|
||||
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
|
||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
|
||||
|
||||
# 2. run implementations
|
||||
moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_triton,
|
||||
expert_ids_triton,
|
||||
num_tokens_post_pad_triton,
|
||||
)
|
||||
|
||||
ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_vllm,
|
||||
expert_ids_vllm,
|
||||
num_tokens_post_pad_vllm,
|
||||
)
|
||||
print(f"✅ VLLM implementation works with {num_experts} experts!")
|
||||
|
||||
# 3. compare results
|
||||
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
|
||||
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
|
||||
):
|
||||
print("✅ Triton and VLLM implementations match.")
|
||||
else:
|
||||
print("❌ Triton and VLLM implementations DO NOT match.")
|
||||
print("Triton expert_ids:", expert_ids_triton)
|
||||
print("VLLM expert_ids:", expert_ids_vllm)
|
||||
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
|
||||
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
|
||||
|
||||
|
||||
# test configurations
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
topk_range = [1, 2, 8]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm", "triton"], # "triton"
|
||||
line_names=["VLLM", "Triton"], # "Triton"
|
||||
plot_name="moe-align-block-size-performance",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
"""Benchmark function for Triton."""
|
||||
block_size = 256
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
|
||||
sorted_ids.fill_(topk_ids.numel())
|
||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
||||
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
|
||||
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif provider == "triton":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--num_experts",
|
||||
type=int,
|
||||
default=64,
|
||||
choices=[8, 16, 32, 64, 128, 256],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--topk",
|
||||
type=int,
|
||||
default=8,
|
||||
choices=[2, 4, 8],
|
||||
help="Top-k value for correctness check.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Running correctness check...")
|
||||
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
|
||||
benchmark.run(print_data=True, show_plots=True)
|
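Note: the allocation `max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)` used in the new benchmark is a worst-case bound: moe_align_block_size rounds each expert's token count up to a multiple of `block_size`, so each expert can add at most `block_size - 1` padding slots on top of the `num_tokens * topk` real assignments. A small worked check of that bound in plain Python (independent of the CUDA/Triton kernels being benchmarked):

```python
import math


def padded_total(counts_per_expert, block_size):
    # Each expert's token count is rounded up to a multiple of block_size.
    return sum(math.ceil(c / block_size) * block_size for c in counts_per_expert)


num_tokens, topk, num_experts, block_size = 1024, 8, 64, 256
counts = [num_tokens * topk // num_experts] * num_experts   # 128 assignments per expert
total = padded_total(counts, block_size)                    # 64 * 256 = 16384
bound = num_tokens * topk + num_experts * (block_size - 1)  # 8192 + 16320 = 24512
assert total <= bound
print(total, bound)
```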
@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
"-fno-gpu-rdc")

endif()
@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -193,4 +190,4 @@ void paged_attention_v1(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP
@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -203,4 +200,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP
@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
}

template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) {
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
const int size) {
T max = max_data[0];
for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i];
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {

if (partition_num == 1) continue;

reducePartitonSoftmax(
reducePartitionSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions +
@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

// non-temproal load
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

// non-temproal load
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

// non-temproal load
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}

@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}

// non-temproal load
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}

void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg);
}

// non-temproal save
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif
@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_CHECK(false,
"numa_migrate_pages failed. errno: " + std::to_string(errno));
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
}

// restrict memory allocation node.
@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

return ss.str();
}
#endif
#endif
@ -13,232 +13,45 @@
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace {
|
||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
||||
int32_t col) {
|
||||
// don't worry about overflow because num_experts is relatively small
|
||||
return row * total_col + col;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <typename scalar_t, typename token_cnts_t>
|
||||
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
||||
int32_t* sorted_token_ids,
|
||||
int32_t* expert_ids,
|
||||
int32_t* total_tokens_post_pad,
|
||||
int32_t num_experts,
|
||||
int32_t block_size, size_t numel) {
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
|
||||
token_cnts_t* tokens_cnts =
|
||||
(token_cnts_t*)(shared_mem + num_experts +
|
||||
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* In the first step we compute token_cnts[thread_index + 1][expert_index],
|
||||
* which counts how many tokens in the token shard of thread_index are
|
||||
* assigned to expert expert_index.
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Each thread processes a token shard, calculating the index of each token
|
||||
* after sorting by expert number. Given the example topk_ids =
|
||||
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
|
||||
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
|
||||
* padding value(preset in python).
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(simon): this is temporarily adapted from
|
||||
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
|
||||
// we did this to unblock Deepseek V3 but there should be a better
|
||||
// implementation to manage shared memory.
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_global_mem_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
|
||||
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum) {
|
||||
extern __shared__ int32_t shared_counts[];
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* In the first step we compute token_cnts[thread_index + 1][expert_index],
|
||||
* which counts how many tokens in the token shard of thread_index are
|
||||
* assigned to expert expert_index.
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = cumsum[num_experts];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Each thread processes a token shard, calculating the index of each token
|
||||
* after sorting by expert number. Given the example topk_ids =
|
||||
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
|
||||
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
|
||||
* padding value(preset in python).
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||
}
|
||||
}
|
||||
|
||||
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
    int32_t block_size, size_t numel, int32_t* cumsum) {
  __shared__ int32_t shared_counts[32][8];

  const int warp_id = threadIdx.x / 32;
  const int experts_per_warp = 8;
  const int warp_id = threadIdx.x / WARP_SIZE;
  const int my_expert_start = warp_id * experts_per_warp;

  // Initialize shared_counts for this warp's experts
  for (int i = 0; i < experts_per_warp; ++i) {
    if (my_expert_start + i < num_experts) {
      shared_counts[warp_id][i] = 0;
    if (my_expert_start + i < padded_num_experts) {
      shared_counts[warp_id * experts_per_warp + i] = 0;
    }
  }

  __syncthreads();

  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
  const size_t start_idx = threadIdx.x * tokens_per_thread;
  const size_t tid = threadIdx.x;
  const size_t stride = blockDim.x;

  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
  }

  __syncthreads();

  // Single thread computes cumulative sum and total tokens
  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      int expert_count = 0;
      int warp_idx = (i - 1) / experts_per_warp;
      int expert_offset = (i - 1) % experts_per_warp;
      expert_count = shared_counts[warp_idx][expert_offset];
      expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];

      cumsum[i] =
          cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(

  __syncthreads();

  // Assign expert IDs to blocks
  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
@@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
    }
  }

// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
                                          int32_t* sorted_token_ids,
                                          int32_t* cumsum_buffer,
                                          size_t numel) {
__global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    size_t numel) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;

@@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
  }
}

template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
    int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
    int32_t block_size, size_t numel) {
  const size_t tid = threadIdx.x;
  const size_t stride = blockDim.x;

  extern __shared__ int32_t shared_mem[];
  int32_t* cumsum = shared_mem;
  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
  }

  for (size_t i = tid; i < numel; i += stride) {
    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
  }

  __syncthreads();

  if (threadIdx.x < num_experts) {
    tokens_cnts[threadIdx.x] = 0;
    for (int i = 1; i <= blockDim.x; ++i) {
      tokens_cnts[i * num_experts + threadIdx.x] +=
          tokens_cnts[(i - 1) * num_experts + threadIdx.x];
    }
  }

  __syncthreads();

  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] =
          cumsum[i - 1] +
          CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
              block_size;
    }
    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
  }

  __syncthreads();

  if (threadIdx.x < num_experts) {
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
         i += block_size) {
      expert_ids[i / block_size] = threadIdx.x;
    }
  }

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    int32_t rank_post_pad =
        tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
    sorted_token_ids[rank_post_pad] = i;
    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
  }
}

}  // namespace moe
}  // namespace vllm

// taken from
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  int device_max_shared_mem;
  auto dev = topk_ids.get_device();
  cudaDeviceGetAttribute(&device_max_shared_mem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
  const int32_t shared_mem_i32 =
      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
  const int32_t shared_mem_i16 =
      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
      (num_experts + 1) * sizeof(int32_t);

  bool use_global_memory = false;
  bool use_i16 = false;  // Use uint16_t for shared memory token counts
  if (shared_mem_i32 < device_max_shared_mem) {
    // Do nothing in this case. We're all set to use int32_t token counts.
  } else if (shared_mem_i16 < device_max_shared_mem &&
             topk_ids.numel() <= 65535) {
    // When topk_ids has fewer than 65535 elements (the max value of uint16),
    // every element of token_cnts is also smaller than 65535, so uint16 can
    // be used as the dtype of token_cnts.
    use_i16 = true;
  } else {
    use_global_memory = true;
  }

  if (use_global_memory) {
    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
          // tensors
          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);

          auto options_int = torch::TensorOptions()
                                 .dtype(torch::kInt)
                                 .device(topk_ids.device());
          torch::Tensor token_cnts_buffer =
              torch::empty({(num_experts + 1) * num_experts}, options_int);
          torch::Tensor cumsum_buffer =
              torch::empty({num_experts + 1}, options_int);

          auto kernel =
              vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
          kernel<<<1, num_thread, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>());
        });
  } else if (use_i16) {
    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
          // set dynamic shared mem
          auto kernel =
              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
              (void*)kernel, shared_mem_i16));
          kernel<<<1, num_thread, shared_mem_i16, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel());
        });
  } else {
    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
          auto kernel =
              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
              (void*)kernel, shared_mem_i32));
          kernel<<<1, num_thread, shared_mem_i32, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel());
        });
  }
}

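A worked computation of the shared-memory budget check above may help; the concrete numbers below are assumptions for illustration, only the sizing formulas come from the code.

// Worked example of the shared-memory sizing logic above (illustrative only;
// the device limit and expert counts are assumed values, not from the diff).
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t kWarpSize = 32;                   // assumed warp size
  const int32_t device_max_shared_mem = 101376;   // assumed 99 KiB opt-in limit
  for (int32_t num_experts : {64, 256}) {
    int32_t num_thread = std::max(num_experts, kWarpSize);
    int32_t shared_mem_i32 =
        ((num_thread + 1) * num_experts + (num_experts + 1)) * 4;
    int32_t shared_mem_i16 =
        (num_thread + 1) * num_experts * 2 + (num_experts + 1) * 4;
    const char* choice = shared_mem_i32 < device_max_shared_mem
                             ? "int32 counts"
                             : (shared_mem_i16 < device_max_shared_mem
                                    ? "uint16 counts (if numel <= 65535)"
                                    : "global-memory fallback");
    // num_experts=64  -> 16,900 B (i32) fits; num_experts=256 -> 264,196 B
    // (i32) and 132,612 B (i16) both exceed the assumed limit.
    std::printf("experts=%d i32=%dB i16=%dB -> %s\n", (int)num_experts,
                (int)shared_mem_i32, (int)shared_mem_i16, choice);
  }
}
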
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                              int64_t block_size,
                              torch::Tensor sorted_token_ids,
                              torch::Tensor experts_ids,
                              torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  TORCH_CHECK(num_experts == 256,
              "sgl_moe_align_block_size kernel only supports deepseek v3.");
  int64_t padded_num_experts =
      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
  int experts_per_warp = WARP_SIZE;
  int threads = 1024;
  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
      topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        torch::Tensor cumsum_buffer =
            torch::zeros({num_experts + 1}, options_int);
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

        auto align_kernel =
            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
        align_kernel<<<1, 1024, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
            experts_ids.data_ptr<int32_t>(),
            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
            topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
        if (small_batch_expert_mode) {
          const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
          const int32_t shared_mem_size =
              ((threads + 1) * num_experts + (num_experts + 1)) *
              sizeof(int32_t);

          const int block_threads = 256;
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);
          auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
          auto small_batch_expert_kernel =
              vllm::moe::moe_align_block_size_small_batch_expert_kernel<
                  scalar_t>;
          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel());
        } else {
          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;

          size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
          size_t shared_mem_size =
              num_warps * experts_per_warp * sizeof(int32_t);

          align_kernel<<<1, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
              padded_num_experts, experts_per_warp, block_size,
              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);

          auto sort_kernel =
              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
        }
      });
}

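The dispatcher above falls back to the single-block small-batch kernel only for tiny workloads; a minimal sketch of that predicate (illustrative call values, threshold taken from the code) is shown below.

// Minimal sketch of the small-batch dispatch predicate above (the 1024-token
// and 64-expert thresholds are from the diff; the sample inputs are not).
#include <cstdint>
#include <cstdio>

bool use_small_batch_expert_path(int64_t topk_numel, int64_t num_experts) {
  // Single-block kernel for tiny batches with few experts; otherwise the
  // warp-grouped align kernel plus the grid-stride sort kernel are launched.
  return topk_numel < 1024 && num_experts <= 64;
}

int main() {
  std::printf("%d\n", use_small_batch_expert_path(512, 64));   // 1: small path
  std::printf("%d\n", use_small_batch_expert_path(4096, 64));  // 0: two-kernel path
  std::printf("%d\n", use_small_batch_expert_path(512, 256));  // 0: too many experts
}
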
@@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                              int64_t block_size,
                              torch::Tensor sorted_token_ids,
                              torch::Tensor experts_ids,
                              torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,

@@ -12,7 +12,7 @@ void moe_permute(
    const torch::Tensor& input,         // [n_token, hidden]
    const torch::Tensor& topk_weights,  // [n_token, topk]
    torch::Tensor& topk_ids,            // [n_token, topk]
    const torch::Tensor& token_expert_indicies,  // [n_token, topk]
    const torch::Tensor& token_expert_indices,   // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
  TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
              "token_expert_indicies must be int32");
  TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
              "token_expert_indices must be int32");
  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
              "src_row_id2dst_row_id_map must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
  TORCH_CHECK(
      src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
      "token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
                     expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
                    get_ptr<int>(permuted_experts_id),
                    get_ptr<int>(dst_row_id2src_row_id_map),
                    get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,

void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& topk_ids,
                 const torch::Tensor& token_expert_indicies,
                 const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
                 const std::optional<int64_t>& align_block_size,
@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,

void moe_unpermute(const torch::Tensor& input,
                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
                   const torch::Tensor& token_expert_indicies,
                   const torch::Tensor& token_expert_indices,
                   const std::optional<torch::Tensor>& expert_map,
                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
                   const std::optional<int64_t>& align_block_size,

@@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel(
  int expert_id = sorted_experts[expanded_dest_row];

  extern __shared__ int64_t smem_expert_first_token_offset[];
  int64_t align_expanded_row_accumulate = 0;
  if constexpr (ALIGN_BLOCK_SIZE) {
    // load g2s
    for (int idx = threadIdx.x; idx < num_local_experts + 1;
@@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel(
  using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;

  // Duplicate and permute rows
  int64_t const source_k_rank = expanded_source_row / num_rows;
  int64_t const source_row = expanded_source_row % num_rows;

  auto const* source_row_ptr =
@@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel(
       elem_index += stride) {
    ComputeElem thread_output;
    thread_output.fill(0);
    float row_rescale{0.f};
    for (int k_idx = 0; k_idx < k; ++k_idx) {
      int64_t const expanded_original_row = original_row + k_idx * num_rows;
      int64_t const expanded_permuted_row =
@@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel(
      auto const* expanded_permuted_rows_row_ptr =
          expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;

      int64_t const expert_idx = expert_for_source_row[k_offset];

      ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
          expanded_permuted_rows_row_ptr[elem_index]);
      thread_output = thread_output + row_scale * (expert_result);

@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f

#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                 \
  topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>(     \
      gating_output, nullptr, topk_weights, topk_indicies,        \
      gating_output, nullptr, topk_weights, topk_indices,         \
      token_expert_indices, num_tokens, topk, 0, num_experts,     \
      stream);

@@ -433,7 +433,7 @@ template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
    const float* gating_output,
    float* topk_weights,
    IndType* topk_indicies,
    IndType* topk_indices,
    int* token_expert_indices,
    float* softmax_workspace,
    const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
    moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
        gating_output, nullptr, softmax_workspace, num_experts);
    moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
        softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
        softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
        num_experts, topk, 0, num_experts);
  }
}

@@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

  // temporarily adapted from
  // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
  m.def(
      "sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
      " int block_size, Tensor! sorted_token_ids,"
      " Tensor! experts_ids,"
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);

#ifndef USE_ROCM
  m.def(
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@@ -66,7 +57,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

  m.def(
      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
      "Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "

@@ -274,7 +274,6 @@ void advance_step_flashinfer(
  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);

  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
  TORCH_CHECK((blocks * threads > num_queries),
              "multi-step: not enough threads to map to num_queries = ",
              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

@@ -1,15 +1,17 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>

#include <cmath>

#include "../../dispatch_utils.h"
#include "../vectorization_utils.cuh"

#ifndef USE_ROCM
  #include <cub/util_type.cuh>
  #include <cub/cub.cuh>
  #include <cub/util_type.cuh>
#else
  #include <hipcub/util_type.hpp>
  #include <hipcub/hipcub.hpp>
  #include <hipcub/util_type.hpp>
#endif

static inline __device__ int8_t float_to_int8_rn(float x) {
@@ -103,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {

namespace vllm {

template <typename scalar_t, typename scale_type>
template <typename scalar_t, typename scale_t>
__global__ void static_scaled_int8_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
    scale_type const* scale_ptr, const int hidden_size) {
  int const tid = threadIdx.x;
  int64_t const token_idx = blockIdx.x;
  scale_type const scale = *scale_ptr;
    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
    const scale_t* scale_ptr, const int hidden_size) {
  const int tid = threadIdx.x;
  const int stride = blockDim.x;
  const int64_t token_idx = blockIdx.x;
  const float scale = *scale_ptr;

  // Must be performed using 64-bit math to avoid integer overflow.
  out += token_idx * hidden_size;
  input += token_idx * hidden_size;
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
    out[i] = float_to_int8_rn(static_cast<float>(input[i]) / scale);
  }
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
        dst = float_to_int8_rn(static_cast<float>(src) / scale);
      });
}

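As a mental model for the per-element lambda above, here is a scalar host-side sketch of the same operation; it assumes float_to_int8_rn means "round to nearest, saturate to [-128, 127]", and quant_row_reference is a hypothetical name, not part of the kernel sources.

// Scalar reference for static per-tensor int8 quantization (a sketch only).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int8_t> quant_row_reference(const std::vector<float>& row,
                                        float scale) {
  std::vector<int8_t> out(row.size());
  for (size_t i = 0; i < row.size(); ++i) {
    float q = std::nearbyint(row[i] / scale);    // round to nearest (ties to even)
    q = std::min(127.0f, std::max(-128.0f, q));  // saturate to the int8 range
    out[i] = static_cast<int8_t>(q);
  }
  return out;
}
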
template <typename scalar_t, typename scale_type, typename azp_type>
template <typename scalar_t, typename scale_t, typename azp_t>
__global__ void static_scaled_int8_azp_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
    scale_type const* scale_ptr, azp_type const* azp_ptr,
    const int hidden_size) {
  int const tid = threadIdx.x;
  int64_t const token_idx = blockIdx.x;
  scale_type const scale = *scale_ptr;
  azp_type const azp = *azp_ptr;
    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
    const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) {
  const int tid = threadIdx.x;
  const int stride = blockDim.x;
  const int64_t token_idx = blockIdx.x;
  const float scale = *scale_ptr;
  const azp_t azp = *azp_ptr;
  const float inv_s = 1.0f / scale;

  // Must be performed using 64-bit math to avoid integer overflow.
  out += token_idx * hidden_size;
  input += token_idx * hidden_size;
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
    auto const val = static_cast<float>(input[i]);
    auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
    out[i] = quant_val;
  }
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
        const auto v = static_cast<float>(src) * inv_s;
        dst = int32_to_int8(float_to_int32_rn(v) + azp);
      });
}

template <typename scalar_t, typename scale_type>
template <typename scalar_t, typename scale_t>
__global__ void dynamic_scaled_int8_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
    scale_type* scale, const int hidden_size) {
  int const tid = threadIdx.x;
  int64_t const token_idx = blockIdx.x;
  float absmax_val = 0.0f;
  float const zero = 0.0f;
    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
    scale_t* scale_out, const int hidden_size) {
  const int tid = threadIdx.x;
  const int stride = blockDim.x;
  const int64_t token_idx = blockIdx.x;

  // Must be performed using 64-bit math to avoid integer overflow.
  out += token_idx * hidden_size;
  input += token_idx * hidden_size;
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
    float val = static_cast<float>(input[i]);
    val = val > zero ? val : -val;
    absmax_val = val > absmax_val ? val : absmax_val;
  // calculate for absmax
  float thread_max = 0.f;
  for (int i = tid; i < hidden_size; i += stride) {
    const auto v = fabsf(static_cast<float>(row_in[i]));
    thread_max = fmaxf(thread_max, v);
  }

  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStorage;
  float const block_absmax_val_maybe =
      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
  __shared__ float block_absmax_val;
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;
  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
  __shared__ float absmax;
  if (tid == 0) {
    block_absmax_val = block_absmax_val_maybe;
    scale[token_idx] = block_absmax_val / 127.0f;
    absmax = block_max;
    scale_out[blockIdx.x] = absmax / 127.f;
  }
  __syncthreads();

  float const tmp_scale = 127.0f / block_absmax_val;
  for (int i = tid; i < hidden_size; i += blockDim.x) {
    out[i] = float_to_int8_rn(static_cast<float>(input[i]) * tmp_scale);
  }
  float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;

  // 2. quantize
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
        dst = float_to_int8_rn(static_cast<float>(src) * inv_s);
      });
}

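A worked example of the per-token symmetric scale above may help; only the scale = absmax / 127 rule is taken from the kernel, the sample values are illustrative.

// Worked example of dynamic symmetric int8 quantization (illustrative values).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> row = {-0.5f, 0.26f, 2.54f};  // hypothetical activations
  float absmax = 0.f;
  for (float v : row) absmax = std::max(absmax, std::fabs(v));
  float scale = absmax / 127.f;                    // stored per token: 0.02
  float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
  for (float v : row)                              // -25, 13, 127
    std::printf("%g -> %d\n", v, (int)std::nearbyint(v * inv_s));
  std::printf("scale = %g\n", scale);
}
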
template <typename scalar_t, typename scale_type, typename azp_type>
// MinMax structure to hold min and max values in one go
struct MinMax {
  float min, max;

  __host__ __device__ MinMax()
      : min(std::numeric_limits<float>::max()),
        max(std::numeric_limits<float>::lowest()) {}

  __host__ __device__ explicit MinMax(float v) : min(v), max(v) {}

  // add a value to the MinMax
  __host__ __device__ MinMax& operator+=(float v) {
    min = fminf(min, v);
    max = fmaxf(max, v);
    return *this;
  }

  // merge two MinMax objects
  __host__ __device__ MinMax& operator&=(const MinMax& other) {
    min = fminf(min, other.min);
    max = fmaxf(max, other.max);
    return *this;
  }
};

__host__ __device__ inline MinMax operator+(MinMax a, float v) {
  return a += v;
}
__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) {
  return a &= b;
}

template <typename scalar_t, typename scale_t, typename azp_t>
__global__ void dynamic_scaled_int8_azp_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
    scale_type* scale, azp_type* azp, const int hidden_size) {
  int64_t const token_idx = blockIdx.x;
    const scalar_t* __restrict__ input, int8_t* __restrict__ output,
    scale_t* scale_out, azp_t* azp_out, const int hidden_size) {
  const int tid = threadIdx.x;
  const int stride = blockDim.x;
  const int64_t token_idx = blockIdx.x;

  // Must be performed using 64-bit math to avoid integer overflow.
  out += token_idx * hidden_size;
  input += token_idx * hidden_size;
  const scalar_t* row_in = input + token_idx * hidden_size;
  int8_t* row_out = output + token_idx * hidden_size;

  // Scan for the min and max value for this token
  float max_val = std::numeric_limits<float>::min();
  float min_val = std::numeric_limits<float>::max();
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    auto val = static_cast<float>(input[i]);
    max_val = std::max(max_val, val);
    min_val = std::min(min_val, val);
  // 1. calculate min & max
  MinMax thread_mm;
  for (int i = tid; i < hidden_size; i += stride) {
    thread_mm += static_cast<float>(row_in[i]);
  }

  // Reduce the max and min values across the block
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStorage;
  max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
  __syncthreads();  // Make sure min doesn't mess with max shared memory
  min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
  using BlockReduce = cub::BlockReduce<MinMax, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;

  __shared__ scale_type scale_sh;
  __shared__ azp_type azp_sh;
  MinMax mm = BlockReduce(tmp).Reduce(
      thread_mm,
      [] __device__(MinMax a, const MinMax& b) {
        a &= b;
        return a;
      },
      blockDim.x);

  // Compute the scale and zero point and store them, only on the first thread
  if (threadIdx.x == 0) {
    float const scale_val = (max_val - min_val) / 255.0f;
    // Use rounding to even (same as torch.round)
    auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
    auto const azp_val = static_cast<azp_type>(azp_float);

    // Store the scale and azp into shared and global
    scale[token_idx] = scale_sh = scale_val;
    azp[token_idx] = azp_sh = azp_val;
  __shared__ float scale_sh;
  __shared__ azp_t azp_sh;
  if (tid == 0) {
    float s = (mm.max - mm.min) / 255.f;
    float zp = nearbyintf(-128.f - mm.min / s);  // round-to-even
    scale_sh = s;
    azp_sh = azp_t(zp);
    scale_out[blockIdx.x] = s;
    azp_out[blockIdx.x] = azp_sh;
  }

  // Wait for the scale and azp to be computed
  __syncthreads();

  float const scale_val = scale_sh;
  azp_type const azp_val = azp_sh;
  const float inv_s = 1.f / scale_sh;
  const azp_t azp = azp_sh;

  // Quantize the values
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    auto const val = static_cast<float>(input[i]);
    auto const quant_val =
        int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
    out[i] = quant_val;
  }
  // 2. quantize
  vectorize_with_alignment<16>(
      row_in, row_out, hidden_size, tid, stride,
      [=] __device__(int8_t& dst, const scalar_t& src) {
        const auto v = static_cast<float>(src) * inv_s;
        dst = int32_to_int8(float_to_int32_rn(v) + azp);
      });
}

}  // namespace vllm

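The asymmetric scale/zero-point math above reduces to two formulas, scale = (max - min) / 255 and azp = round(-128 - min / scale); the short sketch below walks through them with assumed values.

// Worked example of asymmetric (azp) int8 quantization (illustrative values;
// only the two formulas come from the kernel).
#include <cmath>
#include <cstdio>

int main() {
  float mn = -1.f, mx = 3.f;                        // hypothetical per-token min/max
  float scale = (mx - mn) / 255.f;                  // 4 / 255 ~= 0.01569
  float azp = std::nearbyint(-128.f - mn / scale);  // -128 - (-63.75) = -64.25 -> -64
  // A value v is stored as round(v / scale) + azp, so mn maps to -128 and mx to 127.
  std::printf("scale=%g azp=%g min->%g max->%g\n", scale, azp,
              std::nearbyint(mn / scale) + azp, std::nearbyint(mx / scale) + azp);
}
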
@@ -247,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out,  // [..., hidden_size]
  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
  dim3 const grid(num_tokens);
  dim3 const block(std::min(hidden_size, 1024));
  dim3 const block(std::min(hidden_size, 256));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@@ -278,7 +316,7 @@ void dynamic_scaled_int8_quant(
  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
  dim3 const grid(num_tokens);
  dim3 const block(std::min(hidden_size, 1024));
  dim3 const block(std::min(hidden_size, 256));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {

@@ -15,11 +15,25 @@ using c3x::cutlass_gemm_caller;
template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_default {
  // M in (128, inf)
  // M in (256, inf)
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_256, _128, _64>;
  using TileShape = Shape<_256, _128, _128>;
  using ClusterShape = Shape<_2, _2, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
};

template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M256 {
  // M in (128, 256]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_128, _128, _128>;
  using ClusterShape = Shape<_2, _2, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
@@ -33,8 +47,8 @@ struct sm100_fp8_config_M128 {
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_128, _128, _64>;
  using ClusterShape = Shape<_2, _2, _1>;
  using TileShape = Shape<_128, _128, _256>;
  using ClusterShape = Shape<_2, _4, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
@@ -72,6 +86,8 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM128 =
      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM256 =
      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;

  uint32_t const m = a.size(0);
  uint32_t const mp2 =
@@ -85,8 +101,12 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
    // m in (64, 128]
    return cutlass_gemm_caller<Cutlass3xGemmM128>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else if (mp2 <= 256) {
    // m in (128, 256]
    return cutlass_gemm_caller<Cutlass3xGemmM256>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else {
    // m in (128, inf)
    // m in (256, inf)
    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
  }

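To make the new M-bucket selection above concrete, here is a hedged sketch of the dispatch; next_pow2 and sm100_fp8_bucket are assumed helper names that mirror the mp2 computation, while the 64/128/256 boundaries and tile descriptions come from this diff.

// Sketch of the sm100 FP8 M-bucket selection (illustrative helper names).
#include <cstdint>
#include <cstdio>

static uint32_t next_pow2(uint32_t x) {
  uint32_t p = 1;
  while (p < x) p <<= 1;
  return p;
}

const char* sm100_fp8_bucket(uint32_t m) {
  uint32_t mp2 = next_pow2(m);
  if (mp2 <= 64) return "M64 config";
  if (mp2 <= 128) return "M128 config (128x128x256 tile, 2x4x1 cluster)";
  if (mp2 <= 256) return "M256 config (128x128x128 tile, 2x2x1 cluster)";
  return "default config (256x128x128 tile, 2x2x1 cluster)";
}

int main() {
  for (uint32_t m : {16u, 100u, 200u, 4096u})
    std::printf("m=%u -> %s\n", m, sm100_fp8_bucket(m));
}
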
@@ -231,12 +231,115 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
}

// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
__launch_bounds__(512, 4) cvt_fp16_to_fp4(
#else
cvt_fp16_to_fp4(
#endif
    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
    uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
  using PackedVec = PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                "Vec size is not matched.");

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;

  // Each global thread processes one element
  for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
       globalIdx += gridDim.x * blockDim.x) {
    // Calculate which row and column this global thread should process
    int rowIdx = globalIdx / colsPerRow;
    int colIdx = globalIdx % colsPerRow;

    int64_t inOffset = rowIdx * colsPerRow + colIdx;
    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
    // Get the output tensor offset.
    // Same as inOffset because 8 elements are packed into one uint32_t.
    int64_t outOffset = inOffset;
    auto& out_pos = out[outOffset];

    // Find index within the experts using different strategies based on expert
    // count
    int rowIdx_in_expert = 0;
    int expert_idx = 0;

    if constexpr (SMALL_NUM_EXPERTS) {
      for (int i = 0; i < n_experts; i++) {
        uint32_t current_offset = __ldca(&input_offset_by_experts[i]);
        uint32_t next_offset = __ldca(&input_offset_by_experts[i + 1]);
        if (rowIdx >= current_offset && rowIdx < next_offset) {
          rowIdx_in_expert = rowIdx - current_offset;
          expert_idx = i;
          break;
        }
      }
    } else {
      // Load input offsets into registers first, then do the computation.
      // Local array size set to 17 because of register limit.
      uint32_t local_offsets[17];
      for (int chunk_start = 0; chunk_start < n_experts; chunk_start += 16) {
        *reinterpret_cast<int4*>(local_offsets) =
            __ldca(reinterpret_cast<const int4*>(
                &input_offset_by_experts[chunk_start]));
        *reinterpret_cast<int4*>(local_offsets + 4) =
            __ldca(reinterpret_cast<const int4*>(
                &input_offset_by_experts[chunk_start + 4]));
        *reinterpret_cast<int4*>(local_offsets + 8) =
            __ldca(reinterpret_cast<const int4*>(
                &input_offset_by_experts[chunk_start + 8]));
        *reinterpret_cast<int4*>(local_offsets + 12) =
            __ldca(reinterpret_cast<const int4*>(
                &input_offset_by_experts[chunk_start + 12]));
        local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);

        // Check against the 16 loaded offsets
#pragma unroll
        for (int i = 0; i < 16; i++) {
          if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
            rowIdx_in_expert = rowIdx - local_offsets[i];
            expert_idx = chunk_start + i;
            break;
          }
        }
      }
    }

    // Get the global scaling factor, which will be applied to the SF.
    // Note SFScale is the same as next GEMM's alpha, which is
    // (448.f / (Alpha_A / 6.f)).
    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];

    int factor = CVT_FP4_SF_VEC_SIZE * 4;
    // The actual output_scales dim is computed from the padded numCols.
    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
    uint32_t* SFout_in_expert =
        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;

    auto sf_out =
        cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                           CVT_FP4_NUM_THREADS_PER_SF>(
            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);

    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
  }
#endif
}

// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
__launch_bounds__(1024, 4) cvt_fp16_to_fp4(
#else
cvt_fp16_to_fp4(
#endif
    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
@@ -247,50 +350,80 @@ cvt_fp16_to_fp4(
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                "Vec size is not matched.");
  extern __shared__ uint32_t shared_input_offsets[];

  // Input tensor row/col loops.
  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
         colIdx += blockDim.x) {
      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
      // Get the output tensor offset.
      // Same as inOffset because 8 elements are packed into one uint32_t.
      int64_t outOffset = inOffset;
      auto& out_pos = out[outOffset];

      // Find index within the experts.
      int rowIdx_in_expert = 0;
      int expert_idx = 0;
      for (int i = 0; i < n_experts; i++) {
        if (rowIdx >= input_offset_by_experts[i] &&
            rowIdx < input_offset_by_experts[i + 1]) {
          rowIdx_in_expert = rowIdx - input_offset_by_experts[i];
          expert_idx = i;
          break;
        }
      }

      // Get the global scaling factor, which will be applied to the SF.
      // Note SFScale is the same as next GEMM's alpha, which is
      // (448.f / (Alpha_A / 6.f)).
      float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];

      int factor = CVT_FP4_SF_VEC_SIZE * 4;
      // The actual output_scales dim is computed from the padded numCols.
      int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
      int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
      uint32_t* SFout_in_expert =
          SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;

      auto sf_out =
          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                             CVT_FP4_NUM_THREADS_PER_SF>(
              rowIdx_in_expert, colIdx, numCols, SFout_in_expert);

      out_pos =
          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
  // Load input offsets into shared memory.
  // If n_experts is larger than 4, use vectorized int4 to save instructions.
  // If n_experts is smaller than 4, read directly.
  if constexpr (SMALL_NUM_EXPERTS) {
    for (int i = threadIdx.x; i < n_experts + 1; i += blockDim.x) {
      shared_input_offsets[i] = input_offset_by_experts[i];
    }
  } else {
    for (int i = threadIdx.x * 4; i < n_experts; i += blockDim.x * 4) {
      *reinterpret_cast<int4*>(&shared_input_offsets[i]) =
          *reinterpret_cast<const int4*>(&input_offset_by_experts[i]);
    }
    if (threadIdx.x == 0) {
      shared_input_offsets[n_experts] = input_offset_by_experts[n_experts];
    }
  }

  __syncthreads();

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;

  // Each global thread processes one element
  for (int globalIdx = tid; globalIdx < numRows * colsPerRow;
       globalIdx += gridDim.x * blockDim.x) {
    // Calculate which row and column this global thread should process
    int rowIdx = globalIdx / colsPerRow;
    int colIdx = globalIdx % colsPerRow;

    int64_t inOffset = rowIdx * colsPerRow + colIdx;
    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
    int64_t outOffset = inOffset;
    auto& out_pos = out[outOffset];

    // Find expert using binary search for better performance with large m_topk
    int rowIdx_in_expert = 0;
    int expert_idx = 0;

    // Binary search through experts using shared memory
    int left = 0, right = n_experts - 1;
    while (left <= right) {
      int mid = (left + right) / 2;
      // Get offsets: shared_input_offsets[i] corresponds to
      // input_offset_by_experts[i]
      uint32_t mid_offset = shared_input_offsets[mid];
      uint32_t next_offset = shared_input_offsets[mid + 1];

      if (rowIdx >= mid_offset && rowIdx < next_offset) {
        rowIdx_in_expert = rowIdx - mid_offset;
        expert_idx = mid;
        break;
      } else if (rowIdx < mid_offset) {
        right = mid - 1;
      } else {
        left = mid + 1;
      }
    }

    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];

    int factor = CVT_FP4_SF_VEC_SIZE * 4;
    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
    uint32_t* SFout_in_expert =
        SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;

    auto sf_out =
        cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                           CVT_FP4_NUM_THREADS_PER_SF>(
            rowIdx_in_expert, colIdx, numCols, SFout_in_expert);

    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
  }
#endif
}

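The expert lookup used by the large-m_topk kernel above is an ordinary binary search over the per-expert prefix offsets; the standalone sketch below mirrors it on the host (names and sample offsets are illustrative only).

// Host-side sketch of the expert lookup: offsets has n_experts + 1 entries and
// expert e owns rows [offsets[e], offsets[e+1]).
#include <cstdint>
#include <cstdio>
#include <vector>

int find_expert(const std::vector<uint32_t>& offsets, uint32_t row) {
  int left = 0, right = (int)offsets.size() - 2;
  while (left <= right) {
    int mid = (left + right) / 2;
    if (row >= offsets[mid] && row < offsets[mid + 1]) return mid;
    if (row < offsets[mid])
      right = mid - 1;
    else
      left = mid + 1;
  }
  return -1;  // not reached for rows inside [offsets.front(), offsets.back())
}

int main() {
  std::vector<uint32_t> offsets = {0, 3, 3, 7, 10};  // 4 experts, expert 1 empty
  for (uint32_t row : {0u, 2u, 3u, 6u, 9u})
    std::printf("row %u -> expert %d\n", row, find_expert(offsets, row));
}
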
@@ -309,18 +442,63 @@ void quant_impl(void* output, void* output_scale, void* input,

  // Grid, Block size.
  // Each thread converts 8 values.
  dim3 block(std::min(int(k / ELTS_PER_THREAD), 512));
  int const workSizePerRow = k / ELTS_PER_THREAD;
  int const totalWorkSize = m_topk * workSizePerRow;
  dim3 block(std::min(workSizePerRow, 512));
  // Get number of blocks per SM (assume we can fully utilize the SM).
  int const numBlocksPerSM = 2048 / block.x;
  dim3 grid(std::min(int(m_topk), multiProcessorCount * numBlocksPerSM));
  dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
                     multiProcessorCount * numBlocksPerSM));
  while (grid.x <= multiProcessorCount && block.x > 64) {
    grid.x *= 2;
    block.x = (block.x + 1) / 2;
  }

  cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
      m_topk, k, reinterpret_cast<T*>(input),
      reinterpret_cast<float*>(input_global_scale),
      reinterpret_cast<uint32_t*>(output),
      reinterpret_cast<uint32_t*>(output_scale),
      reinterpret_cast<uint32_t*>(input_offset_by_experts),
      reinterpret_cast<uint32_t*>(output_scale_offset_by_experts), n_experts);
  int const blockRepeat =
      (totalWorkSize + block.x * grid.x - 1) / (block.x * grid.x);
  if (blockRepeat > 1) {
    size_t shared_mem_size = (n_experts + 1) * sizeof(uint32_t);
    if (n_experts >= 4) {
      cvt_fp16_to_fp4<T, false, false>
          <<<grid, block, shared_mem_size, stream>>>(
              m_topk, k, reinterpret_cast<T*>(input),
              reinterpret_cast<float*>(input_global_scale),
              reinterpret_cast<uint32_t*>(output),
              reinterpret_cast<uint32_t*>(output_scale),
              reinterpret_cast<uint32_t*>(input_offset_by_experts),
              reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
              n_experts);
    } else {
      cvt_fp16_to_fp4<T, false, true><<<grid, block, shared_mem_size, stream>>>(
          m_topk, k, reinterpret_cast<T*>(input),
          reinterpret_cast<float*>(input_global_scale),
          reinterpret_cast<uint32_t*>(output),
          reinterpret_cast<uint32_t*>(output_scale),
          reinterpret_cast<uint32_t*>(input_offset_by_experts),
          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
          n_experts);
    }
  } else {
    if (n_experts >= 16) {
      cvt_fp16_to_fp4<T, false, false><<<grid, block, 0, stream>>>(
          m_topk, k, reinterpret_cast<T*>(input),
          reinterpret_cast<float*>(input_global_scale),
          reinterpret_cast<uint32_t*>(output),
          reinterpret_cast<uint32_t*>(output_scale),
          reinterpret_cast<uint32_t*>(input_offset_by_experts),
          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
          n_experts, /* bool low_latency */ true);
    } else {
      cvt_fp16_to_fp4<T, false, true><<<grid, block, 0, stream>>>(
          m_topk, k, reinterpret_cast<T*>(input),
          reinterpret_cast<float*>(input_global_scale),
          reinterpret_cast<uint32_t*>(output),
          reinterpret_cast<uint32_t*>(output_scale),
          reinterpret_cast<uint32_t*>(input_offset_by_experts),
          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
          n_experts, /* bool low_latency */ true);
    }
  }
}

/*Quantization entry for fp4 experts quantization*/

@@ -446,8 +446,6 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template <>
__inline__ __device__ uint32_t
scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
  [[maybe_unused]] __half2_raw h2r =
      __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
  union {
    __half2_raw h2r;
    uint32_t ui32;

@@ -92,111 +92,112 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
                                  torch::Tensor X,  // input
                                  int64_t type, int64_t row) {
  int col = X.sizes()[1];
  int vecs = X.sizes()[0];
  const int padded = (col + 512 - 1) / 512 * 512;
  const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
  auto options = torch::TensorOptions().dtype(X.dtype()).device(W.device());
  at::Tensor Y = torch::empty({1, row}, options);
  at::Tensor Y = torch::empty({vecs, row}, options);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
  options = torch::TensorOptions().dtype(torch::kInt32).device(W.device());
  at::Tensor quant_X = torch::empty({1, padded / 32 * 9}, options);
  at::Tensor quant_X = torch::empty({vecs, padded / 32 * 9}, options);
  VLLM_DISPATCH_FLOATING_TYPES(X.scalar_type(), "ggml_mul_mat_vec_a8", [&] {
    quantize_row_q8_1_cuda<scalar_t>((scalar_t*)X.data_ptr(),
                                     (void*)quant_X.data_ptr(), col, 1, stream);
    quantize_row_q8_1_cuda<scalar_t>(
        (scalar_t*)X.data_ptr(), (void*)quant_X.data_ptr(), col, vecs, stream);
    switch (type) {
      case 2:
        mul_mat_vec_q4_0_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 3:
        mul_mat_vec_q4_1_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 6:
        mul_mat_vec_q5_0_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 7:
        mul_mat_vec_q5_1_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 8:
        mul_mat_vec_q8_0_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 10:
        mul_mat_vec_q2_K_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 11:
        mul_mat_vec_q3_K_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 12:
        mul_mat_vec_q4_K_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 13:
        mul_mat_vec_q5_K_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 14:
        mul_mat_vec_q6_K_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 16:
        mul_mat_vec_iq2_xxs_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 17:
        mul_mat_vec_iq2_xs_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 18:
        mul_mat_vec_iq3_xxs_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 19:
        mul_mat_vec_iq1_s_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 20:
        mul_mat_vec_iq4_nl_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 21:
        mul_mat_vec_iq3_s_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 22:
        mul_mat_vec_iq2_s_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 23:
        mul_mat_vec_iq4_xs_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
      case 29:
        mul_mat_vec_iq1_m_q8_1_cuda<scalar_t>(
            (void*)W.data_ptr(), (void*)quant_X.data_ptr(),
            (scalar_t*)Y.data_ptr(), col, row, stream);
            (scalar_t*)Y.data_ptr(), col, row, vecs, stream);
        break;
    }
  });

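The wrapper above pads each input row to a multiple of 512 columns and allocates padded / 32 * 9 int32 words per row for the Q8_1-quantized activations; the short worked calculation below makes the shape concrete (the 36-byte Q8_1 block layout, 32 int8 quants plus two half-precision fields, is stated as an assumption about the GGML format, while the padding rule comes from the code).

// Worked example for the quant_X allocation above (a sketch with assumed sizes).
#include <cstdio>

int main() {
  int col = 4096, vecs = 8;                  // hypothetical input shape
  int padded = (col + 512 - 1) / 512 * 512;  // 4096, already a multiple of 512
  int words_per_row = padded / 32 * 9;       // 128 blocks * 9 int32 = 1152
  std::printf("quant_X: {%d, %d} int32 = %zu bytes\n", vecs, words_per_row,
              (size_t)vecs * words_per_row * 4);  // 8 * 1152 * 4 = 36864 bytes
}
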
@ -1,16 +1,19 @@
// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows, const int nvecs) {
const auto row = blockIdx.x*blockDim.y + threadIdx.y;
const auto vec = blockIdx.y;

if (row >= nrows) {
if (row >= nrows || vec >= nvecs) {
return;
}

const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int nrows_y = (ncols + 512 - 1) / 512 * 512;

// partial sum for each thread

// partial sum for each thread
float tmp = 0.0f;

const block_q_t * x = (const block_q_t *) vx;
@ -19,7 +22,7 @@ static __global__ void mul_mat_vec_q(const void *
for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i; // x block index

const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
const int iby = vec*(nrows_y/QK8_1) + i * (qk/QK8_1); // y block index that aligns with ibx

const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int

@ -33,177 +36,177 @@ static __global__ void mul_mat_vec_q(const void *
}

if (threadIdx.x == 0) {
dst[row] = tmp;
dst[vec*nrows + row] = tmp;
}
}

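For reference, the `nvecs` dimension added above turns the kernel into a small batched GEMV: each `(vec, row)` pair produces one dot product, written at `dst[vec*nrows + row]`, with each quantized input vector stored at a column stride padded up to a multiple of 512 (`nrows_y`). A NumPy sketch of the intended (dequantized) output layout follows; the shapes and function name are illustrative only and not part of the kernel's API.

```python
import numpy as np

def mmvq_reference(W, X):
    """Reference for the batched mul_mat_vec_q output layout.

    W: (nrows, ncols) dequantized weight matrix.
    X: (nvecs, ncols) dequantized input vectors (the real kernel stores each
       vector at a stride of ncols rounded up to a multiple of 512).
    Returns dst flattened as dst[vec * nrows + row], matching the kernel.
    """
    nrows, _ = W.shape
    nvecs = X.shape[0]
    dst = np.zeros(nvecs * nrows, dtype=np.float32)
    for vec in range(nvecs):
        for row in range(nrows):
            dst[vec * nrows + row] = np.dot(W[row], X[vec])
    return dst

# Example: 4 output rows, 1024 columns, 3 input vectors.
W = np.random.rand(4, 1024).astype(np.float32)
X = np.random.rand(3, 1024).astype(np.float32)
print(mmvq_reference(W, X).reshape(3, 4))  # one row of results per input vector
```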
template<typename scalar_t>
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq2_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI2_S, block_iq2_s, 1, vec_dot_iq2_s_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq4_xs_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

template<typename scalar_t>
static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, cudaStream_t stream) {
static void mul_mat_vec_iq3_s_q8_1_cuda(const void * vx, const void * vy, scalar_t * dst, const int ncols, const int nrows, const int nvecs, cudaStream_t stream) {
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_nums(block_num_y, nvecs, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<scalar_t, QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, nvecs);
}

@ -206,8 +206,6 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@ -344,8 +342,6 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@ -465,8 +461,6 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;
@ -593,8 +587,6 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
auto offset_m = blockIdx.y * m_count;
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;

[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

int n = offset_n + t * 4;

@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
static constexpr int A_CPY_VEC =
decltype(max_common_vector(tCsA, tCrA_load)){};

static constexpr int COVERSION_WIDTH =
static constexpr int CONVERSION_WIDTH =
std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));

auto load_A_to_registers = [&](int read_stage) {
@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma {
// PIPELINED MAIN LOOP
//

auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
int read_stage) {
auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
int read_stage) {
load_extra_info_to_registers(partitioned_extra_info,
copy_partitions_extra_info, k_block,
read_stage);

75
csrc/quantization/vectorization_utils.cuh
Normal file
@ -0,0 +1,75 @@
#pragma once
#include "vectorization.cuh"

namespace vllm {

template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
struct DefaultVecOp {
  ScaOp scalar_op;

  __device__ __forceinline__ void operator()(
      vec_n_t<OutT, VEC_SIZE>& dst, const vec_n_t<InT, VEC_SIZE>& src) const {
#pragma unroll
    for (int i = 0; i < VEC_SIZE; ++i) {
      scalar_op(dst.val[i], src.val[i]);
    }
  }
};

template <int VEC_SIZE, typename InT, typename OutT, typename VecOp,
          typename ScaOp>
__device__ inline void vectorize_with_alignment(
    const InT* in, OutT* out, int len, int tid, int stride,
    VecOp&& vec_op,       // vec_n_t<InT,16> -> vec_n_t<OutT,16>
    ScaOp&& scalar_op) {  // InT -> OutT
  static_assert(VEC_SIZE > 0 && (VEC_SIZE & (VEC_SIZE - 1)) == 0,
                "VEC_SIZE must be a positive power-of-two");
  constexpr int WIDTH = VEC_SIZE * sizeof(InT);  // e.g. 64 B
  uintptr_t addr = reinterpret_cast<uintptr_t>(in);

  int misalignment_offset = addr & (WIDTH - 1);       // addr % 64
  int alignment_bytes = WIDTH - misalignment_offset;  // 64 - (addr % 64)
  int prefix_elems = alignment_bytes & (WIDTH - 1);   // maps the already-aligned case (64) to 0
  prefix_elems /= sizeof(InT);
  prefix_elems = min(prefix_elems, len);              // 0 ≤ prefix < 16

  // 1. handle the misaligned prefix scalar-by-scalar, where it is unsafe to vectorize
  for (int i = tid; i < prefix_elems; i += stride) {
    scalar_op(out[i], in[i]);
  }

  in += prefix_elems;
  out += prefix_elems;
  len -= prefix_elems;

  int num_vec = len / VEC_SIZE;
  using vin_t = vec_n_t<InT, VEC_SIZE>;
  using vout_t = vec_n_t<OutT, VEC_SIZE>;
  auto* v_in = reinterpret_cast<const vin_t*>(in);
  auto* v_out = reinterpret_cast<vout_t*>(out);

  // 2. vectorize the main part
  for (int i = tid; i < num_vec; i += stride) {
    vout_t tmp;
    vec_op(tmp, v_in[i]);
    v_out[i] = tmp;
  }

  // 3. handle the tail
  int tail_start = num_vec * VEC_SIZE;
  for (int i = tid + tail_start; i < len; i += stride) {
    scalar_op(out[i], in[i]);
  }
}

template <int VEC_SIZE, typename InT, typename OutT, typename ScaOp>
__device__ __forceinline__ void vectorize_with_alignment(const InT* in,
                                                         OutT* out, int len,
                                                         int tid, int stride,
                                                         ScaOp&& scalar_op) {
  using Vec = DefaultVecOp<VEC_SIZE, InT, OutT, std::decay_t<ScaOp>>;
  vectorize_with_alignment<VEC_SIZE>(in, out, len, tid, stride, Vec{scalar_op},
                                     std::forward<ScaOp>(scalar_op));
}

}  // namespace vllm
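To illustrate the split that `vectorize_with_alignment` performs, here is a small Python sketch of the same address arithmetic: a scalar prefix up to the first `WIDTH`-aligned address, a vectorized main body, and a scalar tail. The byte address and sizes below are made up for the example; the CUDA header above is the actual implementation.

```python
VEC_SIZE = 16                   # elements per vector op
ELEM_BYTES = 1                  # pretend InT is a 1-byte type (e.g. int8/fp8)
WIDTH = VEC_SIZE * ELEM_BYTES   # bytes per aligned chunk

def split_for_vectorization(addr: int, length: int):
    """Return (prefix, num_vec, tail) element counts for a buffer that
    starts at byte address `addr` and holds `length` elements."""
    misalignment = addr & (WIDTH - 1)                       # addr % WIDTH
    prefix = ((WIDTH - misalignment) & (WIDTH - 1)) // ELEM_BYTES
    prefix = min(prefix, length)                            # buffer may be shorter than the prefix
    remaining = length - prefix
    num_vec = remaining // VEC_SIZE                         # full vector chunks
    tail = remaining - num_vec * VEC_SIZE                   # leftover scalars
    return prefix, num_vec, tail

# A buffer of 100 elements starting 5 bytes past an aligned boundary:
print(split_for_vectorization(0x1005, 100))  # -> (11, 5, 9): 11 + 5*16 + 9 == 100
```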
@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) {

template <typename T>
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
[[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
} t16;
_B16x4 ret;
if constexpr (std::is_same<T, _Float16>::value) {
union h2cvt {
@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template <typename T>
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) {
[[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
} t1, t2, res;
_B16x4 ret;
if constexpr (std::is_same<T, _Float16>::value) {
union h2cvt {
@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(

constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);

[[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
[[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
// shared_logits is used for multiple purposes
__shared__ _B16x4 shared_logits[NWARPS][4][16][4];

@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
[[maybe_unused]] const int kglobal_token_idx =
partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const auto warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;

__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS

@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)

m += CuCount * _WvPrGrp * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
//----------------------------------------------------
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
kBase = 0;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;

@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
uint32_t const m = 1;  // Set M to 1 for compression
uint32_t const n = a.size(1);

// Note: For correctess, the compressed format must be invariant in:
// Note: For correctness, the compressed format must be invariant in:
// - M, the flattened number of tokens
// - Whether output dtype is fp16 or bf16
// - CUTLASS epilogues

@ -243,30 +243,32 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
        # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
        # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
        if [[ "$CUDA_VERSION" == 12.8* ]]; then \
            uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
            uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \
        else \
            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
            CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
            if [ "$CUDA_MAJOR" -lt 12 ]; then \
                export FLASHINFER_ENABLE_SM90=0; \
            fi; \
            uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \
            git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \
            # Needed to build AOT kernels
            (cd flashinfer && \
                python3 -m flashinfer.aot && \
                uv pip install --system --no-build-isolation . \
            ) && \
            rm -rf flashinfer; \
        fi \
    fi
COPY examples examples

@ -98,6 +98,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    VLLM_TARGET_DEVICE=cpu python3 setup.py develop

RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/test-cpu.in && \
    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
    uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
    uv pip install -r requirements/dev.txt && \
    pre-commit install --hook-type pre-commit --hook-type commit-msg

134
docs/ci/update_pytorch_version.md
Normal file
@ -0,0 +1,134 @@
---
title: Update PyTorch version on vLLM OSS CI/CD
---

vLLM's current policy is to always use the latest PyTorch stable
release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using [#16859](https://github.com/vllm-project/vllm/pull/16859) as
an example, this document outlines common steps to achieve this update along with
a list of potential issues and how to address them.

## Test PyTorch release candidates (RCs)

Updating PyTorch in vLLM after the official release is not
ideal because any issues discovered at that point can only be resolved
by waiting for the next release or by implementing hacky workarounds in vLLM.
The better solution is to test vLLM with PyTorch release candidates (RCs) to ensure
compatibility before each release.

PyTorch release candidates can be downloaded from the PyTorch test index at https://download.pytorch.org/whl/test.
For example, the torch2.7.0+cu12.8 RC can be installed using the following command:

```
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
```

When the final RC is ready for testing, it will be announced to the community
on the [PyTorch dev-discuss forum](https://dev-discuss.pytorch.org/c/release-announcements).
After this announcement, we can begin testing vLLM integration by drafting a pull request
following this 3-step process:

1. Update requirements files in https://github.com/vllm-project/vllm/tree/main/requirements
to point to the new releases for torch, torchvision, and torchaudio.
2. Use `--extra-index-url https://download.pytorch.org/whl/test/<PLATFORM>` to
get the final release candidates' wheels. Some common platforms are `cpu`, `cu128`,
and `rocm6.2.4`.
3. As vLLM uses uv, make sure that the `unsafe-best-match` strategy is set either
via the `UV_INDEX_STRATEGY` env variable or via `--index-strategy unsafe-best-match`.

If failures are found in the pull request, raise them as issues on vLLM and
cc the PyTorch release team to initiate discussion on how to address them.
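After installing an RC this way, it is worth double-checking that the wheel picked up by uv is actually the release candidate built for the CUDA variant you expect. A quick sanity check using standard PyTorch attributes (nothing vLLM-specific):

```python
import torch

# e.g. "2.7.0+cu128" for a cu128 release candidate wheel
print("torch:", torch.__version__)
print("built for CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
```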
## Update CUDA version

The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example,
torch2.7.0+cu12.6) is uploaded to PyPI. However, vLLM may require a different CUDA version,
such as 12.8 for Blackwell support.
This complicates the process as we cannot use the out-of-the-box
`pip install torch torchvision torchaudio` command. The solution is to use
`--extra-index-url` in vLLM's Dockerfiles.

1. Use `--extra-index-url https://download.pytorch.org/whl/cu128` to install torch+cu128.
2. Other important indexes at the moment include:
    1. CPU ‒ https://download.pytorch.org/whl/cpu
    2. ROCm ‒ https://download.pytorch.org/whl/rocm6.2.4 and https://download.pytorch.org/whl/rocm6.3
    3. XPU ‒ https://download.pytorch.org/whl/xpu
3. Update .buildkite/release-pipeline.yaml and .buildkite/scripts/upload-wheels.sh to
match the CUDA version from step 1. This makes sure that the release vLLM wheel is tested
on CI.

## Address long vLLM build time

When building vLLM with a new PyTorch/CUDA version, no cache will exist
in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours
and time out. Additionally, since vLLM's fastcheck pipeline runs in read-only mode,
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.

While ongoing efforts like [#17419](https://github.com/vllm-project/vllm/issues/17419)
address the long build time at its source, the current workaround is to set VLLM_CI_BRANCH
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:

1. Increases the timeout limit to 10 hours so that the build doesn't time out.
2. Allows the compiled artifacts to be written to the vLLM sccache S3 bucket
to warm it up so that future builds are faster.

<p align="center" width="100%">
    <img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
</p>

## Update dependencies

Several vLLM dependencies, such as FlashInfer, also depend on PyTorch and need
to be updated accordingly. Rather than waiting for all of them to publish new
releases (which would take too much time), they can be built from
source to unblock the update process.

### FlashInfer
Here is how to build and install it from source with torch2.7.0+cu128 in the vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):

```
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
```

One caveat is that building FlashInfer from source adds approximately 30
minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
public location for immediate installation, such as https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl. For future releases, contact the PyTorch release
team if you want to get the package published there.

### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source:

```
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
```

### Mamba

```
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
```

### causal-conv1d

```
uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
```

## Update all the different vLLM platforms

Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable
to handle some platforms separately. The separation of requirements and Dockerfiles
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from https://github.com/intel/intel-extension-for-pytorch by Intel.
While https://github.com/vllm-project/vllm/pull/16859 updated vLLM to PyTorch
2.7.0 on CPU, CUDA, and ROCm, https://github.com/vllm-project/vllm/pull/17444
completed the update for XPU.
@ -130,7 +130,7 @@ pytest -s -v tests/test_logger.py

If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.

!!! warning
!!! important
    If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).

## Pull Requests & Code Reviews

@ -48,8 +48,8 @@ Further update the model as follows:
        return vision_embeddings
    ```

!!! warning
    The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
!!! important
    The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.

- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.

@ -100,8 +100,8 @@ Further update the model as follows:
    ```

!!! note
The model class does not have to be named `*ForCausalLM`.
Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
    The model class does not have to be named `*ForCausalLM`.
    Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.

## 2. Specify processing information

@ -18,7 +18,7 @@ After you have implemented your model (see [tutorial][new-model-basic]), put it
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models][supported-models] to promote your model!

!!! warning
!!! important
    The list of models in each section should be maintained in alphabetical order.

## Out-of-tree models
@ -49,6 +49,6 @@ def register():
    )
```

!!! warning
!!! important
    If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
    Read more about that [here][supports-multimodal].

@ -15,7 +15,7 @@ Without them, the CI for your PR will fail.
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.

!!! warning
!!! important
    The list of models in each section should be maintained in alphabetical order.

!!! tip

@ -34,6 +34,7 @@ you may contact the following individuals:

- Simon Mo - simon.mo@hey.com
- Russell Bryant - rbryant@redhat.com
- Huzaifa Sidhpurwala - huzaifas@redhat.com

## Slack Discussion

@ -5,19 +5,22 @@ title: Using Kubernetes

Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

* [Deployment with CPUs](#deployment-with-cpus)
* [Deployment with GPUs](#deployment-with-gpus)
- [Deployment with CPUs](#deployment-with-cpus)
- [Deployment with GPUs](#deployment-with-gpus)
- [Troubleshooting](#troubleshooting)
  - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated)
- [Conclusion](#conclusion)

Alternatively, you can deploy vLLM to Kubernetes using any of the following:

* [Helm](frameworks/helm.md)
* [InftyAI/llmaz](integrations/llmaz.md)
* [KServe](integrations/kserve.md)
* [kubernetes-sigs/lws](frameworks/lws.md)
* [meta-llama/llama-stack](integrations/llamastack.md)
* [substratusai/kubeai](integrations/kubeai.md)
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
* [vllm-project/production-stack](integrations/production-stack.md)
- [Helm](frameworks/helm.md)
- [InftyAI/llmaz](integrations/llmaz.md)
- [KServe](integrations/kserve.md)
- [kubernetes-sigs/lws](frameworks/lws.md)
- [meta-llama/llama-stack](integrations/llamastack.md)
- [substratusai/kubeai](integrations/kubeai.md)
- [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
- [vllm-project/production-stack](integrations/production-stack.md)

## Deployment with CPUs

@ -351,6 +354,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

If the service is correctly deployed, you should receive a response from the vLLM model.

## Troubleshooting

### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"

If the startup or readiness probe failureThreshold is too low for the time the server needs to start up, the Kubernetes scheduler will kill the container. A couple of indications that this has happened:

1. container log contains "KeyboardInterrupt: terminated"
2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted`

To mitigate, increase the failureThreshold to allow more time for the model server to start serving. You can identify an ideal failureThreshold by removing the probes from the manifest and measuring how much time it takes for the model server to show it's ready to serve.

## Conclusion

Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation.

@ -7,7 +7,7 @@ page for information on known issues and how to solve them.

## Introduction

!!! warning
!!! important
    The source code references are to the state of the code at the time of writing in December, 2024.

The use of Python multiprocessing in vLLM is complicated by:
@ -123,7 +123,7 @@ what is happening. First, a log message from vLLM:
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
initialized. We must use the `spawn` multiprocessing start method. Setting
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing
https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing
for more information.
```

337
docs/design/v1/p2p_nccl_connector.md
Normal file
@ -0,0 +1,337 @@
An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo.

# Detailed Design

## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:

1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)**, either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.

![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)

## Proxy/Router (Demo)

A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception.

The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example:

```
cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0
```

Currently, to quickly verify whether xPyD can work, a round-robin selection of 1P1D is used. In the future, it is planned to use a trie combined with the load status of instances to select appropriate P and D.

Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed).
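A minimal sketch of how such a `request_id` can be built by the proxy and later resolved back to the prefill/decode addresses. The helper names below are illustrative only; the real logic lives in the proxy demo and the connector, and only the string format shown above is taken from this document.

```python
import uuid

def build_request_id(prefill_addr: str, decode_addr: str) -> str:
    # Mirrors the format shown above:
    # cmpl-___prefill_addr_<P>___decode_addr_<D>_<uuid>-0
    return (f"cmpl-___prefill_addr_{prefill_addr}"
            f"___decode_addr_{decode_addr}_{uuid.uuid4().hex}-0")

def parse_request_id(request_id: str) -> tuple[str, str]:
    # Recover the two addresses embedded in the request_id.
    after_prefill = request_id.split("___prefill_addr_")[1]
    prefill_addr, rest = after_prefill.split("___decode_addr_")
    decode_addr = rest.rsplit("_", 1)[0]  # drop the trailing "<uuid>-0"
    return prefill_addr, decode_addr

rid = build_request_id("10.0.1.2:21001", "10.0.1.3:22001")
print(parse_request_id(rid))  # ('10.0.1.2:21001', '10.0.1.3:22001')
```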
## KV Cache Transfer Methods

There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.

Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.
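For example, the extra-config JSON passed on the command line might be assembled like this. Only `send_type` and `kv_buffer_size` are taken from this document; the connector name and any other fields are placeholders that should be checked against the actual CLI of the branch you install.

```python
import json

# "P2pNcclConnector" is a placeholder name; use whatever connector name
# the installed branch actually registers.
kv_transfer_config = {
    "kv_connector": "P2pNcclConnector",   # placeholder, not confirmed here
    "kv_buffer_size": 8589934592,         # in bytes; see the sizing notes below
    "kv_connector_extra_config": {
        "send_type": "PUT_ASYNC",         # or "PUT" / "GET"
    },
}

# The value is passed to vLLM as a single JSON string:
print("--kv-transfer-config", json.dumps(kv_transfer_config))
```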
## P2P Communication via ZMQ & NCCL

As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size, in order to support dynamic scaling (expansion and contraction) of instances with PD disaggregation. This means that adding or removing P/D instances does not require a full system restart.

Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVcache data itself.

When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.

## NCCL Group Topology

Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.

![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)

Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL.
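A quick back-of-the-envelope helper for the group count and the communication-buffer overhead described above. The per-group sizes are the empirical 100 MB / 52 MB figures quoted in the text, the group-count formula assumes the symmetric-TP topology described, and the aggregate is only a rough estimate.

```python
def nccl_group_stats(num_p: int, num_d: int, tp: int, mb_per_group: int = 100):
    intra = num_p + num_d        # one TP group per vLLM instance
    cross = num_p * num_d * tp   # rank i of each P pairs with rank i of each D
    total = intra + cross
    return total, total * mb_per_group  # (group count, rough aggregate MB)

# The 1P2D, TP=2 example from Figure 2: 3 + 4 = 7 groups.
print(nccl_group_stats(1, 2, 2))                    # (7, 700) with NCCL_MAX_NCHANNELS=16
print(nccl_group_stats(1, 2, 2, mb_per_group=52))   # (7, 364) with NCCL_MAX_NCHANNELS=8
```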
## GPU Memory Buffer and Tensor Memory Pool
|
||||
|
||||
The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size.

If the `--max-num-seqs` parameter of the P instances is set to a large value, the large batch size will cause them to generate a large amount of KV cache at once. This may exceed the capacity of the D instances' memory buffer, resulting in KV cache loss. Once KV cache is lost, the D instance must recompute the prefill, which is equivalent to performing prefill twice; the time-to-first-token (TTFT) then increases significantly and performance degrades.

To address this, I designed and developed a local tensor memory pool for storing KV cache, inspired by the buddy system used in the Linux memory subsystem. Since host memory is sufficiently large (typically in the TB range on servers), there is no need for prefix caching or block-based reuse schemes to save space. When the GPU memory buffer is insufficient, KV cache can be stored directly in the tensor memory pool, and D instances can subsequently retrieve it from there. Reads and writes run at PCIe speed (roughly 21 GB/s on PCIe 4.0), which is usually faster than prefill; otherwise, solutions like Mooncake and LMCache would not be necessary. The tensor memory pool acts as a flood-diversion area: it is typically unused except during sudden traffic surges. In the worst case, my solution performs no worse than the normal situation with a cache store.
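The sketch below shows the flavor of such a pool: a free-list allocator with power-of-two size classes backed by pinned host memory. It is a simplified illustration only (class and method names are invented here, and the splitting/coalescing of a full buddy system is omitted); the connector's actual tensor memory pool may differ.

```python
# Simplified buddy-style pool: power-of-two size classes over pinned host memory.
import math
import torch

class TensorMemoryPool:
    def __init__(self, capacity_bytes: int):
        self.backing = torch.empty(capacity_bytes, dtype=torch.uint8, pin_memory=True)
        self.bump = 0                                   # start of never-used space
        self.free: dict[int, list[int]] = {}            # size class -> free offsets

    @staticmethod
    def _cls(nbytes: int) -> int:
        return 1 << max(nbytes - 1, 1).bit_length()     # round up to a power of two

    def put(self, kv: torch.Tensor) -> int:
        nbytes = kv.numel() * kv.element_size()
        cls = self._cls(nbytes)
        if self.free.get(cls):
            off = self.free[cls].pop()                  # reuse a freed block of this class
        else:
            off, self.bump = self.bump, self.bump + cls
            if self.bump > self.backing.numel():
                raise MemoryError("tensor memory pool exhausted")
        src = kv.detach().reshape(-1).view(torch.uint8).cpu()
        self.backing[off:off + nbytes].copy_(src)
        return off

    def get(self, off: int, shape, dtype, device="cuda") -> torch.Tensor:
        nbytes = math.prod(shape) * torch.empty((), dtype=dtype).element_size()
        out = self.backing[off:off + nbytes].clone().view(dtype).reshape(shape).to(device)
        self.free.setdefault(self._cls(nbytes), []).append(off)   # block becomes reusable
        return out
```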

# Install vLLM

```shell
# Enter the home directory (or your working directory).
cd /home

# Download the precompiled wheel (the commit id in the URL is updated from time to time); you can copy the command directly.
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

# Download the code repository.
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
cd vllm

# Point the build at the downloaded wheel.
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl

# Install vLLM in editable mode.
pip install -e . -v
```

# Run xPyD

## Instructions

- The following examples are run on A800 (80GB) GPUs with the Meta-Llama-3.1-8B-Instruct model.
- Pay attention to the `kv_buffer_size` setting (in bytes). The empirical value is 10% of the GPU memory size. It is related to the KV cache size: if it is too small, the GPU memory buffer for temporarily storing received KV cache overflows and the KV cache spills into the tensor memory pool, which increases latency; if it is too large, less KV cache is left for inference, leading to a smaller batch size and decreased throughput.
- For prefill instances in non-GET mode, `kv_buffer_size` can be set to 1, since prefill currently does not need to receive KV cache. In GET mode, however, a larger `kv_buffer_size` is required because it stores the KV cache to be fetched by the D instances.
- You may need to modify the `kv_buffer_size` and `port` values in the following commands (if there is a conflict).
- `PUT_ASYNC` offers the best performance and should be prioritized.
- The `--port` must match the `http_port` in the `--kv-transfer-config`.
- The `disagg_prefill_proxy_xpyd.py` script uses port 10001 (to receive client requests) and port 30001 (to receive service discovery from P and D instances).
- The node running the proxy must have `quart` installed.
- Multiple nodes are supported; you only need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`.
- In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**.

## Run 1P3D

### Proxy (e.g. 10.0.1.1)

```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```

### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20005 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20009 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20003 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20008 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

## Run 3P1D

### Proxy (e.g. 10.0.1.1)

```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```

### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20005 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20009 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20003 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)

```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
    --host 0.0.0.0 \
    --port 20008 \
    --tensor-parallel-size 1 \
    --seed 1024 \
    --served-model-name base_model \
    --dtype float16 \
    --max-model-len 10000 \
    --max-num-batched-tokens 10000 \
    --max-num-seqs 256 \
    --trust-remote-code \
    --gpu-memory-utilization 0.7 \
    --disable-log-request \
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```

# Single request

```shell
curl -X POST -s http://10.0.1.1:10001/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "base_model",
        "prompt": "San Francisco is a",
        "max_tokens": 10,
        "temperature": 0
    }'
```

# Benchmark

```shell
python3 benchmark_serving.py \
    --backend vllm \
    --model base_model \
    --tokenizer meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name "random" \
    --host 10.0.1.1 \
    --port 10001 \
    --random-input-len 1024 \
    --random-output-len 1024 \
    --ignore-eos \
    --burstiness 100 \
    --percentile-metrics "ttft,tpot,itl,e2el" \
    --metric-percentiles "90,95,99" \
    --seed $(date +%s) \
    --trust-remote-code \
    --request-rate 3 \
    --num-prompts 1000
```

# Shut down

```shell
pgrep python | xargs kill -9 && pkill -f python
```
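
Note that the command above kills every Python process on the node. If other Python workloads share the machine, a more targeted variant (assuming the only relevant processes are the vLLM servers and the proxy script) is:

```shell
pkill -f "vllm serve"
pkill -f disagg_prefill_proxy_xpyd.py
```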

# Test data

## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s

- **1P5D (6×A800) vs vLLM (1×A800)**:
    - Throughput ↑7.2% (1085 → 6979/6)
    - ITL (P99) ↓81.3% (120ms → 22.9ms)
    - TTFT (P99) ↑26.8% (175ms → 222ms)
    - TPOT: No change

- **1P6D (7×A800) vs vLLM (1×A800)**:
    - Throughput ↑9.6% (1085 → 8329/7)
    - ITL (P99) ↓81.0% (120ms → 22.7ms)
    - TTFT (P99) ↑210% (175ms → 543ms)
    - TPOT: No change

## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s

- **1P1D (2×A800) vs vLLM (1×A800)**:
    - Throughput ↑37.4% (537 → 1476/2)
    - ITL (P99) ↓81.8% (127ms → 23.1ms)
    - TTFT (P99) ↑41.8% (160ms → 227ms)
    - TPOT: No change


@@ -211,7 +211,7 @@ for o in outputs:

Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).

!!! warning
!!! important
    A chat template is **required** to use Chat Completions API.
    For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
@@ -7,16 +7,16 @@ Quantization trades off model precision for smaller memory footprint, allowing l

Contents:

- [Supported_Hardware](supported_hardware.md)
- [Auto_Awq](auto_awq.md)
- [Bnb](bnb.md)
- [Bitblas](bitblas.md)
- [Gguf](gguf.md)
- [Gptqmodel](gptqmodel.md)
- [Int4](int4.md)
- [Int8](int8.md)
- [Fp8](fp8.md)
- [Modelopt](modelopt.md)
- [Quark](quark.md)
- [Quantized_Kvcache](quantized_kvcache.md)
- [Torchao](torchao.md)
- [Supported Hardware](supported_hardware.md)
- [AutoAWQ](auto_awq.md)
- [BitsAndBytes](bnb.md)
- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)
- [GPTQModel](gptqmodel.md)
- [INT4 W4A16](int4.md)
- [INT8 W8A8](int8.md)
- [FP8 W8A8](fp8.md)
- [NVIDIA TensorRT Model Optimizer](modelopt.md)
- [AMD Quark](quark.md)
- [Quantized KV Cache](quantized_kvcache.md)
- [TorchAO](torchao.md)

@@ -1,5 +1,5 @@
---
title: AMD QUARK
title: AMD Quark
---
[](){ #quark }
@@ -142,51 +142,6 @@ for chunk in stream:

Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).

## Structured output

The reasoning content is also available in the structured output. A structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in the v0 engine for now.

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

The following is an example client:

```python
from openai import OpenAI
from pydantic import BaseModel

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

class People(BaseModel):
    name: str
    age: int

json_schema = People.model_json_schema()

prompt = ("Generate a JSON with the name and age of one random person.")
completion = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```

## Tool Calling

The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
@@ -39,9 +39,10 @@ client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="-",
)
model = client.models.list().data[0].id

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    model=model,
    messages=[
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
@@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. The idea is to generate an

```python
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    model=model,
    messages=[
        {
            "role": "user",
@@ -92,26 +93,32 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        }
    ],
    extra_body={"guided_json": json_schema},
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "car-description",
            "schema": CarDescription.model_json_schema()
        },
    },
)
print(completion.choices[0].message.content)
```

!!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the
    JSON schema and how the fields should be populated. This can improve the
    results notably in most cases.

Finally we have the `guided_grammar` option, which is probably the most
difficult to use, but it's really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:

```python
@@ -130,7 +137,7 @@ simplified_sql_grammar = """
"""

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    model=model,
    messages=[
        {
            "role": "user",
@@ -142,7 +149,48 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```

Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

## Reasoning Outputs

You can also use structured outputs with <project:#reasoning-outputs> for reasoning models.

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
```

Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:

```python
from pydantic import BaseModel


class People(BaseModel):
    name: str
    age: int


completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the name and age of one random person.",
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "people",
            "schema": People.model_json_schema()
        }
    },
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

## Experimental Automatic Parsing (OpenAI API)

@@ -163,14 +211,14 @@ class Info(BaseModel):
    age: int

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
    model="meta-llama/Llama-3.1-8B-Instruct",
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
    ],
    response_format=Info,
    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -203,15 +251,13 @@ class MathResponse(BaseModel):
    steps: list[Step]
    final_answer: str

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
completion = client.beta.chat.completions.parse(
    model="meta-llama/Llama-3.1-8B-Instruct",
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful expert math tutor."},
        {"role": "user", "content": "Solve 8x + 31 = 2."},
    ],
    response_format=MathResponse,
    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8
```

An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>

## Offline Inference

Offline inference allows for the same types of guided decoding.
Offline inference allows for the same types of structured outputs.
To use it, we'll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are:

@@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are:
- `structural_tag`

These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:

```python
@@ -265,4 +311,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

Full example: <gh-file:examples/offline_inference/structured_outputs.py>
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

@@ -2,4 +2,6 @@ nav:
- README.md
- gpu.md
- cpu.md
- ai_accelerator.md
- google_tpu.md
- intel_gaudi.md
- aws_neuron.md

@@ -14,7 +14,6 @@ vLLM supports the following hardware platforms:
- [ARM AArch64](cpu.md#arm-aarch64)
- [Apple silicon](cpu.md#apple-silicon)
- [IBM Z (S390X)](cpu.md#ibm-z-s390x)
- [Other AI accelerators](ai_accelerator.md)
    - [Google TPU](ai_accelerator.md#google-tpu)
    - [Intel Gaudi](ai_accelerator.md#intel-gaudi)
    - [AWS Neuron](ai_accelerator.md#aws-neuron)
- [Google TPU](google_tpu.md)
- [Intel Gaudi](intel_gaudi.md)
- [AWS Neuron](aws_neuron.md)
@@ -1,117 +0,0 @@
# Other AI accelerators

vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation"

## Requirements

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements"

## Configure a new environment

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment"

## Set up using Python

### Pre-built wheels

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels"

### Build wheel from source

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source"

## Set up using Docker

### Pre-built images

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images"

### Build image from source

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source"

## Extra information

=== "Google TPU"

    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information"

=== "Intel Gaudi"

    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information"

=== "AWS Neuron"

    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information"
@@ -1,15 +1,14 @@
# --8<-- [start:installation]
# AWS Neuron

[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and
generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2,
and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores.
This tab describes how to set up your environment to run vLLM on Neuron.
This describes how to set up your environment to run vLLM on Neuron.

!!! warning
    There are no pre-built wheels or images for this device, so you must build vLLM from source.

# --8<-- [end:installation]
# --8<-- [start:requirements]
## Requirements

- OS: Linux
- Python: 3.9 or newer
@@ -21,36 +20,32 @@

### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies

The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this
[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).

- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
- Once inside your instance, activate the pre-installed virtual environment for inference by running

```console
source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
```

Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html)
for alternative setup instructions including using Docker and manually installing dependencies.

!!! note
    NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx)
    library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).

# --8<-- [end:requirements]
# --8<-- [start:set-up-using-python]
## Set up using Python

# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
### Pre-built wheels

Currently, there are no pre-built Neuron wheels.

# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
### Build wheel from source

#### Install vLLM from source

Install vllm as follows:
To build and install vLLM from source, run:

```console
git clone https://github.com/vllm-project/vllm.git
@@ -59,14 +54,14 @@ pip install -U -r requirements/neuron.txt
VLLM_TARGET_DEVICE="neuron" pip install -e .
```

AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at
[https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's
<https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2>, which contains several features in addition to what's
available on vLLM V0. Please utilize the AWS Fork for the following features:

- Llama-3.2 multi-modal support
- Multi-node distributed inference

Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html)
for more details and usage examples.

To install the AWS Neuron fork, run the following:
@@ -80,75 +75,73 @@ VLLM_TARGET_DEVICE="neuron" pip install -e .

Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested.

# --8<-- [end:build-wheel-from-source]
# --8<-- [start:set-up-using-docker]
## Set up using Docker

# --8<-- [end:set-up-using-docker]
# --8<-- [start:pre-built-images]
### Pre-built images

Currently, there are no pre-built Neuron images.

# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
### Build image from source

See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.

Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.

# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]
## Extra information

[](){ #feature-support-through-nxd-inference-backend }

### Feature support through NxD Inference backend

The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend
to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most
[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.

To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include

```console
override_neuron_config={
    "enable_bucketing":False,
}
```

or when launching vLLM from the CLI, pass

```console
--override-neuron-config "{\"enable_bucketing\":false}"
```

Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts
(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads.

### Known limitations

- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
  [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
  for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this
  [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
  to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at
  runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
  to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
  to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
  to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches
  max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt
  to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support
  for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is
  implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.

### Environment variables

- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
  compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
  artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
  but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
  under this specified path.
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).

# --8<-- [end:extra-information]
@@ -110,8 +110,9 @@ vLLM CPU backend supports the following vLLM features:

## Related runtime environment variables

- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`.
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`.
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).

## Performance tips

@@ -133,7 +134,15 @@ export VLLM_CPU_OMP_THREADS_BIND=0-29
vllm serve facebook/opt-125m
```

- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
or using default auto thread binding:

```console
export VLLM_CPU_KVCACHE_SPACE=40
export VLLM_CPU_NUM_OF_RESERVED_CPU=2
vllm serve facebook/opt-125m
```

- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:

```console
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
@@ -178,6 +187,12 @@ $ python examples/offline_inference/basic/basic.py
VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```

or using default auto thread binding:

```console
VLLM_CPU_KVCACHE_SPACE=40 vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```

- For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node.

- Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory.
@@ -1,4 +1,4 @@
# --8<-- [start:installation]
# Google TPU

Tensor Processing Units (TPUs) are Google's custom-developed application-specific
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
@@ -33,8 +33,7 @@ information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp
!!! warning
    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.

# --8<-- [end:installation]
# --8<-- [start:requirements]
## Requirements

- Google Cloud TPU VM
- TPU versions: v6e, v5e, v5p, v4
@@ -58,6 +57,7 @@ assigned to your Google Cloud project for your immediate exclusive use.
### Provision Cloud TPUs with GKE

For more information about using TPUs with GKE, see:

- <https://cloud.google.com/kubernetes-engine/docs/how-to/tpus>
- <https://cloud.google.com/kubernetes-engine/docs/concepts/tpus>
- <https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus>
@@ -70,40 +70,41 @@ Create a TPU v5e with 4 TPU chips:

```console
gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
    --node-id TPU_NAME \
    --project PROJECT_ID \
    --zone ZONE \
    --accelerator-type ACCELERATOR_TYPE \
    --runtime-version RUNTIME_VERSION \
    --service-account SERVICE_ACCOUNT
```

| Parameter name | Description |
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. |
| TPU_NAME | The user-assigned name of the TPU which is created when the queued |
| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. |
| PROJECT_ID | Your Google Cloud project |
| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use |
| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example |
| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). |
<figcaption>Parameter descriptions</figcaption>
| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] |
| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. |
| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images]. |
| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com` |

Connect to your TPU using SSH:
Connect to your TPU VM using SSH:

```bash
gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE
```

# --8<-- [end:requirements]
# --8<-- [start:set-up-using-python]
[TPU versions]: https://cloud.google.com/tpu/docs/runtimes
[TPU VM images]: https://cloud.google.com/tpu/docs/runtimes
[TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones

# --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels]
## Set up using Python

### Pre-built wheels

Currently, there are no pre-built TPU wheels.

# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
### Build wheel from source

Install Miniconda:

@@ -136,7 +137,7 @@ Install build dependencies:

```bash
pip install -r requirements/tpu.txt
sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```

Run the setup script:

@@ -145,16 +146,13 @@ Run the setup script:
VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
```

# --8<-- [end:build-wheel-from-source]
# --8<-- [start:set-up-using-docker]
## Set up using Docker

# --8<-- [end:set-up-using-docker]
# --8<-- [start:pre-built-images]
### Pre-built images

See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.

# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
### Build image from source

You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.

@@ -188,11 +186,5 @@ docker run --privileged --net host --shm-size=16G -it vllm-tpu
Install OpenBLAS with the following command:

```console
sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```

# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]

There is no extra information for this device.
# --8<-- [end:extra-information]
@@ -42,7 +42,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G

=== "NVIDIA CUDA"

    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:create-a-new-python-environment"
    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:set-up-using-python"

=== "AMD ROCm"

@@ -10,8 +10,6 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
# --8<-- [end:requirements]
# --8<-- [start:set-up-using-python]

### Create a new Python environment

!!! note
    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.

@@ -254,7 +252,10 @@ The latest code can contain bugs and may not be stable. Please use it with cauti

See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.

## Supported features
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]

See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.

# --8<-- [end:supported-features]
# --8<-- [end:extra-information]

@@ -217,7 +217,10 @@ docker run -it \

Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.

## Supported features
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]

See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.

# --8<-- [end:supported-features]
# --8<-- [end:extra-information]

@@ -63,7 +63,8 @@ $ docker run -it \
   vllm-xpu-env
```

## Supported features
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]

XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:

@@ -78,4 +79,6 @@ python -m vllm.entrypoints.openai.api_server \
```

By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.

# --8<-- [end:supported-features]
# --8<-- [end:extra-information]
@ -1,12 +1,11 @@
|
||||
# --8<-- [start:installation]
|
||||
# Intel Gaudi
|
||||
|
||||
This tab provides instructions on running vLLM with Intel Gaudi devices.
|
||||
This page provides instructions on running vLLM with Intel Gaudi devices.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
# --8<-- [end:installation]
|
||||
# --8<-- [start:requirements]
|
||||
## Requirements
|
||||
|
||||
- OS: Ubuntu 22.04 LTS
|
||||
- Python: 3.10
|
||||
@ -56,16 +55,13 @@ docker run \
|
||||
vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
|
||||
```
|
||||
|
||||
# --8<-- [end:requirements]
|
||||
# --8<-- [start:set-up-using-python]
|
||||
## Set up using Python
|
||||
|
||||
# --8<-- [end:set-up-using-python]
|
||||
# --8<-- [start:pre-built-wheels]
|
||||
### Pre-built wheels
|
||||
|
||||
Currently, there are no pre-built Intel Gaudi wheels.
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
### Build wheel from source
|
||||
|
||||
To build and install vLLM from source, run:
|
||||
|
||||
@ -86,16 +82,13 @@ pip install -r requirements/hpu.txt
|
||||
python setup.py develop
|
||||
```
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:set-up-using-docker]
|
||||
## Set up using Docker
|
||||
|
||||
# --8<-- [end:set-up-using-docker]
|
||||
# --8<-- [start:pre-built-images]
|
||||
### Pre-built images
|
||||
|
||||
Currently, there are no pre-built Intel Gaudi images.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
|
||||
@ -112,10 +105,9 @@ docker run \
|
||||
!!! tip
|
||||
If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to the "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have the `habana-container-runtime` package installed and that the `habana` container runtime is registered.
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
## Extra information
|
||||
|
||||
## Supported features
|
||||
### Supported features
|
||||
|
||||
- [Offline inference][offline-inference]
|
||||
- Online serving via [OpenAI-Compatible Server][openai-compatible-server]
|
||||
@ -129,14 +121,14 @@ docker run \
|
||||
for accelerating low-batch latency and throughput
|
||||
- Attention with Linear Biases (ALiBi)
|
||||
|
||||
## Unsupported features
|
||||
### Unsupported features
|
||||
|
||||
- Beam search
|
||||
- LoRA adapters
|
||||
- Quantization
|
||||
- Prefill chunking (mixed-batch inferencing)
|
||||
|
||||
## Supported configurations
|
||||
### Supported configurations
|
||||
|
||||
The following configurations have been validated to function with
|
||||
Gaudi2 devices. Configurations that are not listed may or may not work.
|
||||
@ -183,7 +175,6 @@ Currently in vLLM for HPU we support four execution modes, depending on selected
|
||||
| 0 | 0 | torch.compile |
|
||||
| 0 | 1 | PyTorch eager mode |
|
||||
| 1 | 0 | HPU Graphs |
|
||||
<figcaption>vLLM execution modes</figcaption>
|
||||
|
||||
!!! warning
|
||||
In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should only be used for validating functional correctness. Their performance will be improved in the next releases. To obtain the best performance in 1.18.0, please use HPU Graphs or PyTorch lazy mode.
|
||||
@ -401,4 +392,3 @@ the below:
|
||||
higher batches. You can do that by adding the `--enforce-eager` flag to the
server (for online serving), or by passing the `enforce_eager=True`
argument to the LLM constructor (for offline inference).
|
||||
# --8<-- [end:extra-information]
|
@ -61,7 +61,8 @@ from vllm import LLM, SamplingParams
|
||||
```
|
||||
|
||||
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params].
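As a minimal sketch of what this looks like (the prompt strings here are illustrative placeholders):

```python
from vllm import SamplingParams

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
# Temperature 0.8 and nucleus sampling (top-p) 0.95, as described above.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
```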
|
||||
!!! warning
|
||||
|
||||
!!! important
|
||||
By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results when [SamplingParams][vllm.SamplingParams] is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
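For example, a hedged sketch (the model name is only an example):

```python
from vllm import LLM

# Ignore generation_config.json from the model repository and
# use vLLM's own default sampling parameters instead.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", generation_config="vllm")
```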
|
||||
@ -116,7 +117,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
||||
!!! note
|
||||
By default, the server uses a predefined chat template stored in the tokenizer.
|
||||
You can learn about overriding it [here][chat-template].
|
||||
!!! warning
|
||||
!!! important
|
||||
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
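For example, using the same model as above:

```console
vllm serve Qwen/Qwen2.5-1.5B-Instruct --generation-config vllm
```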
|
||||
|
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
|
||||
@ -8,10 +9,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
|
||||
if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
|
||||
# remove the warning banner if the version is a tagged release
|
||||
docs_dir = os.path.dirname(__file__)
|
||||
announcement_path = os.path.join(docs_dir,
|
||||
"mkdocs/overrides/main.html")
|
||||
mkdocs_dir = Path(__file__).parent.parent
|
||||
announcement_path = mkdocs_dir / "overrides/main.html"
|
||||
# The file might be removed already if the build is triggered multiple
|
||||
# times (readthedocs build both HTML and PDF versions separately)
|
||||
if os.path.exists(announcement_path):
|
||||
if announcement_path.exists():
|
||||
os.remove(announcement_path)
|
||||
|
docs/mkdocs/javascript/edit_and_feedback.js (new file, 47 lines)
@ -0,0 +1,47 @@
|
||||
/**
|
||||
* edit_and_feedback.js
|
||||
*
|
||||
* Enhances MkDocs Material docs pages by:
|
||||
*
|
||||
* 1. Adding a "Question? Give us feedback" link
|
||||
* below the "Edit" button.
|
||||
*
|
||||
* - The link opens a GitHub issue with a template,
|
||||
* auto-filled with the current page URL and path.
|
||||
*
|
||||
* 2. Ensuring the edit button opens in a new tab
|
||||
* with target="_blank" and rel="noopener".
|
||||
*/
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const url = window.location.href;
|
||||
const page = document.body.dataset.mdUrl || location.pathname;
|
||||
|
||||
const feedbackLink = document.createElement("a");
|
||||
feedbackLink.href = `https://github.com/vllm-project/vllm/issues/new?template=100-documentation.yml&title=${encodeURIComponent(
|
||||
`[Docs] Feedback for \`${page}\``
|
||||
)}&body=${encodeURIComponent(`📄 **Reference:**\n${url}\n\n📝 **Feedback:**\n_Your response_`)}`;
|
||||
feedbackLink.target = "_blank";
|
||||
feedbackLink.rel = "noopener";
|
||||
feedbackLink.title = "Provide feedback";
|
||||
feedbackLink.className = "md-content__button";
|
||||
feedbackLink.innerHTML = `
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
height="24px"
|
||||
viewBox="0 -960 960 960"
|
||||
width="24px"
|
||||
fill="currentColor"
|
||||
>
|
||||
<path d="M280-280h280v-80H280v80Zm0-160h400v-80H280v80Zm0-160h400v-80H280v80Zm-80 480q-33 0-56.5-23.5T120-200v-560q0-33 23.5-56.5T200-840h560q33 0 56.5 23.5T840-760v560q0 33-23.5 56.5T760-120H200Zm0-80h560v-560H200v560Zm0-560v560-560Z"/>
|
||||
</svg>
|
||||
`;
|
||||
|
||||
const editButton = document.querySelector('.md-content__button[href*="edit"]');
|
||||
|
||||
if (editButton && editButton.parentNode) {
|
||||
editButton.insertAdjacentElement("beforebegin", feedbackLink);
|
||||
|
||||
editButton.setAttribute("target", "_blank");
|
||||
editButton.setAttribute("rel", "noopener");
|
||||
}
|
||||
});
|
@ -34,3 +34,77 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
|
||||
color: rgba(255, 255, 255, 0.75) !important;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
/* Custom admonitions */
|
||||
:root {
|
||||
--md-admonition-icon--announcement: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M3.25 9a.75.75 0 0 1 .75.75c0 2.142.456 3.828.733 4.653a.122.122 0 0 0 .05.064.212.212 0 0 0 .117.033h1.31c.085 0 .18-.042.258-.152a.45.45 0 0 0 .075-.366A16.743 16.743 0 0 1 6 9.75a.75.75 0 0 1 1.5 0c0 1.588.25 2.926.494 3.85.293 1.113-.504 2.4-1.783 2.4H4.9c-.686 0-1.35-.41-1.589-1.12A16.4 16.4 0 0 1 2.5 9.75.75.75 0 0 1 3.25 9Z"></path><path d="M0 6a4 4 0 0 1 4-4h2.75a.75.75 0 0 1 .75.75v6.5a.75.75 0 0 1-.75.75H4a4 4 0 0 1-4-4Zm4-2.5a2.5 2.5 0 1 0 0 5h2v-5Z"></path><path d="M15.59.082A.75.75 0 0 1 16 .75v10.5a.75.75 0 0 1-1.189.608l-.002-.001h.001l-.014-.01a5.775 5.775 0 0 0-.422-.25 10.63 10.63 0 0 0-1.469-.64C11.576 10.484 9.536 10 6.75 10a.75.75 0 0 1 0-1.5c2.964 0 5.174.516 6.658 1.043.423.151.787.302 1.092.443V2.014c-.305.14-.669.292-1.092.443C11.924 2.984 9.713 3.5 6.75 3.5a.75.75 0 0 1 0-1.5c2.786 0 4.826-.484 6.155-.957.665-.236 1.154-.47 1.47-.64.144-.077.284-.161.421-.25l.014-.01a.75.75 0 0 1 .78-.061Z"></path></svg>');
|
||||
--md-admonition-icon--important: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M4.47.22A.749.749 0 0 1 5 0h6c.199 0 .389.079.53.22l4.25 4.25c.141.14.22.331.22.53v6a.749.749 0 0 1-.22.53l-4.25 4.25A.749.749 0 0 1 11 16H5a.749.749 0 0 1-.53-.22L.22 11.53A.749.749 0 0 1 0 11V5c0-.199.079-.389.22-.53Zm.84 1.28L1.5 5.31v5.38l3.81 3.81h5.38l3.81-3.81V5.31L10.69 1.5ZM8 4a.75.75 0 0 1 .75.75v3.5a.75.75 0 0 1-1.5 0v-3.5A.75.75 0 0 1 8 4Zm0 8a1 1 0 1 1 0-2 1 1 0 0 1 0 2Z"></path></svg>');
|
||||
}
|
||||
|
||||
.md-typeset .admonition.announcement,
|
||||
.md-typeset details.announcement {
|
||||
border-color: rgb(255, 110, 66);
|
||||
}
|
||||
.md-typeset .admonition.important,
|
||||
.md-typeset details.important {
|
||||
border-color: rgb(239, 85, 82);
|
||||
}
|
||||
|
||||
.md-typeset .announcement > .admonition-title,
|
||||
.md-typeset .announcement > summary {
|
||||
background-color: rgb(255, 110, 66, 0.1);
|
||||
}
|
||||
.md-typeset .important > .admonition-title,
|
||||
.md-typeset .important > summary {
|
||||
background-color: rgb(239, 85, 82, 0.1);
|
||||
}
|
||||
|
||||
.md-typeset .announcement > .admonition-title::before,
|
||||
.md-typeset .announcement > summary::before {
|
||||
background-color: rgb(239, 85, 82);
|
||||
-webkit-mask-image: var(--md-admonition-icon--announcement);
|
||||
mask-image: var(--md-admonition-icon--announcement);
|
||||
}
|
||||
.md-typeset .important > .admonition-title::before,
|
||||
.md-typeset .important > summary::before {
|
||||
background-color: rgb(239, 85, 82);
|
||||
-webkit-mask-image: var(--md-admonition-icon--important);
|
||||
mask-image: var(--md-admonition-icon--important);
|
||||
}
|
||||
|
||||
/* Make label fully visible on hover */
|
||||
.md-content__button[href*="edit"]:hover::after {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
/* Hide edit button on generated docs/examples pages */
|
||||
@media (min-width: 960px) {
|
||||
.md-content__button[href*="docs/examples/"] {
|
||||
display: none !important;
|
||||
}
|
||||
}
|
||||
|
||||
.md-content__button-wrapper {
|
||||
position: absolute;
|
||||
top: 0.6rem;
|
||||
right: 0.8rem;
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
align-items: center;
|
||||
gap: 0.4rem;
|
||||
z-index: 1;
|
||||
}
|
||||
|
||||
.md-content__button-wrapper a {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
height: 24px;
|
||||
width: 24px;
|
||||
color: var(--md-default-fg-color);
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.md-content__button-wrapper a:hover {
|
||||
color: var(--md-accent-fg-color);
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ for output in outputs:
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results when [SamplingParams][vllm.SamplingParams] is not specified.
|
||||
|
||||
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
|
||||
@ -81,7 +81,7 @@ The [chat][vllm.LLM.chat] method implements chat functionality on top of [genera
|
||||
In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
|
||||
and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt.
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
In general, only instruction-tuned models have a chat template.
|
||||
Base models may perform poorly as they are not trained to respond to the chat conversation.
|
||||
|
||||
|
@ -299,78 +299,80 @@ See [this page][generative-models] for more information on how to use generative
|
||||
|
||||
Specified using `--task generate`.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ |
|
||||
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ |
|
||||
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ |
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | |
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ | ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | |
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | |
|
||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
|
||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
|
||||
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | |
|
||||
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | |
|
||||
|
||||
!!! note
|
||||
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
|
||||
@ -379,7 +381,7 @@ Specified using `--task generate`.
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
Since some model architectures support both generative and pooling tasks,
|
||||
you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
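For instance, a hedged sketch of serving one of the embedding models listed below in pooling mode (the model name is illustrative):

```console
vllm serve intfloat/e5-mistral-7b-instruct --task embed
```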
|
||||
|
||||
@ -387,18 +389,19 @@ See [this page](./pooling_models.md) for more information on how to use pooling
|
||||
|
||||
Specified using `--task embed`.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
|
||||
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
|
||||
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
|
||||
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ︎ | |
|
||||
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ︎ | ︎ |
|
||||
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ︎ | ︎ |
|
||||
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ |
|
||||
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------|
|
||||
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | |
|
||||
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | |
|
||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
|
||||
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ︎ | | |
|
||||
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ︎ | ︎ | |
|
||||
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ︎ | ︎ | |
|
||||
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ | |
|
||||
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | |
|
||||
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | |
|
||||
| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | |
|
||||
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | |
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
@ -422,16 +425,16 @@ of the whole prompt are extracted from the normalized hidden state corresponding
|
||||
|
||||
Specified using `--task reward`.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | |
|
||||
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | |
|
||||
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | |
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
@ -439,9 +442,9 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
Specified using `--task classify`.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
|
||||
|
||||
If your model is not in the above list, we will try to automatically convert the model using
|
||||
[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
|
||||
@ -450,12 +453,19 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
Specified using `--task score`.
|
||||
|
||||
| Architecture | Models | Example HF Models |
|
||||
|---------------------------------------|-------------------|----------------------------------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. |
|
||||
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|
||||
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
|
||||
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
|
||||
|
||||
!!! note
|
||||
Load the official original `Qwen3 Reranker` using the following command. More information can be found at: <gh-file:examples/offline_inference/qwen3_reranker.py>.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
## List of Multimodal Language Models
|
||||
@ -477,7 +487,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
|
||||
|
||||
See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model.
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
|
||||
or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
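A minimal offline-inference sketch (the model name is illustrative; online serving takes the equivalent setting via the CLI flag):

```python
from vllm import LLM

# Allow up to 4 images per text prompt (vLLM V0).
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    limit_mm_per_prompt={"image": 4},
)
```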
|
||||
|
||||
@ -513,45 +523,45 @@ Specified using `--task generate`.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ |
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ |
|
||||
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
|
||||
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | |
|
||||
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
|
||||
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
|
||||
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* |
|
||||
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
|
||||
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
|
||||
• For example, to use DeepSeek-VL2 series models:
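The override below is a hedged sketch; the architecture name is taken from the DeepSeek-VL2 row in the table above and the model name is illustrative:

```console
vllm serve deepseek-ai/deepseek-vl2-tiny --hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'
```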
|
||||
@ -628,11 +638,21 @@ Specified using `--task generate`.
|
||||
Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1.
|
||||
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
|
||||
|
||||
#### Transcription
|
||||
|
||||
Specified using `--task transcription`.
|
||||
|
||||
Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|----------------------------------------------|------------------|------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
Since some model architectures support both generative and pooling tasks,
|
||||
you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
|
||||
|
||||
@ -647,19 +667,10 @@ Any text generation model can be converted into an embedding model by passing `-
|
||||
|
||||
The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ |
|
||||
|
||||
#### Transcription
|
||||
|
||||
Specified using `--task transcription`.
|
||||
|
||||
Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] |
|
||||
|----------------|----------|---------------------|------------------------|-----------------------------|
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
|
||||
| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
|
||||
|
||||
---
|
||||
|
||||
|
@ -36,7 +36,7 @@ print(completion.choices[0].message)
|
||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, e.g. `extra_body={"top_k": 50}` for `top_k`.
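A hedged sketch using the official OpenAI Python client (the endpoint, API key, and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"top_k": 50},  # vLLM-specific sampling parameter
)
print(completion.choices[0].message)
```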
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
|
||||
|
||||
To disable this behavior, please pass `--generation-config vllm` when launching the server.
|
||||
@ -250,7 +250,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
--chat-template examples/template_vlm2vec.jinja
|
||||
```
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
@ -294,13 +294,13 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
--chat-template examples/template_dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
Like with VLM2Vec, we have to explicitly pass `--task embed`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
|
||||
|
||||
!!! warning
|
||||
!!! important
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
example below for details.
|
||||
|
||||
|
@ -40,7 +40,7 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
|
||||
- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
|
||||
- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
|
||||
- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
|
||||
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs.
|
||||
- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, as it will cause significant delays in startup time.
|
||||
|
||||
## Incorrect network setup
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
# vLLM V1
|
||||
|
||||
**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.**
|
||||
!!! announcement
|
||||
|
||||
We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
|
||||
|
||||
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||
|
||||
@ -32,36 +34,9 @@ Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-
|
||||
|
||||
This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1.
|
||||
|
||||
### Supports Overview
|
||||
#### Hardware
|
||||
## Current Status
|
||||
|
||||
| Hardware | Status |
|
||||
|----------|------------------------------------------|
|
||||
| **NVIDIA** | <nobr>🚀 Natively Supported</nobr> |
|
||||
| **AMD** | <nobr>🚧 WIP</nobr> |
|
||||
| **TPU** | <nobr>🚧 WIP</nobr> |
|
||||
| **CPU** | <nobr>🚧 WIP</nobr> |
|
||||
|
||||
#### Feature / Model
|
||||
|
||||
| Feature / Model | Status |
|
||||
|-----------------|-----------------------------------------------------------------------------------|
|
||||
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
|
||||
| **LoRA** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))</nobr>|
|
||||
| **Spec Decode** | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
|
||||
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
|
||||
| **Structured Output Alternative Backends** | <nobr>🟡 Planned</nobr> |
|
||||
| **Embedding Models** | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
|
||||
| **Mamba Models** | <nobr>🟡 Planned</nobr> |
|
||||
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
|
||||
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
|
||||
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
|
||||
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
|
||||
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
|
||||
For each item, our progress towards V1 support falls into one of the following states:
|
||||
|
||||
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
|
||||
- **🟢 Functional**: Fully operational, with ongoing optimizations.
|
||||
@ -70,15 +45,87 @@ This living user guide outlines a few known **important changes and limitations*
|
||||
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
|
||||
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
|
||||
|
||||
**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same
|
||||
way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
|
||||
allocate a fixed token budget per request, enabling features like chunked prefills,
|
||||
prefix caching, and speculative decoding without a strict separation between prefill
|
||||
and decode phases.
|
||||
### Hardware
|
||||
|
||||
### Semantic Changes and Deprecated Features
|
||||
| Hardware | Status |
|
||||
|------------|------------------------------------|
|
||||
| **NVIDIA** | <nobr>🚀</nobr> |
|
||||
| **AMD** | <nobr>🟢</nobr> |
|
||||
| **TPU** | <nobr>🟢</nobr> |
|
||||
| **CPU** | <nobr>🟢 (x86) 🟡 (MacOS) </nobr> |
|
||||
|
||||
#### Logprobs
|
||||
!!! note
|
||||
|
||||
More hardware platforms may be supported via plugins, e.g.:
|
||||
|
||||
- [vllm-ascend](https://github.com/vllm-project/vllm-ascend)
|
||||
- [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
|
||||
- [vllm-openvino](https://github.com/vllm-project/vllm-openvino)
|
||||
|
||||
Please check their corresponding repositories for more details.
|
||||
|
||||
### Models
|
||||
|
||||
| Model Type | Status |
|
||||
|-----------------------------|------------------------------------------------------------------------------------|
|
||||
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
|
||||
| **Embedding Models** | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
|
||||
| **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> |
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
|
||||
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol.
|
||||
|
||||
!!! tip
|
||||
|
||||
This corresponds to the V1 column in our [list of supported models][supported-models].
|
||||
|
||||
See below for the status of models that are still not yet supported in V1.
|
||||
|
||||
#### Embedding Models
|
||||
|
||||
The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
|
||||
|
||||
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
|
||||
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
|
||||
to enable simultaneous generation and embedding using the same engine instance in V1.
|
||||
|
||||
#### Mamba Models
|
||||
|
||||
Models using selective state-space mechanisms instead of standard transformer attention (e.g., `MambaForCausalLM`, `JambaForCausalLM`)
|
||||
will be supported via [PR #19327](https://github.com/vllm-project/vllm/pull/19327).
|
||||
|
||||
#### Encoder-Decoder Models
|
||||
|
||||
Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)
|
||||
are not yet supported.
|
||||
|
||||
### Features
|
||||
|
||||
| Feature | Status |
|
||||
|---------------------------------------------|-----------------------------------------------------------------------------------|
|
||||
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
|
||||
| **LoRA** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))</nobr>|
|
||||
| **Spec Decode** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
|
||||
| **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> |
|
||||
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
|
||||
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
|
||||
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
|
||||
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
|
||||
|
||||
!!! note
|
||||
|
||||
vLLM V1’s unified scheduler treats both prompt and output tokens the same
|
||||
way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
|
||||
allocate a fixed token budget per request, enabling features like chunked prefills,
|
||||
prefix caching, and speculative decoding without a strict separation between prefill
|
||||
and decode phases.
|
||||
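A toy sketch of the idea (invented names and numbers, not vLLM's actual scheduler code):

```python
# Toy illustration: every running request reports how many tokens it still
# needs this step (prompt or output alike), and the scheduler hands out
# slices of one fixed per-step budget.
TOKEN_BUDGET = 8192

def schedule_step(num_tokens_needed: dict[str, int]) -> dict[str, int]:
    """Return {request_id: num_tokens} granted for this step."""
    scheduled: dict[str, int] = {}
    remaining = TOKEN_BUDGET
    for request_id, needed in num_tokens_needed.items():
        if remaining == 0:
            break
        granted = min(needed, remaining)
        scheduled[request_id] = granted
        remaining -= granted
    return scheduled

# One long prefill and two decodes share the same budget:
print(schedule_step({"req-0": 7000, "req-1": 1, "req-2": 1}))
```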
|
||||
#### Semantic Changes to Logprobs
|
||||
|
||||
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
|
||||
differences compared to V0:
|
||||
@ -115,46 +162,4 @@ to handle request preemptions.
|
||||
|
||||
**Structured Output features**
|
||||
|
||||
- **Request-level Structured Output Backend**: Deprecated, alternative backends
|
||||
(outlines, guidance) with fallbacks is WIP.
|
||||
### Feature & Model Support in Progress
|
||||
|
||||
Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
|
||||
|
||||
#### Features to Be Optimized
|
||||
|
||||
These features are already supported in vLLM V1, but their optimization is still
|
||||
in progress.
|
||||
|
||||
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
|
||||
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.
|
||||
|
||||
- **Multimodal Models**: V1 is almost fully compatible with V0 except that interleaved modality input is not supported yet.
|
||||
See [here](https://github.com/orgs/vllm-project/projects/8) for the status of upcoming features and optimizations.
|
||||
|
||||
#### Features to Be Supported
|
||||
|
||||
- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
|
||||
supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
|
||||
Details about the structured outputs can be found
|
||||
[here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
|
||||
|
||||
#### Models to Be Supported
|
||||
|
||||
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
|
||||
and the majority fall into the following categories. V1 support for these models will be added eventually.
|
||||
|
||||
**Embedding Models**
|
||||
The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
|
||||
|
||||
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1.
|
||||
|
||||
**Mamba Models**
|
||||
Models using selective state-space mechanisms (instead of standard transformer attention)
|
||||
are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
|
||||
|
||||
**Encoder-Decoder Models**
|
||||
vLLM V1 is currently optimized for decoder-only transformers. Models requiring
|
||||
cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
|
||||
|
||||
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
- **Request-level Structured Output Backend**: Deprecated; alternative backends (outlines, guidance) with fallbacks are now supported.
|
||||
|
@ -70,7 +70,7 @@ Try one yourself by passing one of the following models to the `--model` argumen
|
||||
|
||||
vLLM supports models that are quantized using GGUF.
|
||||
|
||||
Try one yourself by downloading a GUFF quantised model and using the following arguments:
|
||||
Try one yourself by downloading a quantized GGUF model and using the following arguments:
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
@ -137,4 +137,8 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(
|
||||
"[WARNING] Use examples/offline_inference/spec_decode.py"
|
||||
" instead of this script."
|
||||
)
|
||||
main()
|
||||
|
@ -64,7 +64,7 @@ def print_outputs(outputs):
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def main():
|
||||
assert (
|
||||
len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
|
||||
), f"""Text, image prompts and sampling parameters should have the
|
||||
@ -104,3 +104,7 @@ if __name__ == "__main__":
|
||||
# test batch-size = 4
|
||||
outputs = llm.generate(batched_inputs, batched_sample_params)
|
||||
print_outputs(outputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -70,7 +70,7 @@ def main(args: argparse.Namespace):
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the latency of processing a single batch of "
|
||||
"requests till completion."
|
||||
@ -102,5 +102,9 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
|
examples/offline_inference/qwen3_reranker.py (new file, 77 lines)
@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
model_name = "Qwen/Qwen3-Reranker-0.6B"
|
||||
|
||||
# What is the difference between the official original version and one
# that has been converted into a sequence classification model?
# Qwen3-Reranker is a language model that performs reranking by using the
# logits of the "no" and "yes" tokens.
# It needs to compute the logits for all 151669 vocabulary tokens, which makes
# this method extremely inefficient, not to mention incompatible with the
# vLLM score API.
# A method for converting the original model into a sequence classification
# model was proposed. See: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
# Models converted offline using this method are not only more efficient and
# compatible with the vLLM score API, but also have more concise init
# parameters, for example:
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
|
||||
|
||||
# If you want to load the official original version, the init parameters are
|
||||
# as follows.
|
||||
|
||||
model = LLM(
|
||||
model=model_name,
|
||||
task="score",
|
||||
hf_overrides={
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
},
|
||||
)
|
||||
|
||||
# Why do we need hf_overrides for the official original version:
# vLLM converts it to Qwen3ForSequenceClassification when loaded for
# better performance.
# - First, we use `"architectures": ["Qwen3ForSequenceClassification"]`
#   to manually route to Qwen3ForSequenceClassification.
# - Then, we extract the vectors corresponding to classifier_from_token
#   from lm_head using `"classifier_from_token": ["no", "yes"]`.
# - Third, we convert these two vectors into a single vector; this conversion
#   logic is enabled by `"is_original_qwen3_reranker": True`.
|
||||
|
||||
# Please use the query_template and document_template to format the query and
|
||||
# document for better reranker results.
|
||||
|
||||
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
|
||||
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
|
||||
|
||||
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
|
||||
document_template = "<Document>: {doc}{suffix}"
|
||||
|
||||
if __name__ == "__main__":
|
||||
instruction = (
|
||||
"Given a web search query, retrieve relevant passages that answer the query"
|
||||
)
|
||||
|
||||
queries = [
|
||||
"What is the capital of China?",
|
||||
"Explain gravity",
|
||||
]
|
||||
|
||||
documents = [
|
||||
"The capital of China is Beijing.",
|
||||
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
|
||||
]
|
||||
|
||||
queries = [
|
||||
query_template.format(prefix=prefix, instruction=instruction, query=query)
|
||||
for query in queries
|
||||
]
|
||||
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
|
||||
|
||||
outputs = model.score(queries, documents)
|
||||
|
||||
print([output.outputs.score for output in outputs])
|
examples/offline_inference/spec_decode.py (new file, 137 lines)
@ -0,0 +1,137 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
|
||||
from vllm.v1.metrics.reader import Counter, Vector
|
||||
|
||||
try:
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser()
|
||||
add_dataset_parser(parser)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="./examples/data/gsm8k.jsonl",
|
||||
help="downloaded from the eagle repo "
|
||||
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--method", type=str, default="eagle", choices=["ngram", "eagle", "eagle3"]
|
||||
)
|
||||
parser.add_argument("--max-num-seqs", type=int, default=8)
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
parser.add_argument("--prompt-lookup-min", type=int, default=2)
|
||||
parser.add_argument("--tp", type=int, default=1)
|
||||
parser.add_argument("--draft-tp", type=int, default=1)
|
||||
parser.add_argument("--enforce-eager", action="store_true")
|
||||
parser.add_argument("--enable-chunked-prefill", action="store_true")
|
||||
parser.add_argument("--max-num-batched-tokens", type=int, default=2048)
|
||||
parser.add_argument("--temp", type=float, default=0)
|
||||
parser.add_argument("--top-p", type=float, default=1.0)
|
||||
parser.add_argument("--top-k", type=int, default=-1)
|
||||
parser.add_argument("--print-output", action="store_true")
|
||||
parser.add_argument("--output-len", type=int, default=256)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
args.endpoint_type = "openai-chat"
|
||||
|
||||
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
||||
max_model_len = 2048
|
||||
|
||||
prompts = get_samples(args, tokenizer)
|
||||
# add_special_tokens is False to avoid adding bos twice when using chat templates
|
||||
prompt_ids = [
|
||||
tokenizer.encode(prompt.prompt, add_special_tokens=False) for prompt in prompts
|
||||
]
|
||||
|
||||
if args.method == "eagle" or args.method == "eagle3":
|
||||
if args.method == "eagle":
|
||||
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
|
||||
elif args.method == "eagle3":
|
||||
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
speculative_config = {
|
||||
"method": args.method,
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
elif args.method == "ngram":
|
||||
speculative_config = {
|
||||
"method": "ngram",
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"prompt_lookup_max": args.prompt_lookup_max,
|
||||
"prompt_lookup_min": args.prompt_lookup_min,
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"unknown method: {args.method}")
|
||||
|
||||
llm = LLM(
|
||||
model=model_dir,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=args.tp,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
max_num_batched_tokens=args.max_num_batched_tokens,
|
||||
enforce_eager=args.enforce_eager,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)
|
||||
|
||||
# print the generated text
|
||||
if args.print_output:
|
||||
for output in outputs:
|
||||
print("-" * 50)
|
||||
print(f"prompt: {output.prompt}")
|
||||
print(f"generated text: {output.outputs[0].text}")
|
||||
print("-" * 50)
|
||||
|
||||
try:
|
||||
metrics = llm.get_metrics()
|
||||
except AssertionError:
|
||||
print("Metrics are not supported in the V0 engine.")
|
||||
return
|
||||
|
||||
num_drafts = num_accepted = 0
|
||||
acceptance_counts = [0] * args.num_spec_tokens
|
||||
for metric in metrics:
|
||||
if metric.name == "vllm:spec_decode_num_drafts":
|
||||
assert isinstance(metric, Counter)
|
||||
num_drafts += metric.value
|
||||
elif metric.name == "vllm:spec_decode_num_accepted_tokens":
|
||||
assert isinstance(metric, Counter)
|
||||
num_accepted += metric.value
|
||||
elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
|
||||
assert isinstance(metric, Vector)
|
||||
for pos in range(len(metric.values)):
|
||||
acceptance_counts[pos] += metric.values[pos]
|
||||
|
||||
print("-" * 50)
|
||||
print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
|
||||
print("-" * 50)
|
||||
|
||||
# print acceptance at each token position
|
||||
for i in range(len(acceptance_counts)):
|
||||
print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
# NOTE: CAUTION! The original LLaVA models weren't really trained on multi-image inputs,
# so they may generate poor responses for multi-image prompts!
|
||||
model_name = "llava-hf/llava-1.5-7b-hf"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_num_seqs=16,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=16,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=16384,
|
||||
max_num_seqs=16,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
@ -737,6 +837,9 @@ model_example_map = {
|
||||
"idefics3": load_idefics3,
|
||||
"internvl_chat": load_internvl,
|
||||
"kimi_vl": load_kimi_vl,
|
||||
"llava": load_llava,
|
||||
"llava-next": load_llava_next,
|
||||
"llava-onevision": load_llava_onevision,
|
||||
"llama4": load_llama4,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
|
examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py (new file, 154 lines)
@ -0,0 +1,154 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import os
|
||||
import socket
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
import aiohttp
|
||||
import msgpack
|
||||
import zmq
|
||||
from quart import Quart, make_response, request
|
||||
|
||||
count = 0
|
||||
prefill_instances: dict[str, str] = {} # http_address: zmq_address
|
||||
decode_instances: dict[str, str] = {} # http_address: zmq_address
|
||||
|
||||
prefill_cv = threading.Condition()
|
||||
decode_cv = threading.Condition()
|
||||
|
||||
|
||||
def _listen_for_register(poller, router_socket):
|
||||
while True:
|
||||
socks = dict(poller.poll())
|
||||
if router_socket in socks:
|
||||
remote_address, message = router_socket.recv_multipart()
|
||||
# data: {"type": "P", "http_address": "ip:port",
|
||||
# "zmq_address": "ip:port"}
|
||||
data = msgpack.loads(message)
|
||||
if data["type"] == "P":
|
||||
global prefill_instances
|
||||
global prefill_cv
|
||||
with prefill_cv:
|
||||
prefill_instances[data["http_address"]] = data["zmq_address"]
|
||||
elif data["type"] == "D":
|
||||
global decode_instances
|
||||
global decode_cv
|
||||
with decode_cv:
|
||||
decode_instances[data["http_address"]] = data["zmq_address"]
|
||||
else:
|
||||
print(
    f"Unexpected message received from {remote_address}, "
    f"data: {data}"
)
|
||||
|
||||
|
||||
def start_service_discovery(hostname, port):
|
||||
if not hostname:
|
||||
hostname = socket.gethostname()
|
||||
if port == 0:
|
||||
raise ValueError("Port cannot be 0")
|
||||
|
||||
context = zmq.Context()
|
||||
router_socket = context.socket(zmq.ROUTER)
|
||||
router_socket.bind(f"tcp://{hostname}:{port}")
|
||||
|
||||
poller = zmq.Poller()
|
||||
poller.register(router_socket, zmq.POLLIN)
|
||||
|
||||
_listener_thread = threading.Thread(
|
||||
target=_listen_for_register, args=[poller, router_socket], daemon=True
|
||||
)
|
||||
_listener_thread.start()
|
||||
return _listener_thread
|
||||
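# Illustrative sketch (an assumption, not part of the original proxy): a
# prefill ("P") or decode ("D") worker could announce itself to this proxy by
# sending a msgpack-encoded dict over a ZeroMQ DEALER socket. This helper is
# defined for illustration only and is never called here.
def _example_register_worker(proxy_addr: str, role: str, http_addr: str, zmq_addr: str):
    ctx = zmq.Context()
    sock = ctx.socket(zmq.DEALER)
    sock.connect(f"tcp://{proxy_addr}")  # e.g. "127.0.0.1:30001"
    sock.send(
        msgpack.dumps(
            {"type": role, "http_address": http_addr, "zmq_address": zmq_addr}
        )
    )
    sock.close()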
|
||||
|
||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||
|
||||
app = Quart(__name__)
|
||||
|
||||
|
||||
def random_uuid() -> str:
|
||||
return str(uuid.uuid4().hex)
|
||||
|
||||
|
||||
async def forward_request(url, data, request_id):
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"X-Request-Id": request_id,
|
||||
}
|
||||
async with session.post(url=url, json=data, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
# Stream the upstream response back to the caller in chunks.
# (A non-streaming `await response.read()` would also work, but it
# buffers the entire body in memory.)
async for chunk_bytes in response.content.iter_chunked(1024):
    yield chunk_bytes
|
||||
|
||||
|
||||
@app.route("/v1/completions", methods=["POST"])
|
||||
async def handle_request():
|
||||
try:
|
||||
original_request_data = await request.get_json()
|
||||
|
||||
prefill_request = original_request_data.copy()
|
||||
# change max_tokens = 1 to let it only do prefill
|
||||
prefill_request["max_tokens"] = 1
|
||||
|
||||
global count
|
||||
global prefill_instances
|
||||
global prefill_cv
|
||||
with prefill_cv:
|
||||
prefill_list = list(prefill_instances.items())
|
||||
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
|
||||
|
||||
global decode_instances
|
||||
global decode_cv
|
||||
with decode_cv:
|
||||
decode_list = list(decode_instances.items())
|
||||
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
|
||||
|
||||
print(
|
||||
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
|
||||
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
|
||||
f"ZMQ:{decode_zmq_addr}]"
|
||||
)
|
||||
count += 1
|
||||
|
||||
request_id = (
|
||||
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
|
||||
f"{decode_zmq_addr}_{random_uuid()}"
|
||||
)
|
||||
|
||||
# finish prefill
|
||||
async for _ in forward_request(
|
||||
f"http://{prefill_addr}/v1/completions", prefill_request, request_id
|
||||
):
|
||||
continue
|
||||
|
||||
# return decode
|
||||
generator = forward_request(
|
||||
f"http://{decode_addr}/v1/completions", original_request_data, request_id
|
||||
)
|
||||
response = await make_response(generator)
|
||||
response.timeout = None
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
exc_info = sys.exc_info()
|
||||
print("Error occurred in disagg prefill proxy server")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
t = start_service_discovery("0.0.0.0", 30001)
|
||||
app.run(host="0.0.0.0", port=10001)
|
||||
t.join()
|
@ -1,175 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
To run this example, you need to start the vLLM server:
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from openai import BadRequestError, OpenAI
|
||||
from pydantic import BaseModel
|
||||
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
def guided_choice_completion(client: OpenAI, model: str):
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Guided decoding by Regex
|
||||
def guided_regex_completion(client: OpenAI, model: str):
|
||||
prompt = (
|
||||
"Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n"
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
truck = "Truck"
|
||||
coupe = "Coupe"
|
||||
|
||||
|
||||
class CarDescription(BaseModel):
|
||||
brand: str
|
||||
model: str
|
||||
car_type: CarType
|
||||
|
||||
|
||||
def guided_json_completion(client: OpenAI, model: str):
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
|
||||
prompt = (
|
||||
"Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's"
|
||||
)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Guided decoding by Grammar
|
||||
def guided_grammar_completion(client: OpenAI, model: str):
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
|
||||
column ::= "col_1 " | "col_2 "
|
||||
|
||||
table ::= "table_1 " | "table_2 "
|
||||
|
||||
condition ::= column "= " number
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
|
||||
prompt = (
|
||||
"Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table."
|
||||
)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Extra backend options
|
||||
def extra_backend_options_completion(client: OpenAI, model: str):
|
||||
prompt = (
|
||||
"Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n"
|
||||
)
|
||||
|
||||
try:
|
||||
# The guided_decoding_disable_fallback option forces vLLM to use
|
||||
# xgrammar, so when it fails you get a 400 with the reason why
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={
|
||||
"guided_regex": r"\w+@\w+\.com\n",
|
||||
"stop": ["\n"],
|
||||
"guided_decoding_disable_fallback": True,
|
||||
},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
except BadRequestError as e:
|
||||
print("This error is expected:", e)
|
||||
|
||||
|
||||
def main():
|
||||
client: OpenAI = OpenAI(
|
||||
base_url=openai_api_base,
|
||||
api_key=openai_api_key,
|
||||
)
|
||||
|
||||
model = client.models.list().data[0].id
|
||||
|
||||
print("Guided Choice Completion:")
|
||||
print(guided_choice_completion(client, model))
|
||||
|
||||
print("\nGuided Regex Completion:")
|
||||
print(guided_regex_completion(client, model))
|
||||
|
||||
print("\nGuided JSON Completion:")
|
||||
print(guided_json_completion(client, model))
|
||||
|
||||
print("\nGuided Grammar Completion:")
|
||||
print(guided_grammar_completion(client, model))
|
||||
|
||||
print("\nExtra Backend Options Completion:")
|
||||
print(extra_backend_options_completion(client, model))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -1,87 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from openai import OpenAI
|
||||
|
||||
# This example demonstrates the `structural_tag` response format.
|
||||
# It can be used to specify a structured output format that occurs between
|
||||
# specific tags in the response. This example shows how it could be used
|
||||
# to enforce the format of a tool call response, but it could be used for
|
||||
# any structured output within a subset of the response.
|
||||
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
def main():
|
||||
client = OpenAI(
|
||||
base_url=openai_api_base,
|
||||
api_key=openai_api_key,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": """
|
||||
You have access to the following function to retrieve the weather in a city:
|
||||
|
||||
{
|
||||
"name": "get_weather",
|
||||
"parameters": {
|
||||
"city": {
|
||||
"param_type": "string",
|
||||
"description": "The city to get the weather for",
|
||||
"required": True
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
If a you choose to call a function ONLY reply in the following format:
|
||||
<{start_tag}={function_name}>{parameters}{end_tag}
|
||||
where
|
||||
|
||||
start_tag => `<function`
|
||||
parameters => a JSON dict with the function argument name as key and function
|
||||
argument value as value.
|
||||
end_tag => `</function>`
|
||||
|
||||
Here is an example,
|
||||
<function=example_function_name>{"example_name": "example_value"}</function>
|
||||
|
||||
Reminder:
|
||||
- Function calls MUST follow the specified format
|
||||
- Required parameters MUST be specified
|
||||
- Only call one function at a time
|
||||
- Put the entire function call reply on one line
|
||||
- Always add your sources when using search results to answer the user query
|
||||
|
||||
You are a helpful assistant.
|
||||
|
||||
Given the previous instructions, what is the weather in New York City, Boston,
|
||||
and San Francisco?
|
||||
""",
|
||||
}
|
||||
]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=messages,
|
||||
response_format={
|
||||
"type": "structural_tag",
|
||||
"structures": [
|
||||
{
|
||||
"begin": "<function=get_weather>",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {"city": {"type": "string"}},
|
||||
},
|
||||
"end": "</function>",
|
||||
}
|
||||
],
|
||||
"triggers": ["<function="],
|
||||
},
|
||||
)
|
||||
print(response)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -1,167 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
An example shows how to generate structured outputs from reasoning models
|
||||
like DeepSeekR1. The thinking process will not be guided by the JSON
|
||||
schema provided by the user. Only the final output will be structured.
|
||||
|
||||
To run this example, you need to start the vLLM server with the reasoning
|
||||
parser:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
|
||||
--reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions from reasoning models
|
||||
using the OpenAI Python client library.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from openai import OpenAI
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
|
||||
def print_completion_details(completion):
|
||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
|
||||
|
||||
# Guided decoding by Regex
|
||||
def guided_regex_completion(client: OpenAI, model: str):
|
||||
prompt = "What is the capital of France?"
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={
|
||||
"guided_regex": "(Paris|London)",
|
||||
},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
class People(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
|
||||
|
||||
def guided_json_completion(client: OpenAI, model: str):
|
||||
json_schema = People.model_json_schema()
|
||||
|
||||
prompt = "Generate a JSON with the name and age of one random person."
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
truck = "Truck"
|
||||
coupe = "Coupe"
|
||||
|
||||
|
||||
class CarDescription(BaseModel):
|
||||
brand: str
|
||||
model: str
|
||||
car_type: CarType
|
||||
|
||||
|
||||
def guided_car_json_completion(client: OpenAI, model: str):
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
|
||||
prompt = (
|
||||
"Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's"
|
||||
)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
# Guided decoding by Grammar
|
||||
def guided_grammar_completion(client: OpenAI, model: str):
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
|
||||
column ::= "col_1 " | "col_2 "
|
||||
|
||||
table ::= "table_1 " | "table_2 "
|
||||
|
||||
condition ::= column "= " number
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
|
||||
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
|
||||
prompt = (
|
||||
"Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table."
|
||||
)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
def main():
|
||||
client: OpenAI = OpenAI(
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model: str = models.data[0].id
|
||||
|
||||
print("Guided Regex Completion:")
|
||||
guided_regex_completion(client, model)
|
||||
|
||||
print("\nGuided JSON Completion (People):")
|
||||
guided_json_completion(client, model)
|
||||
|
||||
print("\nGuided JSON Completion (CarDescription):")
|
||||
guided_car_json_completion(client, model)
|
||||
|
||||
print("\nGuided Grammar Completion:")
|
||||
guided_grammar_completion(client, model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -11,6 +11,7 @@ Features:
|
||||
- Streaming response display
|
||||
- Configurable API endpoint
|
||||
- Real-time chat history
|
||||
- Reasoning Display: Optional thinking process visualization
|
||||
|
||||
Requirements:
|
||||
pip install streamlit openai
|
||||
@ -51,13 +52,33 @@ if "messages" not in st.session_state:
|
||||
if "active_session" not in st.session_state:
|
||||
st.session_state.active_session = None
|
||||
|
||||
# Add new session state for reasoning
|
||||
if "show_reasoning" not in st.session_state:
|
||||
st.session_state.show_reasoning = {}
|
||||
|
||||
# Initialize session state for API base URL
|
||||
if "api_base_url" not in st.session_state:
|
||||
st.session_state.api_base_url = openai_api_base
|
||||
|
||||
|
||||
def create_new_chat_session():
|
||||
"""Create a new chat session with timestamp as ID"""
|
||||
"""Create a new chat session with timestamp as unique identifier.
|
||||
|
||||
This function initializes a new chat session by:
|
||||
1. Generating a timestamp-based session ID
|
||||
2. Creating an empty message list for the new session
|
||||
3. Setting the new session as both current and active session
|
||||
4. Resetting the messages list for the new session
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
Session State Updates:
|
||||
- sessions: Adds new empty message list with timestamp key
|
||||
- current_session: Sets to new session ID
|
||||
- active_session: Sets to new session ID
|
||||
- messages: Resets to empty list
|
||||
"""
|
||||
session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
st.session_state.sessions[session_id] = []
|
||||
st.session_state.current_session = session_id
|
||||
@ -66,30 +87,98 @@ def create_new_chat_session():
|
||||
|
||||
|
||||
def switch_to_chat_session(session_id):
|
||||
"""Switch to a different chat session"""
|
||||
"""Switch the active chat context to a different session.
|
||||
|
||||
Args:
|
||||
session_id (str): The timestamp ID of the session to switch to
|
||||
|
||||
This function handles chat session switching by:
|
||||
1. Setting the specified session as current
|
||||
2. Updating the active session marker
|
||||
3. Loading the messages history from the specified session
|
||||
|
||||
Session State Updates:
|
||||
- current_session: Updated to specified session_id
|
||||
- active_session: Updated to specified session_id
|
||||
- messages: Loaded from sessions[session_id]
|
||||
"""
|
||||
st.session_state.current_session = session_id
|
||||
st.session_state.active_session = session_id
|
||||
st.session_state.messages = st.session_state.sessions[session_id]
|
||||
|
||||
|
||||
def get_llm_response(messages, model):
|
||||
"""Get streaming response from llm
|
||||
def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
|
||||
"""Generate and stream LLM response with optional reasoning process.
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries
|
||||
model: Name of model
|
||||
messages (list): List of conversation message dicts with 'role' and 'content'
|
||||
model (str): The model identifier to use for generation
|
||||
reason (bool): Whether to enable and display reasoning process
|
||||
content_ph (streamlit.empty): Placeholder for streaming response content
|
||||
reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process
|
||||
|
||||
Returns:
|
||||
Streaming response object or error message string
|
||||
tuple: (str, str)
|
||||
- First string contains the complete response text
|
||||
- Second string contains the complete reasoning text (if enabled)
|
||||
|
||||
Features:
|
||||
- Streams both reasoning and response text in real-time
|
||||
- Handles model API errors gracefully
|
||||
- Supports live updating of thinking process
|
||||
- Maintains separate content and reasoning displays
|
||||
|
||||
Raises:
|
||||
Exception: Wrapped in error message if API call fails
|
||||
|
||||
Note:
|
||||
The function uses streamlit placeholders for live updates.
|
||||
When reason=True, the reasoning process appears above the response.
|
||||
"""
|
||||
full_text = ""
|
||||
think_text = ""
|
||||
live_think = None
|
||||
# Build request parameters
|
||||
params = {"model": model, "messages": messages, "stream": True}
|
||||
if reason:
|
||||
params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
|
||||
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model=model, messages=messages, stream=True
|
||||
)
|
||||
return response
|
||||
response = client.chat.completions.create(**params)
|
||||
if isinstance(response, str):
|
||||
if content_ph:
|
||||
content_ph.markdown(response)
|
||||
return response, ""
|
||||
|
||||
# Prepare reasoning expander above content
|
||||
if reason and reasoning_ph:
|
||||
exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
|
||||
live_think = exp.empty()
|
||||
|
||||
# Stream chunks
|
||||
for chunk in response:
|
||||
delta = chunk.choices[0].delta
|
||||
# Stream reasoning first
|
||||
if reason and hasattr(delta, "reasoning_content") and live_think:
|
||||
rc = delta.reasoning_content
|
||||
if rc:
|
||||
think_text += rc
|
||||
live_think.markdown(think_text + "▌")
|
||||
# Then stream content
|
||||
if hasattr(delta, "content") and delta.content and content_ph:
|
||||
full_text += delta.content
|
||||
content_ph.markdown(full_text + "▌")
|
||||
|
||||
# Finalize displays: reasoning remains above, content below
|
||||
if reason and live_think:
|
||||
live_think.markdown(think_text)
|
||||
if content_ph:
|
||||
content_ph.markdown(full_text)
|
||||
|
||||
return full_text, think_text
|
||||
except Exception as e:
|
||||
st.error(f"Error details: {str(e)}")
|
||||
return f"Error: {str(e)}"
|
||||
return f"Error: {str(e)}", ""
|
||||
|
||||
|
||||
# Sidebar - API Settings first
|
||||
@ -108,6 +197,7 @@ st.sidebar.title("Chat Sessions")
|
||||
if st.sidebar.button("New Session"):
|
||||
create_new_chat_session()
|
||||
|
||||
|
||||
# Display all sessions in reverse chronological order
|
||||
for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
|
||||
# Mark the active session with a pinned button
|
||||
@ -143,47 +233,79 @@ if st.session_state.current_session is None:
|
||||
create_new_chat_session()
|
||||
st.session_state.active_session = st.session_state.current_session
|
||||
|
||||
# Display chat history for current session
|
||||
for message in st.session_state.messages:
|
||||
with st.chat_message(message["role"]):
|
||||
st.write(message["content"])
|
||||
# Update the chat history display section
|
||||
for idx, msg in enumerate(st.session_state.messages):
|
||||
# Render user messages normally
|
||||
if msg["role"] == "user":
|
||||
with st.chat_message("user"):
|
||||
st.write(msg["content"])
|
||||
# Render assistant messages with reasoning above
|
||||
else:
|
||||
# If reasoning exists for this assistant message, show it above the content
|
||||
if idx in st.session_state.show_reasoning:
|
||||
with st.expander("💭 Thinking Process", expanded=False):
|
||||
st.markdown(st.session_state.show_reasoning[idx])
|
||||
with st.chat_message("assistant"):
|
||||
st.write(msg["content"])
|
||||
|
||||
# Handle user input and generate llm response
|
||||
|
||||
# Setup & Cache reasoning support check
|
||||
@st.cache_data(show_spinner=False)
|
||||
def server_supports_reasoning():
|
||||
"""Check if the current model supports reasoning capability.
|
||||
|
||||
Returns:
|
||||
bool: True if the model supports reasoning, False otherwise
|
||||
"""
|
||||
resp = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "Hi"}],
|
||||
stream=False,
|
||||
)
|
||||
return hasattr(resp.choices[0].message, "reasoning_content") and bool(
|
||||
resp.choices[0].message.reasoning_content
|
||||
)
|
||||
|
||||
|
||||
# Check support
|
||||
supports_reasoning = server_supports_reasoning()
|
||||
|
||||
# Add reasoning toggle in sidebar if supported
|
||||
reason = False # Default to False
|
||||
if supports_reasoning:
|
||||
reason = st.sidebar.checkbox("Enable Reasoning", value=False)
|
||||
else:
|
||||
st.sidebar.markdown(
|
||||
"<span style='color:gray;'>Reasoning unavailable for this model.</span>",
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
# reason remains False
|
||||
|
||||
# Update the input handling section
|
||||
if prompt := st.chat_input("Type your message here..."):
|
||||
# Save user message to session
|
||||
# Save and display user message
|
||||
st.session_state.messages.append({"role": "user", "content": prompt})
|
||||
st.session_state.sessions[st.session_state.current_session] = (
|
||||
st.session_state.messages
|
||||
)
|
||||
|
||||
# Display user message
|
||||
with st.chat_message("user"):
|
||||
st.write(prompt)
|
||||
|
||||
# Prepare messages for llm
|
||||
messages_for_llm = [
|
||||
# Prepare LLM messages
|
||||
msgs = [
|
||||
{"role": m["role"], "content": m["content"]} for m in st.session_state.messages
|
||||
]
|
||||
|
||||
# Generate and display llm response
|
||||
# Stream assistant response
|
||||
with st.chat_message("assistant"):
|
||||
message_placeholder = st.empty()
|
||||
full_response = ""
|
||||
|
||||
# Get streaming response from llm
|
||||
response = get_llm_response(messages_for_llm, model)
|
||||
if isinstance(response, str):
|
||||
message_placeholder.markdown(response)
|
||||
full_response = response
|
||||
else:
|
||||
for chunk in response:
|
||||
if hasattr(chunk.choices[0].delta, "content"):
|
||||
content = chunk.choices[0].delta.content
|
||||
if content:
|
||||
full_response += content
|
||||
message_placeholder.markdown(full_response + "▌")
|
||||
|
||||
message_placeholder.markdown(full_response)
|
||||
|
||||
# Save llm response to session history
|
||||
st.session_state.messages.append({"role": "assistant", "content": full_response})
|
||||
# Placeholders: reasoning above, content below
|
||||
reason_ph = st.empty()
|
||||
content_ph = st.empty()
|
||||
full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
|
||||
# Determine index for this new assistant message
|
||||
message_index = len(st.session_state.messages)
|
||||
# Save assistant reply
|
||||
st.session_state.messages.append({"role": "assistant", "content": full})
|
||||
# Persist reasoning in session state if any
|
||||
if reason and think:
|
||||
st.session_state.show_reasoning[message_index] = think
|
||||
|
examples/online_serving/structured_outputs/README.md (new file, 54 lines)
@ -0,0 +1,54 @@
|
||||
# Structured Outputs
|
||||
|
||||
This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
|
||||
It can run an individual constraint type or all of them.
|
||||
It supports both streaming responses and concurrent non-streaming requests.
|
||||
|
||||
To use this example, you must start a vLLM server with any model of your choice.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
|
||||
To serve a reasoning model, you can use the following command:
|
||||
|
||||
```bash
|
||||
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
|
||||
```
|
||||
|
||||
If you want to run this script standalone with `uv`, you can use the following:
|
||||
|
||||
```bash
|
||||
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-output
|
||||
```
|
||||
|
||||
See [feature docs](https://docs.vllm.ai/en/latest/features/structured_outputs.html) for more information.
|
||||
|
||||
!!! tip
|
||||
If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
|
||||
|
||||
## Usage
|
||||
|
||||
Run all constraints, non-streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py
|
||||
```
|
||||
|
||||
Run all constraints, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --stream
|
||||
```
|
||||
|
||||
Run certain constraints, for example `structural_tag` and `regex`, streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --constraint structural_tag regex --stream
|
||||
```
|
||||
|
||||
Run all constraints, with reasoning models and streaming:
|
||||
|
||||
```bash
|
||||
uv run structured_outputs.py --reasoning --stream
|
||||
```
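For orientation, each constraint the script exercises boils down to a single Chat Completions call with one extra constraint field. A minimal sketch of the `guided_choice` case, assuming the server started with the first command above is reachable on `localhost:8000`:

```python
from openai import OpenAI

# Assumes `vllm serve Qwen/Qwen2.5-3B-Instruct` is running locally.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}],
    # vLLM structured-output constraint, passed through as an extra body field.
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)  # constrained to "positive" or "negative"
```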
@@ -0,0 +1,8 @@
[project]
name = "examples-online-structured-outputs"
requires-python = ">=3.9, <3.13"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0"

[project.scripts]
structured-outputs = "structured_outputs:main"
examples/online_serving/structured_outputs/structured_outputs.py (new file, 272 lines)
@@ -0,0 +1,272 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import argparse
import asyncio
import enum
import os
from typing import TYPE_CHECKING, Any, Literal

import openai
import pydantic

if TYPE_CHECKING:
    from openai.types.chat import ChatCompletionChunk


ConstraintsFormat = Literal[
    "choice",
    "regex",
    "json",
    "grammar",
    "structural_tag",
]


async def print_stream_response(
    stream_response: openai.AsyncStream[ChatCompletionChunk],
    title: str,
    args: argparse.Namespace,
):
    print(f"\n\n{title} (Streaming):")

    local_reasoning_header_printed = False
    local_content_header_printed = False

    async for chunk in stream_response:
        delta = chunk.choices[0].delta

        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
        content_chunk_text = delta.content

        if args.reasoning:
            if reasoning_chunk_text:
                if not local_reasoning_header_printed:
                    print(" Reasoning: ", end="")
                    local_reasoning_header_printed = True
                print(reasoning_chunk_text, end="", flush=True)

            if content_chunk_text:
                if not local_content_header_printed:
                    if local_reasoning_header_printed:
                        print()
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
        else:
            if content_chunk_text:
                if not local_content_header_printed:
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
    print()


class CarType(str, enum.Enum):
    SEDAN = "SEDAN"
    SUV = "SUV"
    TRUCK = "TRUCK"
    COUPE = "COUPE"


class CarDescription(pydantic.BaseModel):
    brand: str
    model: str
    car_type: CarType


PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
    "choice": {
        "messages": [
            {
                "role": "user",
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
        "extra_body": {"guided_choice": ["positive", "negative"]},
    },
    "regex": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
            }
        ],
        "extra_body": {
            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
        },
    },
    "json": {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": CarDescription.model_json_schema(),
            },
        },
    },
    "grammar": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        "extra_body": {
            "guided_grammar": """
root ::= select_statement

select_statement ::= "SELECT " column " from " table " where " condition

column ::= "col_1 " | "col_2 "

table ::= "table_1 " | "table_2 "

condition ::= column "= " number

number ::= "1 " | "2 "
""",
        },
    },
    "structural_tag": {
        "messages": [
            {
                "role": "user",
                "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    },
}


async def cli():
    parser = argparse.ArgumentParser(
        description="Run OpenAI Chat Completion with various structured outputs capabilities",
    )
    _ = parser.add_argument(
        "--constraint",
        type=str,
        nargs="+",
        choices=[*list(PARAMS), "*"],
        default=["*"],
        help="Specify which constraint(s) to run.",
    )
    _ = parser.add_argument(
        "--stream",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable streaming output",
    )
    _ = parser.add_argument(
        "--reasoning",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable printing of reasoning traces if available.",
    )
    args = parser.parse_args()

    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
    constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
    model = (await client.models.list()).data[0].id

    if args.stream:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=True,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, stream in zip(constraints, results):
            await print_stream_response(stream, constraint, args)
    else:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=False,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
            if args.reasoning and hasattr(message, "reasoning_content"):
                print(f" Reasoning: {message.reasoning_content or ''}")
            print(f" Content: {message.content!r}")


def main():
    asyncio.run(cli())


if __name__ == "__main__":
    main()
@@ -28,8 +28,8 @@ import os
import time
from dataclasses import asdict

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

@@ -17,8 +17,8 @@ import subprocess
import time
from multiprocessing import Event, Process

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME
from lmcache.v1.cache_engine import LMCacheEngineBuilder

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

@@ -105,7 +105,7 @@ def run_retrieve(store_done, prompts, timeout=1):

def run_lmcache_server(port):
    server_proc = subprocess.Popen(
        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
        ["python", "-m", "lmcache.v1.server", "localhost", str(port)]
    )
    return server_proc
@@ -1,6 +1,7 @@
site_name: vLLM
site_url: https://docs.vllm.ai
repo_url: https://github.com/vllm-project/vllm
edit_uri: edit/main/docs/
exclude_docs: |
  *.inc.md
  *.template.md

@@ -29,10 +30,12 @@ theme:
      icon: material/brightness-2
      name: Switch to system preference
  features:
    - content.action.edit
    - content.code.copy
    - content.tabs.link
    - navigation.tracking
    - navigation.tabs
    - navigation.tabs.sticky
    - navigation.sections
    - navigation.prune
    - navigation.top

@@ -123,6 +126,7 @@ extra_css:
extra_javascript:
  - mkdocs/javascript/run_llm_widget.js
  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
  - mkdocs/javascript/edit_and_feedback.js

# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
@@ -137,10 +137,6 @@ exclude = [
    'vllm/attention/ops/.*\.py$'
]

[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"

[tool.isort]
skip_glob = [
    ".buildkite/*",
Some files were not shown because too many files have changed in this diff.