Compare commits
1 Commits
use-uv-pyt
...
avoid-doub
Author | SHA1 | Date | |
---|---|---|---|
4a0d6ac40b |
@ -76,7 +76,7 @@ steps:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
|
@ -58,8 +58,11 @@ function cpu_tests() {
|
||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
# Note: disable Bart until supports V1
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||
--ignore=tests/models/language/generation/test_bart.py
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||
--ignore=tests/models/language/generation/test_bart.py
|
||||
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
|
@ -35,15 +35,16 @@ docker run \
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
cd tests
|
||||
pytest -v -s v1/core
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/test_metrics
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
pytest -v -s v1/test_utils.py
|
||||
pytest -v -s v1/test_metrics_reader.py
|
||||
'
|
||||
|
@ -50,28 +50,19 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' multimodal
|
||||
- pytest -v -s utils_
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/transformers_utils
|
||||
no_gpu: true
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s transformers_utils
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s utils_ # Utils
|
||||
- pytest -v -s transformers_utils # transformers_utils
|
||||
|
||||
- label: Python-only Installation Test # 10min
|
||||
timeout_in_minutes: 20
|
||||
@ -168,7 +159,10 @@ steps:
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/test_internal_lb_dp.py
|
||||
- tests/v1/test_hybrid_lb_dp.py
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
@ -186,10 +180,10 @@ steps:
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
@ -296,34 +290,24 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/kv_connector/unit
|
||||
- pytest -v -s v1/metrics
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
no_gpu: true
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
||||
|
||||
|
||||
- label: Examples Test # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -479,19 +463,30 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
- label: Tensorizer Test # 14min
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 11min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -525,7 +520,7 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
@ -553,17 +548,10 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/mistral_tool_use
|
||||
commands:
|
||||
- pytest -v -s -m 'not cpu_test' tool_use
|
||||
|
||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s -m 'cpu_test' tool_use
|
||||
- pytest -v -s tool_use
|
||||
- pytest -v -s mistral_tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
@ -603,19 +591,13 @@ steps:
|
||||
- vllm/
|
||||
- tests/models/test_transformers.py
|
||||
- tests/models/test_registry.py
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
||||
|
||||
- label: Basic Models Test (Other CPU) # 5min
|
||||
timeout_in_minutes: 10
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/test_utils.py
|
||||
- tests/models/test_vision.py
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
||||
- pytest -v -s models/test_transformers.py \
|
||||
models/test_registry.py \
|
||||
models/test_utils.py \
|
||||
models/test_vision.py
|
||||
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
@ -785,13 +767,11 @@ steps:
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 38 min
|
||||
timeout_in_minutes: 60
|
||||
@ -845,24 +825,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/models/llama4.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||
- vllm/model_executor/layers/quantization/modelopt.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@ -906,58 +869,48 @@ steps:
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
timeout_in_minutes: 90
|
||||
- label: Distributed Tests (2 GPUs) # 110min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/compilation/
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
- vllm/worker/worker_base.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Model Tests (2 GPUs) # 37min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
||||
- vllm/model_executor/models/
|
||||
- tests/basic_correctness/
|
||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
||||
- tests/models/
|
||||
commands:
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||
# test sequence parallel
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
timeout_in_minutes: 60
|
||||
|
10
.github/CODEOWNERS
vendored
@ -12,6 +12,8 @@
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/sample @22quinn @houseroad
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
@ -26,13 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
@ -54,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
|
3
.github/mergify.yml
vendored
@ -239,6 +239,7 @@ pull_request_rules:
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^tests/tool_use/
|
||||
- files~=^tests/mistral_tool_use/
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
@ -273,7 +274,7 @@ pull_request_rules:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
||||
- files~=^tests/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
users:
|
||||
|
@ -13,7 +13,6 @@ build:
|
||||
|
||||
mkdocs:
|
||||
configuration: mkdocs.yaml
|
||||
fail_on_warning: true
|
||||
|
||||
# Optionally declare the Python requirements required to build your docs
|
||||
python:
|
||||
|
@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||
|
||||
#
|
||||
# Supported/expected torch versions for CUDA/ROCm.
|
||||
@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
|
||||
# Supported NVIDIA architectures.
|
||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
||||
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
|
||||
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
else()
|
||||
@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Set compression mode for CUDA >=13.x.
|
||||
#
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
|
||||
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
||||
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Set CUDA include flags for CXX compiler.
|
||||
#
|
||||
@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||
|
||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
||||
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
@ -317,6 +305,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp"
|
||||
"csrc/quantization/fp8/per_token_group_quant.cu")
|
||||
@ -451,11 +440,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
||||
# CUDA 12.8 or later
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
|
||||
@ -485,11 +470,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||
# require CUDA 12.8 or later
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||
@ -569,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
||||
# CUDA 12.8 or later
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
@ -592,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
# FP4 Archs and flags
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
@ -618,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
# CUTLASS MLA Archs and flags
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
||||
@ -666,11 +635,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
@ -691,11 +656,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
@ -714,11 +675,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
|
@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||
|
@ -1,31 +1,17 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
import numpy as np
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
DeviceConfig,
|
||||
LoadConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
|
||||
def benchmark_propose(args):
|
||||
def main(args):
|
||||
rows = []
|
||||
for max_ngram in args.max_ngram:
|
||||
collector = TimeCollector(TimeCollector.US)
|
||||
@ -83,88 +69,10 @@ def benchmark_propose(args):
|
||||
)
|
||||
|
||||
|
||||
def benchmark_batched_propose(args):
|
||||
NUM_SPECULATIVE_TOKENS_NGRAM = 10
|
||||
PROMPT_LOOKUP_MIN = 5
|
||||
PROMPT_LOOKUP_MAX = 15
|
||||
MAX_MODEL_LEN = int(1e7)
|
||||
DEVICE = current_platform.device_type
|
||||
|
||||
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
|
||||
|
||||
speculative_config = SpeculativeConfig(
|
||||
target_model_config=model_config,
|
||||
target_parallel_config=ParallelConfig(),
|
||||
method="ngram",
|
||||
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
|
||||
prompt_lookup_max=PROMPT_LOOKUP_MAX,
|
||||
prompt_lookup_min=PROMPT_LOOKUP_MIN,
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(),
|
||||
speculative_config=speculative_config,
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
parallel_config=ParallelConfig(),
|
||||
load_config=LoadConfig(),
|
||||
scheduler_config=SchedulerConfig(),
|
||||
)
|
||||
|
||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
||||
mock_pp_group = mock.MagicMock()
|
||||
mock_pp_group.world_size = 1
|
||||
with mock.patch(
|
||||
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
|
||||
):
|
||||
runner = GPUModelRunner(vllm_config, DEVICE)
|
||||
|
||||
# hack max model len
|
||||
runner.max_model_len = MAX_MODEL_LEN
|
||||
runner.drafter.max_model_len = MAX_MODEL_LEN
|
||||
|
||||
dummy_input_batch = InputBatch(
|
||||
max_num_reqs=args.num_req,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_batched_tokens=args.num_req * args.num_token,
|
||||
device=DEVICE,
|
||||
pin_memory=False,
|
||||
vocab_size=256000,
|
||||
block_sizes=[16],
|
||||
)
|
||||
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
||||
dummy_input_batch.spec_decode_unsupported_reqs = ()
|
||||
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
||||
dummy_input_batch.token_ids_cpu = np.random.randint(
|
||||
0, 20, (args.num_req, args.num_token)
|
||||
)
|
||||
|
||||
runner.input_batch = dummy_input_batch
|
||||
|
||||
sampled_token_ids = [[0]] * args.num_req
|
||||
|
||||
print("Starting benchmark")
|
||||
# first run is warmup so ignore it
|
||||
for _ in range(args.num_iteration):
|
||||
start = time.time()
|
||||
runner.drafter.propose(
|
||||
sampled_token_ids,
|
||||
dummy_input_batch.req_ids,
|
||||
dummy_input_batch.num_tokens_no_spec,
|
||||
dummy_input_batch.token_ids_cpu,
|
||||
dummy_input_batch.spec_decode_unsupported_reqs,
|
||||
)
|
||||
end = time.time()
|
||||
print(f"Iteration time (s): {end - start}")
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batched", action="store_true", help="consider time to prepare batch"
|
||||
) # noqa: E501
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
@ -197,17 +105,8 @@ def invoke_main() -> None:
|
||||
help="Number of speculative tokens to generate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.batched:
|
||||
benchmark_propose(args)
|
||||
else:
|
||||
benchmark_batched_propose(args)
|
||||
main(args)
|
||||
|
||||
|
||||
"""
|
||||
# Example command lines:
|
||||
# time python3 benchmarks/benchmark_ngram_proposer.py
|
||||
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
|
||||
""" # noqa: E501
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
|
@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
w8a8_block_fp8_matmul,
|
||||
w8a8_triton_block_scaled_mm,
|
||||
)
|
||||
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||
|
||||
@ -158,7 +158,7 @@ def bench_fp8(
|
||||
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
||||
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
||||
),
|
||||
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
|
||||
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
|
||||
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
||||
),
|
||||
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
||||
|
@ -3,7 +3,6 @@
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
import os
|
||||
|
||||
import torch
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
|
||||
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
|
||||
}
|
||||
|
||||
_needs_fbgemm = any(
|
||||
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
|
||||
)
|
||||
if _needs_fbgemm:
|
||||
try:
|
||||
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
|
||||
triton_scale_nvfp4_quant,
|
||||
)
|
||||
except ImportError:
|
||||
print(
|
||||
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
|
||||
"These providers will be skipped. Please install fbgemm_gpu with: "
|
||||
"'pip install fbgemm-gpu-genai' to run them."
|
||||
)
|
||||
# Disable FBGEMM providers so the benchmark can run.
|
||||
for cfg in PROVIDER_CFGS.values():
|
||||
if cfg.get("fbgemm"):
|
||||
cfg["enabled"] = False
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
|
||||
# Compute global scale for weight
|
||||
b_amax = torch.abs(b).max().to(torch.float32)
|
||||
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
|
||||
else:
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
return b_fp4, scale_b_fp4, b_global_scale
|
||||
|
||||
|
||||
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
|
||||
|
||||
# Compute global scale for activation
|
||||
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
||||
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
|
||||
# Alpha for the GEMM operation
|
||||
alpha = 1.0 / (a_global_scale * b_global_scale)
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
if cfg["no_a_quant"]:
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
|
||||
def run():
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
else:
|
||||
|
||||
def run():
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
# Pre-quantize activation
|
||||
@ -184,13 +130,10 @@ if __name__ == "__main__":
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
||||
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=save_dir,
|
||||
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
|
||||
|
||||
|
||||
def make_rand_tensors(
|
||||
a_shape: tuple[int, ...],
|
||||
b_shape: tuple[int, ...],
|
||||
c_shape: tuple[int, ...],
|
||||
a_shape: tuple[int],
|
||||
b_shape: tuple[int],
|
||||
c_shape: tuple[int],
|
||||
a_dtype: torch.dtype,
|
||||
b_dtype: torch.dtype,
|
||||
c_dtype: torch.dtype,
|
||||
@ -243,7 +243,7 @@ class OpType(Enum):
|
||||
lora_rank: int,
|
||||
num_loras: int,
|
||||
num_slices: int,
|
||||
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
||||
) -> tuple[tuple[int], tuple[int], tuple[int]]:
|
||||
"""
|
||||
Given num_slices, return the shapes of the A, B, and C matrices
|
||||
in A x B = C, for the op_type
|
||||
|
@ -584,9 +584,8 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
elif config.architectures[0] in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"DeepseekV2ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
|
@ -8,16 +8,12 @@ import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_token_group_quant_fp8,
|
||||
w8a8_block_fp8_matmul,
|
||||
w8a8_triton_block_scaled_mm,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
calc_diff,
|
||||
fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8,
|
||||
)
|
||||
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
|
||||
|
||||
|
||||
def benchmark_shape(m: int,
|
||||
@ -63,7 +59,7 @@ def benchmark_shape(m: int,
|
||||
|
||||
# === vLLM Triton Implementation ===
|
||||
def vllm_triton_gemm():
|
||||
return w8a8_block_fp8_matmul(A_vllm,
|
||||
return w8a8_triton_block_scaled_mm(A_vllm,
|
||||
B_vllm,
|
||||
A_scale_vllm,
|
||||
B_scale_vllm,
|
||||
|
@ -101,7 +101,6 @@ else()
|
||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
@ -178,14 +177,8 @@ elseif (S390_FOUND)
|
||||
"-mzvector"
|
||||
"-march=native"
|
||||
"-mtune=native")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||
if(RVV_FOUND)
|
||||
message(FAIL_ERROR "Can't support rvv now.")
|
||||
else()
|
||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
|
||||
endif()
|
||||
|
||||
#
|
||||
|
@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
||||
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
|
||||
GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
@ -33,64 +33,23 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
|
||||
set(SUPPORT_ARCHS)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
|
||||
list(APPEND SUPPORT_ARCHS 9.0a)
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
|
||||
list(APPEND SUPPORT_ARCHS 10.0a)
|
||||
endif()
|
||||
|
||||
|
||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
|
||||
if(FLASH_MLA_ARCHS)
|
||||
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
||||
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
|
||||
|
||||
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
|
||||
)
|
||||
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
)
|
||||
${flashmla_SOURCE_DIR}/csrc)
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${FlashMLA_SOURCES}"
|
||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${FlashMLA_Extension_SOURCES}"
|
||||
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
|
||||
|
||||
define_gpu_extension_target(
|
||||
_flashmla_C
|
||||
DESTINATION vllm
|
||||
@ -101,32 +60,8 @@ if(FLASH_MLA_ARCHS)
|
||||
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
target_compile_options(_flashmla_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
|
||||
define_gpu_extension_target(
|
||||
_flashmla_extension_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${FlashMLA_Extension_SOURCES}
|
||||
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
target_compile_options(_flashmla_extension_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
else()
|
||||
# Create empty targets for setup.py when not targeting sm90a systems
|
||||
# Create an empty target for setup.py when not targeting sm90a systems
|
||||
add_custom_target(_flashmla_C)
|
||||
add_custom_target(_flashmla_extension_C)
|
||||
endif()
|
||||
|
||||
|
@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
|
||||
list(REMOVE_DUPLICATES _PTX_ARCHS)
|
||||
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
|
||||
|
||||
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
||||
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
|
||||
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
|
||||
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
|
||||
set(_CUDA_ARCHS)
|
||||
foreach(_arch ${_SRC_CUDA_ARCHS})
|
||||
if(_arch MATCHES "[af]$")
|
||||
if(_arch MATCHES "\\a$")
|
||||
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
|
||||
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
|
||||
string(REPLACE "a" "" _base "${_arch}")
|
||||
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
|
||||
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
|
||||
list(APPEND _CUDA_ARCHS "${_arch}")
|
||||
|
@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
auto local_split_kv = params.split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
load_page_table(
|
||||
blk_coord,
|
||||
problem_shape,
|
||||
params.mainloop,
|
||||
shared_storage.tensors,
|
||||
pipeline_page_table, pipeline_pt_producer_state,
|
||||
local_split_kv
|
||||
local_split_kv
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
CUTLASS_PRAGMA_NO_UNROLL
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
load_cpasync(
|
||||
blk_coord,
|
||||
@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
params.mainloop_params,
|
||||
shared_storage.tensors,
|
||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||
local_split_kv,
|
||||
local_split_kv,
|
||||
/* must be shared pipe */
|
||||
pipeline_page_table, pipeline_pt_consumer_state
|
||||
);
|
||||
@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
CUTLASS_PRAGMA_NO_UNROLL
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
load_tma</* paged= */ true>(
|
||||
blk_coord,
|
||||
@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
shared_storage.tensors,
|
||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||
local_split_kv
|
||||
local_split_kv
|
||||
);
|
||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||
}
|
||||
@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
CUTLASS_PRAGMA_NO_UNROLL
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
load_tma<false>(
|
||||
blk_coord,
|
||||
@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
shared_storage.tensors,
|
||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||
pipeline_load_qk, pipeline_load_qk_producer_state,
|
||||
local_split_kv
|
||||
local_split_kv
|
||||
);
|
||||
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
|
||||
}
|
||||
@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto local_split_kv = params.split_kv;
|
||||
auto local_split_kv = params.split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
mma(blk_coord,
|
||||
problem_shape,
|
||||
@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
pipeline_mma_s, pipeline_mma_s_producer_state,
|
||||
pipeline_p_mma, pipeline_p_mma_consumer_state,
|
||||
pipeline_mma_o, pipeline_mma_o_producer_state,
|
||||
local_split_kv
|
||||
local_split_kv
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
|
||||
auto blk_coord = tile_scheduler.get_block_coord();
|
||||
auto problem_shape = params.problem_shape;
|
||||
auto split_kv = params.split_kv;
|
||||
auto local_split_kv = split_kv;
|
||||
auto split_kv = params.split_kv;
|
||||
auto local_split_kv = split_kv;
|
||||
if (params.mainloop.ptr_seq != nullptr) {
|
||||
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
if (params.ptr_split_kv != nullptr) {
|
||||
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
|
||||
}
|
||||
}
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
if (local_split_kv <= get<3>(blk_coord))
|
||||
continue;
|
||||
compute(
|
||||
blk_coord,
|
||||
@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
pipeline_mma_s, pipeline_mma_s_consumer_state,
|
||||
pipeline_p_mma, pipeline_p_mma_producer_state,
|
||||
pipeline_mma_o, pipeline_mma_o_consumer_state,
|
||||
local_split_kv
|
||||
local_split_kv
|
||||
);
|
||||
}
|
||||
|
||||
@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
|
||||
cutlass::arch::NamedBarrier(
|
||||
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
|
||||
kNamedBarrierEpilogue
|
||||
).arrive_and_wait();
|
||||
).arrive();
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -56,11 +56,3 @@ void cp_gather_cache(
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
// Indexer K quantization and cache function
|
||||
void indexer_k_quant_and_cache(
|
||||
torch::Tensor& k, // [num_tokens, head_dim]
|
||||
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
int64_t quant_block_size, // quantization block size
|
||||
const std::string& scale_fmt);
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cfloat> // FLT_MIN
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
@ -397,180 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
|
||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void concat_and_cache_ds_mla_kernel(
|
||||
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
|
||||
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
|
||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
|
||||
// + pe_dim)]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int block_stride, //
|
||||
const int entry_stride, //
|
||||
const int kv_c_stride, //
|
||||
const int k_pe_stride, //
|
||||
const int kv_lora_rank, //
|
||||
const int pe_dim, //
|
||||
const int block_size, //
|
||||
const float* scale //
|
||||
) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
if (slot_idx < 0) {
|
||||
return;
|
||||
}
|
||||
const int64_t block_idx = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
const int64_t dst_idx_start =
|
||||
block_idx * block_stride + block_offset * entry_stride;
|
||||
|
||||
// Create 4 tile scales in shared memory
|
||||
__shared__ float smem[20];
|
||||
float* shard_abs_max = smem;
|
||||
float* tile_scales = smem + 16;
|
||||
|
||||
// For the NoPE part, each tile of 128 elements is handled by 4 warps
|
||||
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
|
||||
// The first thread of the first warp in each tile writes the scale
|
||||
// value for the tile. The RoPE part (last 64 elements) is handled
|
||||
// by another 2 warps (64 threads).
|
||||
// So in total, we use 18 warps (576 threads) per block.
|
||||
|
||||
// Cast kv_cache to 16_bit for RoPE values
|
||||
scalar_t* kv_cache_16bit =
|
||||
reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]);
|
||||
|
||||
// The last 64 threads handle the RoPE part
|
||||
if (threadIdx.x >= kv_lora_rank) {
|
||||
const int8_t pe_idx = threadIdx.x - kv_lora_rank;
|
||||
const int64_t src_idx = token_idx * k_pe_stride + pe_idx;
|
||||
// RoPE values start after the packed 8-bit NoPE values and the
|
||||
// 32-bit scales
|
||||
const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx;
|
||||
kv_cache_16bit[dst_idx] = k_pe[src_idx];
|
||||
return;
|
||||
}
|
||||
|
||||
// Determine the scale for each chunk of NoPE
|
||||
const int16_t tile_idx = threadIdx.x >> 7;
|
||||
const int16_t warp_idx = (threadIdx.x & 127) >> 5;
|
||||
const int16_t lane_idx = threadIdx.x & 31;
|
||||
|
||||
// Load the NoPE element for this thread into registers
|
||||
const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x;
|
||||
const scalar_t src_val = kv_c[src_idx];
|
||||
|
||||
// Warp-level reduction to find the max absolute value in the warp
|
||||
float max_abs = fabsf(src_val);
|
||||
#pragma unroll
|
||||
for (int offset = 16; offset > 0; offset /= 2) {
|
||||
#ifdef USE_ROCM
|
||||
max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset));
|
||||
#else
|
||||
max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset));
|
||||
#endif
|
||||
}
|
||||
|
||||
// The first lane of each warp in each tile writes the max_abs of this part
|
||||
// of the tile to shared memory
|
||||
if (lane_idx == 0) {
|
||||
shard_abs_max[tile_idx * 4 + warp_idx] = max_abs;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// The first lane of the first warp in each tile computes the scale for the
|
||||
// tile and writes it to shared memory and to kv_cache
|
||||
if (warp_idx == 0 && lane_idx == 0) {
|
||||
float4 shard_abs_max_vec =
|
||||
reinterpret_cast<float4*>(shard_abs_max)[tile_idx];
|
||||
float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y),
|
||||
fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) /
|
||||
448.f;
|
||||
|
||||
// Avoid division by zero in `scaled_convert`
|
||||
tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN);
|
||||
float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]);
|
||||
const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx;
|
||||
kv_cache_32bit[dst_idx] = tile_scales[tile_idx];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Now all threads in the block scale and write their element
|
||||
const float scale_val = tile_scales[tile_idx];
|
||||
const int64_t dst_idx = dst_idx_start + threadIdx.x;
|
||||
kv_cache[dst_idx] =
|
||||
fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>(
|
||||
src_val, scale_val);
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void indexer_k_quant_and_cache_kernel(
|
||||
const scalar_t* __restrict__ k, // [num_tokens, head_dim]
|
||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, cache_stride]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int head_dim, // dimension of each head
|
||||
const int quant_block_size, // quantization block size
|
||||
const int cache_block_size, // cache block size
|
||||
const int cache_stride, // stride for each token in kv_cache
|
||||
const bool use_ue8m0 // use ue8m0 scale format
|
||||
) {
|
||||
constexpr int VEC_SIZE = 4;
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x +
|
||||
threadIdx.y * blockDim.x + threadIdx.x) *
|
||||
VEC_SIZE;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
const int64_t block_idx = slot_idx / cache_block_size;
|
||||
const int64_t block_offset = slot_idx % cache_block_size;
|
||||
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
if (slot_idx < 0 || (head_dim_idx >= head_dim)) {
|
||||
return;
|
||||
}
|
||||
|
||||
float2 k_val = (reinterpret_cast<const float2*>(
|
||||
k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE];
|
||||
scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val);
|
||||
float amax = 0.0f;
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
// Reduced amax
|
||||
for (int mask = 16; mask > 0; mask /= 2) {
|
||||
#ifdef USE_ROCM
|
||||
amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask));
|
||||
#else
|
||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||
#endif
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||
if (use_ue8m0) {
|
||||
scale = exp2f(ceilf(log2f(scale)));
|
||||
}
|
||||
|
||||
const int64_t dst_offset = block_idx * cache_block_size * cache_stride +
|
||||
block_offset * head_dim + head_dim_idx;
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
kv_cache[dst_offset + i] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale);
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
const int64_t dst_scale_idx =
|
||||
block_idx * cache_block_size * cache_stride +
|
||||
cache_block_size * head_dim +
|
||||
(block_offset * head_dim + head_dim_idx) * 4 / quant_block_size;
|
||||
reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
@ -613,7 +438,7 @@ void reshape_and_cache(
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
|
||||
CALL_RESHAPE_AND_CACHE);
|
||||
CALL_RESHAPE_AND_CACHE)
|
||||
}
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
@ -684,18 +509,6 @@ void reshape_and_cache_flash(
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
|
||||
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
void concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
||||
@ -718,44 +531,20 @@ void concat_and_cache_mla(
|
||||
int pe_dim = k_pe.size(1);
|
||||
int block_size = kv_cache.size(1);
|
||||
|
||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
||||
TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla");
|
||||
TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla");
|
||||
TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(),
|
||||
"kv_cache.size(2) must be 656 bytes for fp8_ds_mla");
|
||||
TORCH_CHECK(kv_c.itemsize() == 2,
|
||||
"kv_c.itemsize() must be 2 for fp8_ds_mla");
|
||||
TORCH_CHECK(k_pe.itemsize() == 2,
|
||||
"k_pe.itemsize() must be 2 for fp8_ds_mla");
|
||||
} else {
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
}
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
|
||||
int kv_c_stride = kv_c.stride(0);
|
||||
int k_pe_stride = k_pe.stride(0);
|
||||
int block_stride = kv_cache.stride(0);
|
||||
int entry_stride = kv_cache.stride(1);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (kv_cache_dtype == "fp8_ds_mla") {
|
||||
dim3 grid(num_tokens);
|
||||
// For the NoPE part, each tile of 128 elements is handled by 4 warps
|
||||
// (128 threads). There are 4 total tiles, so 16 warps (512 threads).
|
||||
// The first thread of the first warp in each tile writes the scale
|
||||
// value for the tile. The RoPE part (last 64 elements) is handled
|
||||
// by another 2 warps (64 threads).
|
||||
// So in total, we use 18 warps (576 threads) per block.
|
||||
dim3 block(576);
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_DS_MLA);
|
||||
} else {
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
@ -1133,42 +922,3 @@ void cp_gather_cache(
|
||||
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
|
||||
}
|
||||
}
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
|
||||
#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(k.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size, \
|
||||
cache_block_size, cache_stride, use_ue8m0);
|
||||
|
||||
void indexer_k_quant_and_cache(
|
||||
torch::Tensor& k, // [num_tokens, head_dim]
|
||||
torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
int64_t quant_block_size, // quantization block size
|
||||
const std::string& scale_fmt) {
|
||||
int num_tokens = k.size(0);
|
||||
int head_dim = k.size(1);
|
||||
int cache_block_size = kv_cache.size(1);
|
||||
int cache_stride = kv_cache.size(2);
|
||||
bool use_ue8m0 = scale_fmt == "ue8m0";
|
||||
|
||||
TORCH_CHECK(k.device() == kv_cache.device(),
|
||||
"k and kv_cache must be on the same device");
|
||||
TORCH_CHECK(k.device() == slot_mapping.device(),
|
||||
"k and slot_mapping must be on the same device");
|
||||
TORCH_CHECK(head_dim % quant_block_size == 0,
|
||||
"head_dim must be divisible by quant_block_size");
|
||||
|
||||
constexpr int vec_size = 4;
|
||||
dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) /
|
||||
(quant_block_size * vec_size));
|
||||
dim3 block(32, vec_size);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
|
||||
CALL_INDEXER_K_QUANT_AND_CACHE);
|
||||
}
|
||||
|
@ -1,16 +0,0 @@
|
||||
#pragma once
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cctype>
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// vllm_kernel_override_batch_invariant(); returns true
|
||||
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
|
||||
inline bool vllm_kernel_override_batch_invariant() {
|
||||
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
|
||||
const char* val = std::getenv(env_key.c_str());
|
||||
return (val && std::atoi(val) != 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
@ -14,8 +14,7 @@
|
||||
// arm implementation
|
||||
#include "cpu_types_arm.hpp"
|
||||
#else
|
||||
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
|
||||
#include "cpu_types_scalar.hpp"
|
||||
#warning "unsupported vLLM cpu implementation"
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
|
@ -1,513 +0,0 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <torch/all.h>
|
||||
#include "float_convert.hpp"
|
||||
|
||||
namespace vec_op {
|
||||
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#ifndef CPU_OP_GUARD
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#else
|
||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||
std::cout << #NAME << " invoked." << std::endl;
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME) \
|
||||
std::cout << #NAME << " exit." << std::endl;
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
#define __max(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define __min(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define __abs(a) ((a) < (0) ? (0 - a) : (a))
|
||||
|
||||
typedef struct f16x8_t {
|
||||
uint16_t val[8];
|
||||
} f16x8_t;
|
||||
|
||||
typedef struct f16x16_t {
|
||||
uint16_t val[16];
|
||||
} f16x16_t;
|
||||
|
||||
typedef struct f16x32_t {
|
||||
uint16_t val[32];
|
||||
} f16x32_t;
|
||||
|
||||
typedef struct f32x4_t {
|
||||
float val[4];
|
||||
} f32x4_t;
|
||||
|
||||
typedef struct f32x8_t {
|
||||
float val[8];
|
||||
} f32x8_t;
|
||||
|
||||
typedef struct f32x16_t {
|
||||
float val[16];
|
||||
} f32x16_t;
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
};
|
||||
}; // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T> > >
|
||||
constexpr void unroll_loop(F&& f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit FP16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit BF16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
f16x32_t reg;
|
||||
|
||||
explicit BF16Vec32(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x32_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec32(f16x32_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec32(BF16Vec8& vec8_data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
|
||||
f32x4_t reg;
|
||||
|
||||
explicit FP32Vec4(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x4_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec4(f32x4_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
f32x8_t reg;
|
||||
|
||||
explicit FP32Vec8(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec8(f32x8_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
|
||||
|
||||
explicit FP32Vec8(const FP16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = fp16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec8(const BF16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = bf16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
float result = 0;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result += reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = expf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = tanhf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = erf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] * b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator+(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] + b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator-(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] - b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator/(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] / b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f32x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f32x16_t reg;
|
||||
|
||||
explicit FP32Vec16(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x16_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec16(f32x16_t data) : reg(data) {};
|
||||
|
||||
FP32Vec16(const FP32Vec4& data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec16(const FP32Vec8& data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
|
||||
|
||||
explicit FP32Vec16(const FP16Vec16& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = fp16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec16& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = bf16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
|
||||
|
||||
FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
|
||||
|
||||
FP32Vec16 operator*(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] * b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator+(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] + b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator-(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] - b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] / b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 max(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 min(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 abs() const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __abs(reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
float result = 0.0f;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result += reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_max() const {
|
||||
float result = reg.val[0];
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result = __max(reg.val[i], result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_min() const {
|
||||
float result = reg.val[0];
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result = __min(reg.val[i], result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <int group_size>
|
||||
float reduce_sub_sum(int idx) {
|
||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||
float sum = 0.0;
|
||||
int start = idx * group_size;
|
||||
int end = (idx + 1) * group_size;
|
||||
|
||||
for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
|
||||
sum += reg.val[start];
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VecType {
|
||||
using vec_type = void;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using vec_t = typename VecType<T>::vec_type;
|
||||
|
||||
template <>
|
||||
struct VecType<float> {
|
||||
using vec_type = FP32Vec8;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecType<c10::Half> {
|
||||
using vec_type = FP16Vec8;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecType<c10::BFloat16> {
|
||||
using vec_type = BF16Vec8;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void storeFP32(float v, T* ptr) {
|
||||
*ptr = v;
|
||||
}
|
||||
|
||||
/*
|
||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
||||
c10::Half __attribute__((__may_alias__)) *v_ptr =
|
||||
reinterpret_cast<c10::Half *>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
|
||||
*/
|
||||
|
||||
template <>
|
||||
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
|
||||
uint16_t fp16 = float_to_fp16(v);
|
||||
*reinterpret_cast<uint16_t*>(ptr) = fp16;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||
c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
|
||||
reinterpret_cast<c10::BFloat16*>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
|
||||
|
||||
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
|
||||
acc = acc + a * b;
|
||||
}
|
||||
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
|
||||
|
||||
}; // namespace vec_op
|
@ -1,106 +0,0 @@
|
||||
|
||||
static float bf16_to_float(uint16_t bf16) {
|
||||
uint32_t bits = static_cast<uint32_t>(bf16) << 16;
|
||||
float fp32;
|
||||
std::memcpy(&fp32, &bits, sizeof(fp32));
|
||||
return fp32;
|
||||
}
|
||||
|
||||
static uint16_t float_to_bf16(float fp32) {
|
||||
uint32_t bits;
|
||||
std::memcpy(&bits, &fp32, sizeof(fp32));
|
||||
return static_cast<uint16_t>(bits >> 16);
|
||||
}
|
||||
|
||||
/************************************************
|
||||
* Copyright (c) 2015 Princeton Vision Group
|
||||
* Licensed under the MIT license.
|
||||
* Codes below copied from
|
||||
* https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
|
||||
*************************************************/
|
||||
static uint16_t float_to_fp16(float fp32) {
|
||||
uint16_t fp16;
|
||||
|
||||
unsigned x;
|
||||
unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
|
||||
unsigned sign, exponent, mantissa;
|
||||
|
||||
std::memcpy(&x, &fp32, sizeof(fp32));
|
||||
u = (x & 0x7fffffff);
|
||||
|
||||
// Get rid of +NaN/-NaN case first.
|
||||
if (u > 0x7f800000) {
|
||||
fp16 = 0x7fffU;
|
||||
return fp16;
|
||||
}
|
||||
|
||||
sign = ((x >> 16) & 0x8000);
|
||||
|
||||
// Get rid of +Inf/-Inf, +0/-0.
|
||||
if (u > 0x477fefff) {
|
||||
fp16 = sign | 0x7c00U;
|
||||
return fp16;
|
||||
}
|
||||
if (u < 0x33000001) {
|
||||
fp16 = (sign | 0x0000);
|
||||
return fp16;
|
||||
}
|
||||
|
||||
exponent = ((u >> 23) & 0xff);
|
||||
mantissa = (u & 0x7fffff);
|
||||
|
||||
if (exponent > 0x70) {
|
||||
shift = 13;
|
||||
exponent -= 0x70;
|
||||
} else {
|
||||
shift = 0x7e - exponent;
|
||||
exponent = 0;
|
||||
mantissa |= 0x800000;
|
||||
}
|
||||
lsb = (1 << shift);
|
||||
lsb_s1 = (lsb >> 1);
|
||||
lsb_m1 = (lsb - 1);
|
||||
|
||||
// Round to nearest even.
|
||||
remainder = (mantissa & lsb_m1);
|
||||
mantissa >>= shift;
|
||||
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
|
||||
++mantissa;
|
||||
if (!(mantissa & 0x3ff)) {
|
||||
++exponent;
|
||||
mantissa = 0;
|
||||
}
|
||||
}
|
||||
|
||||
fp16 = (sign | (exponent << 10) | mantissa);
|
||||
|
||||
return fp16;
|
||||
}
|
||||
|
||||
static float fp16_to_float(uint16_t fp16) {
|
||||
unsigned sign = ((fp16 >> 15) & 1);
|
||||
unsigned exponent = ((fp16 >> 10) & 0x1f);
|
||||
unsigned mantissa = ((fp16 & 0x3ff) << 13);
|
||||
int temp;
|
||||
float fp32;
|
||||
if (exponent == 0x1f) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
|
||||
exponent = 0xff;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71;
|
||||
do {
|
||||
msb = (mantissa & 0x400000);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
|
||||
}
|
||||
} else {
|
||||
exponent += 0x70;
|
||||
}
|
||||
temp = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
std::memcpy(&fp32, &temp, sizeof(temp));
|
||||
return fp32;
|
||||
}
|
@ -8,37 +8,11 @@
|
||||
#define VLLM_LAUNCH_BLOCKS_CAP 4
|
||||
#endif
|
||||
|
||||
// Compile-time estimate of max threads per SM for launch bounds.
|
||||
// Families: 1024, 1536, 2048 threads/SM.
|
||||
// compile-time estimate of max threads per SM for launch bounds.
|
||||
#ifndef VLLM_MAX_THREADS_PER_SM
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
/* 1024 thr/SM: Turing (sm_75) */
|
||||
#if (__CUDA_ARCH__ == 750)
|
||||
#define VLLM_MAX_THREADS_PER_SM 1024
|
||||
|
||||
/* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
|
||||
GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
|
||||
#elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
|
||||
(__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
|
||||
(__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
|
||||
(__CUDA_ARCH__ == 1210)
|
||||
#define VLLM_MAX_THREADS_PER_SM 1536
|
||||
|
||||
/* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
|
||||
Hopper (sm_90), Blackwell (sm_100/103) */
|
||||
#elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
|
||||
(__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
|
||||
(__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
|
||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
||||
|
||||
/* Fallback: use 2048 for unknown future CCs */
|
||||
#else
|
||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
#define VLLM_MAX_THREADS_PER_SM 1536
|
||||
#else
|
||||
/* Host pass (no __CUDA_ARCH__): neutral default */
|
||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
||||
#endif
|
||||
#endif
|
||||
|
@ -1,7 +1,6 @@
|
||||
#include "type_convert.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -414,9 +413,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
wt_ptr % req_alignment_bytes == 0;
|
||||
bool offsets_are_multiple_of_vector_width =
|
||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||
!batch_invariant_launch) {
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
@ -462,8 +459,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_POLY_NORM(0);
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include "quantization/fp8/common.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -241,9 +240,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||
bool ptrs_are_aligned =
|
||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||
!batch_invariant_launch) {
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
|
@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int expert_id = topk_ids[i];
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
int warp_idx = expert_id / experts_per_warp;
|
||||
int expert_offset = expert_id % experts_per_warp;
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
||||
@ -98,15 +95,12 @@ template <typename scalar_t>
|
||||
__global__ void count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
size_t numel, int32_t num_experts) {
|
||||
size_t numel) {
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
if (expert_id >= num_experts) {
|
||||
continue;
|
||||
}
|
||||
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
}
|
||||
@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
|
||||
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -21,7 +21,6 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
|
@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
||||
} else {
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
}
|
||||
@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
||||
} else {
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
}
|
||||
@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
||||
} else {
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
|
||||
Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
|
||||
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
}
|
||||
@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
|
||||
// TMA epilogue isn't compatible with Swap A/B
|
||||
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
|
||||
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
|
||||
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
|
||||
out, a, b, a_scales, b_scales);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
@ -25,10 +25,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||
if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
|
||||
int8_func(c, a, b, a_scales, b_scales, bias);
|
||||
} else {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
TORCH_CHECK(
|
||||
false, "Int8 not supported on SM", version_num,
|
||||
". Use FP8 quantization instead, or run on older arch (SM < 100).");
|
||||
TORCH_CHECK(false, "Int8 not supported for this architecture");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
@ -67,9 +67,8 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
||||
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
|
||||
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
|
||||
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
||||
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
|
||||
void get_cutlass_moe_mm_data_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
|
@ -576,17 +576,6 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
|
||||
TORCH_CHECK(false, \
|
||||
"Unsupported input type of kv cache: ", SRC_DTYPE); \
|
||||
} \
|
||||
} else if (KV_DTYPE == "fp8_ds_mla") { \
|
||||
if (SRC_DTYPE == at::ScalarType::Float) { \
|
||||
FN(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
|
||||
} else if (SRC_DTYPE == at::ScalarType::Half) { \
|
||||
FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
|
||||
} else if (SRC_DTYPE == at::ScalarType::BFloat16) { \
|
||||
FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3); \
|
||||
} else { \
|
||||
TORCH_CHECK(false, \
|
||||
"Unsupported input type of kv cache: ", SRC_DTYPE); \
|
||||
} \
|
||||
} else { \
|
||||
TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \
|
||||
} \
|
||||
|
@ -40,8 +40,7 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
|
||||
#define __HIP__FP8MFMA__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
|
||||
defined(__gfx1150__) || defined(__gfx1151__))
|
||||
#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
|
||||
#define __HIP__GFX11__
|
||||
#endif
|
||||
|
||||
|
@ -6,11 +6,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
const int64_t rows_per_block);
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const c10::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount);
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const at::Tensor& scale_a, const at::Tensor& scale_b,
|
||||
const int64_t CuCount);
|
||||
|
||||
|
@ -1271,7 +1271,7 @@ int mindiv(int N, int div1, int div2) {
|
||||
}
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const c10::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount) {
|
||||
auto M_in = in_a.size(0);
|
||||
auto K_in = in_a.size(1);
|
||||
@ -1729,7 +1729,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const at::Tensor& scale_a, const at::Tensor& scale_b,
|
||||
const int64_t CuCount) {
|
||||
static c10::ScalarType kFp8Type = is_fp8_ocp()
|
||||
|
@ -713,13 +713,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
"cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
|
||||
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
|
||||
|
||||
cache_ops.def(
|
||||
"indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor "
|
||||
"slot_mapping, "
|
||||
"int quant_block_size, str kv_cache_dtype) -> ()");
|
||||
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
||||
&indexer_k_quant_and_cache);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
||||
|
@ -13,8 +13,8 @@ ARG PYTHON_VERSION=3.12
|
||||
# private registries that use a different repository naming conventions.
|
||||
#
|
||||
# Example:
|
||||
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
|
||||
@ -79,11 +79,31 @@ ARG DEADSNAKES_MIRROR_URL
|
||||
ARG DEADSNAKES_GPGKEY_URL
|
||||
ARG GET_PIP_URL
|
||||
|
||||
# Install minimal dependencies
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
||||
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
|
||||
fi ; \
|
||||
else \
|
||||
for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done ; \
|
||||
fi \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
@ -91,9 +111,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Install uv and Python
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
|
||||
RUN uv python install ${PYTHON_VERSION} --default --verbose
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -276,12 +296,32 @@ ARG GET_PIP_URL
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install minimal dependencies
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
||||
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
|
||||
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
|
||||
fi ; \
|
||||
else \
|
||||
for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done ; \
|
||||
fi \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
@ -289,9 +329,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Install uv and Python
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
|
||||
RUN uv python install ${PYTHON_VERSION} --default --verbose
|
||||
# Install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@ -351,32 +391,19 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||
git clone --depth 1 --recursive --shallow-submodules \
|
||||
--branch ${FLASHINFER_GIT_REF} \
|
||||
${FLASHINFER_GIT_REPO} flashinfer
|
||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||
else
|
||||
# CUDA 12.8+ supports 10.0a and 12.0
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||
fi
|
||||
pushd flashinfer
|
||||
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
||||
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
||||
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
||||
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
# Download pre-compiled cubins
|
||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||
else
|
||||
# CUDA 12.8+ supports 10.0a and 12.0
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||
fi
|
||||
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
||||
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
|
||||
# Build AOT kernels
|
||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||
python3 -m flashinfer.aot
|
||||
@ -416,7 +443,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
|
||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||
RUN set -eux; \
|
||||
@ -434,12 +461,6 @@ ENV CUDA_HOME=/usr/local/cuda
|
||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
||||
&& bash install_python_libraries.sh
|
||||
|
||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
||||
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
|
||||
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
|
||||
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
#################### TEST IMAGE ####################
|
||||
@ -512,7 +533,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
@ -47,7 +47,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
# Install Python dependencies
|
||||
# Install Python dependencies
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
|
||||
|
||||
######################### TEST DEPS #########################
|
||||
FROM base AS vllm-test-deps
|
||||
@ -114,10 +114,13 @@ WORKDIR /workspace/vllm
|
||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
cp requirements/test.in requirements/cpu-test.in && \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -r requirements/cpu-test.txt
|
||||
uv pip install -r requirements/cpu-test.txt
|
||||
|
||||
######################### DEV IMAGE #########################
|
||||
FROM vllm-build AS vllm-dev
|
||||
@ -130,12 +133,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
|
||||
|
||||
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
|
||||
|
||||
@ -160,12 +163,11 @@ ADD ./benchmarks/ ./benchmarks/
|
||||
ADD ./vllm/collect_env.py .
|
||||
ADD ./.buildkite/ ./.buildkite/
|
||||
|
||||
# Create symlink for vllm-workspace to maintain CI compatibility
|
||||
RUN ln -sf /workspace /vllm-workspace
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
uv pip install -e tests/vllm_test_utils
|
||||
|
||||
ENTRYPOINT ["bash"]
|
||||
|
||||
######################### RELEASE IMAGE #########################
|
||||
FROM base AS vllm-openai
|
||||
|
@ -6,7 +6,7 @@ ARG CUDA_VERSION=12.8.0
|
||||
#
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# prepare basic build environment
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||
ARG CUDA_VERSION=12.8.0
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
|
@ -15,7 +15,7 @@ FROM ${BASE_IMAGE} AS base
|
||||
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||
|
||||
@ -141,4 +141,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
@ -1,2 +1,2 @@
|
||||
search:
|
||||
exclude: true
|
||||
boost: 0.5
|
||||
|
Before Width: | Height: | Size: 127 KiB After Width: | Height: | Size: 119 KiB |
Before Width: | Height: | Size: 627 KiB |
Before Width: | Height: | Size: 350 KiB |
Before Width: | Height: | Size: 814 KiB |
Before Width: | Height: | Size: 267 KiB |
Before Width: | Height: | Size: 354 KiB |
Before Width: | Height: | Size: 781 KiB |
Before Width: | Height: | Size: 51 KiB |
Before Width: | Height: | Size: 359 KiB |
Before Width: | Height: | Size: 82 KiB |
@ -2,7 +2,6 @@
|
||||
|
||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||
|
||||
- [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
|
||||
- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
|
||||
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
|
||||
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
|
||||
|
@ -139,9 +139,9 @@ there is relatively little gain from TP. On the other hand, TP incurs significan
|
||||
overhead because of all-reduce being performed after every layer.
|
||||
|
||||
Given this, it may be advantageous to instead shard the batched input data using TP, essentially
|
||||
performing batch-level DP. This has been shown to improve the throughput and TTFT by around 10% for
|
||||
performing batch-level DP. This has been shown to improve the throughput by around 10% for
|
||||
`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations,
|
||||
batch-level DP can provide another 40% improvement compared to regular TP.
|
||||
batch-level DP can provide another 40% increase to throughput compared to regular TP.
|
||||
|
||||
Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank,
|
||||
there will be a minor increase in memory consumption and may cause OOM if you can barely fit the model already.
|
||||
@ -172,15 +172,14 @@ Batch-level DP needs to be implemented on a per-model basis,
|
||||
and enabled by setting `supports_encoder_tp_data = True` in the model class.
|
||||
Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature.
|
||||
|
||||
Known supported models (with corresponding benchmarks):
|
||||
Known supported models:
|
||||
|
||||
- dots_ocr (<gh-pr:25466>)
|
||||
- GLM-4.1V or above (<gh-pr:23168>)
|
||||
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
|
||||
- InternVL (<gh-pr:23909>)
|
||||
- Kimi-VL (<gh-pr:23817>)
|
||||
- Llama4 (<gh-pr:18368>)
|
||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
|
||||
- Qwen2.5-VL (<gh-pr:22742>)
|
||||
- Step3 (<gh-pr:22697>)
|
||||
|
||||
## Input Processing
|
||||
|
@ -823,30 +823,6 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
### Continuous Benchmarking
|
||||
|
||||
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
|
||||
|
||||
#### How It Works
|
||||
|
||||
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
|
||||
|
||||
- **Serving tests**: Measure request handling and API performance
|
||||
- **Throughput tests**: Evaluate token generation rates
|
||||
- **Latency tests**: Assess response time characteristics
|
||||
|
||||
#### Benchmark Configuration
|
||||
|
||||
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
|
||||
|
||||
1. Navigate to the appropriate GPU directory in the benchmarks configuration
|
||||
2. Add your model specifications to the corresponding configuration files
|
||||
3. The new models will be included in the next scheduled benchmark run
|
||||
|
||||
#### Viewing Results
|
||||
|
||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
[](){ #nightly-benchmarks }
|
||||
|
||||
## Nightly Benchmarks
|
||||
|
@ -66,12 +66,35 @@ Further update the model as follows:
|
||||
!!! important
|
||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
|
||||
|
||||
!!! note
|
||||
By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in
|
||||
[PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
|
||||
This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings].
|
||||
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||
|
||||
You may override this method if additional logic is required for your model when merging embeddings.
|
||||
??? code
|
||||
|
||||
```python
|
||||
from .utils import merge_multimodal_embeddings
|
||||
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
# `get_input_embeddings` should already be implemented for the language
|
||||
# model as one of the requirements of basic vLLM model implementation.
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
|
||||
if multimodal_embeddings is not None:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids=input_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
multimodal_embeddings=multimodal_embeddings,
|
||||
placeholder_token_id=self.config.image_token_index)
|
||||
|
||||
return inputs_embeds
|
||||
```
|
||||
|
||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||
|
||||
|
@ -160,22 +160,6 @@ GUI example:
|
||||
|
||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||
|
||||
## Continuous Profiling
|
||||
|
||||
There is a [GitHub CI workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-profiling.yml) in the PyTorch infrastructure repository that provides continuous profiling for different models on vLLM. This automated profiling helps track performance characteristics over time and across different model configurations.
|
||||
|
||||
### How It Works
|
||||
|
||||
The workflow currently runs weekly profiling sessions for selected models, generating detailed performance traces that can be analyzed using different tools to identify performance regressions or optimization opportunities. But, it can be triggered manually as well, using the Github Action tool.
|
||||
|
||||
### Adding New Models
|
||||
|
||||
To extend the continuous profiling to additional models, you can modify the [profiling-tests.json](https://github.com/pytorch/pytorch-integration-testing/blob/main/vllm-profiling/cuda/profiling-tests.json) configuration file in the PyTorch integration testing repository. Simply add your model specifications to this file to include them in the automated profiling runs.
|
||||
|
||||
### Viewing Profiling Results
|
||||
|
||||
The profiling traces generated by the continuous profiling workflow are publicly available on the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). Look for the **Profiling traces** table to access and download the traces for different models and runs.
|
||||
|
||||
## Profiling vLLM Python Code
|
||||
|
||||
The Python standard library includes
|
||||
@ -224,11 +208,3 @@ One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
||||
pip install snakeviz
|
||||
snakeviz expensive_function.prof
|
||||
```
|
||||
|
||||
### Analyzing Garbage Collection Costs
|
||||
|
||||
Leverage VLLM_GC_DEBUG environment variable to debug GC costs.
|
||||
|
||||
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
|
||||
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
|
||||
collected objects for each gc.collect
|
||||
|
@ -1,170 +0,0 @@
|
||||
# Hugging Face Inference Endpoints
|
||||
|
||||
## Overview
|
||||
|
||||
Models compatible with vLLM can be deployed on Hugging Face Inference Endpoints, either starting from the [Hugging Face Hub](https://huggingface.co) or directly from the [Inference Endpoints](https://endpoints.huggingface.co/) interface. This allows you to serve models in a fully managed environment with GPU acceleration, auto-scaling, and monitoring, without managing the infrastructure manually.
|
||||
|
||||
For advanced details on vLLM integration and deployment options, see [Advanced Deployment Details](#advanced-deployment-details).
|
||||
|
||||
## Deployment Methods
|
||||
|
||||
- [**Method 1: Deploy from the Catalog.**](#method-1-deploy-from-the-catalog) One-click deploy models from the Hugging Face Hub with ready-made optimized configurations.
|
||||
- [**Method 2: Guided Deployment (Transformers Models).**](#method-2-guided-deployment-transformers-models) Instantly deploy models tagged with `transformers` from the Hub UI using the **Deploy** button.
|
||||
- [**Method 3: Manual Deployment (Advanced Models).**](#method-3-manual-deployment-advanced-models) For models that either use custom code with the `transformers` tag, or don’t run with standard `transformers` but are supported by vLLM. This method requires manual configuration.
|
||||
|
||||
### Method 1: Deploy from the Catalog
|
||||
|
||||
This is the easiest way to get started with vLLM on Hugging Face Inference Endpoints. You can browse a catalog of models with verified and optimized deployment configuration at [Inference Endpoints](https://endpoints.huggingface.co/catalog) to maximize performance.
|
||||
|
||||
1. Go to [Endpoints Catalog](https://endpoints.huggingface.co/catalog) and in the **Inference Server** options, select `vLLM`.This will display the current list of models with optimized preconfigured options.
|
||||
|
||||

|
||||
|
||||
1. Select the desired model and click **Create Endpoint**.
|
||||
|
||||

|
||||
|
||||
1. Once the deployment is ready, you can use the endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console, remembering to append `/v1` as required.
|
||||
|
||||
```python
|
||||
# pip install openai
|
||||
from openai import OpenAI
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model = "HuggingFaceTB/SmolLM3-3B",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Give me a brief explanation of gravity in simple terms."
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
stream = True
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
```
|
||||
|
||||
!!! note
|
||||
The catalog provides models optimized for vLLM, including GPU settings and inference engine configurations. You can monitor the endpoint and update the **container or its configuration** from the Inference Endpoints UI.
|
||||
|
||||
### Method 2: Guided Deployment (Transformers Models)
|
||||
|
||||
This method applies to models with the `transformers` library tag in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.
|
||||
|
||||
1. Navigate to a model on [Hugging Face Hub](https://huggingface.co/models).
|
||||
For this example we will use the [`ibm-granite/granite-docling-258M`](https://huggingface.co/ibm-granite/granite-docling-258M) model. You can verify that the model is compatible by checking the front matter in the [README](https://huggingface.co/ibm-granite/granite-docling-258M/blob/main/README.md), where the library is tagged as `library: transformers`.
|
||||
|
||||
2. Locate the **Deploy** button. The button appears for models tagged with `transformers` at the top right of the [model card](https://huggingface.co/ibm-granite/granite-docling-258M).
|
||||
|
||||

|
||||
|
||||
3. Click to **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.
|
||||
|
||||

|
||||
|
||||
4. Select the Hardware (we choose AWS>GPU>T4 for the example) and Container Configuration. Choose `vLLM` as the container type and finalize the deployment pressing **Create Endpoint**.
|
||||
|
||||

|
||||
|
||||
5. Use the deployed endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console (remember to add `/v1` needed). You can then use your endpoint programmatically or via the SDK.
|
||||
|
||||
```python
|
||||
# pip install openai
|
||||
from openai import OpenAI
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model = "ibm-granite/granite-docling-258M",
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Convert this page to docling."
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
stream = True
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
```
|
||||
|
||||
!!! note
|
||||
This method uses best-guess defaults. You may need to adjust the configuration to fit your specific requirements.
|
||||
|
||||
### Method 3: Manual Deployment (Advanced Models)
|
||||
|
||||
Some models require manual deployment because they:
|
||||
|
||||
- Use custom code with the `transformers` tag
|
||||
- Don't run with standard `transformers` but are supported by `vLLM`
|
||||
|
||||
These models cannot be deployed using the **Deploy** button on the model card.
|
||||
|
||||
In this guide, we demonstrate manual deployment using the [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).
|
||||
|
||||
1. Start a new deployment. Go to [Inference Endpoints](https://endpoints.huggingface.co/) and click `New`.
|
||||
|
||||

|
||||
|
||||
2. Search the model in the Hub. In the dialog, switch to **Hub** and search for the desired model.
|
||||
|
||||

|
||||
|
||||
3. Choosing infrastructure. On the configuration page, select the cloud provider and hardware from the available options.
|
||||
For this demo, we choose AWS and L4 GPU. Adjust according to your hardware needs.
|
||||
|
||||

|
||||
|
||||
4. Configure the container. Scroll to the **Container Configuration** and select `vLLM` as the container type.
|
||||
|
||||

|
||||
|
||||
5. Create the endpoint. Click **Create Endpoint** to deploy the model.
|
||||
|
||||
Once the endpoint is ready, you can use it with the OpenAI Completion API, cURL, or other SDKs. Remember to append `/v1` to the deployment URL if needed.
|
||||
|
||||
!!! note
|
||||
You can adjust the **container settings** (Container URI, Container Arguments) from the Inference Endpoints UI and press **Update Endpoint**. This redeploys the endpoint with the updated container configuration. Changes to the model itself require creating a new endpoint or redeploying with a different model. For example, for this demo, you may need to update the Container URI to the nightly image (`vllm/vllm-openai:nightly`) and add the `--trust-remote-code` flag in the container arguments.
|
||||
|
||||
## Advanced Deployment Details
|
||||
|
||||
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
||||
|
||||
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
|
||||
|
||||
The platform integrates seamlessly with the Hugging Face Hub, allowing you to deploy any vLLM- or `transformers`-compatible model, track usage, and update the inference engine directly. The vLLM engine comes preconfigured, enabling optimized inference and easy switching between models or engines without modifying your code. This setup simplifies production deployment: endpoints are ready in minutes, include monitoring and logging, and let you focus on serving models rather than maintaining infrastructure.
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
|
||||
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
|
||||
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
|
||||
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
@ -242,8 +242,30 @@ Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kern
|
||||
|
||||
## FusedMoEPrepareAndFinalize Implementations
|
||||
|
||||
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-modular-all2all-backends) for a list of all the available modular prepare and finalize subclasses.
|
||||
The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing,
|
||||
|
||||
| Implementation | Type | Comments |
|
||||
| :--- | :--- | :--- |
|
||||
| DeepEPHTPrepareAndFinalize | Contiguous / Non-Batched | Uses the DeepEP High-Throughput all2all kernels. |
|
||||
| DeepEPLLPrepareAndFinalize | Batched | Uses the DeepEP Low-Latency all2all kernels. |
|
||||
| PplxPrepareAndFinalize | Batched | Uses the Perplexity all2all kernels. |
|
||||
| FlashInferCutlassMoEPrepareAndFinalize | Contiguous | |
|
||||
| MoEPrepareAndFinalizeNoEP | Contiguous | This implementation is used when there is no EP. i.e. no all2all kernels are invoked. |
|
||||
| BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. (Doesn’t use any all2all kernels. This is primarily used in unit testing) |
|
||||
|
||||
## FusedMoEPermuteExpertsUnpermute
|
||||
|
||||
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-experts-kernels) for a list of all the available modular experts.
|
||||
The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing,
|
||||
|
||||
| Implementation | Type | Comment |
|
||||
| :--- | :--- | :--- |
|
||||
| BatchedDeepGemmExperts | Batched | Uses the DeepGemm’s Masked Grouped Gemm kernels for the fused_moe operation. |
|
||||
| BatchedTritonExperts | Batched | Uses a Triton Kernel for the Batched matmuls. |
|
||||
| BatchedTritonOrDeepGemmExperts | Batched | Chooses either the `BatchedDeepGemmExperts` or `BatchedTritonExperts` based on environment settings. |
|
||||
| DeepGemmExperts | Contiguous / Non-Batched | Uses DeepGemm’s Grouped Gemm kernels for fused_moe operation. |
|
||||
| TritonExperts | Contiguous / Non-Batched | Uses a Triton Kernel for fused_moe matmuls. |
|
||||
| TritonOrDeepGemmExperts | Contiguous / Non-Batched | Chooses either the `DeepGemmExperts` or `TritonExperts` based on fused_moe inputs. |
|
||||
| CutlassExpertsFP8 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp8 matmuls. |
|
||||
| CutlassExpertsFP4 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp4 matmuls. |
|
||||
| FlashInferExperts | Contiguous | Uses fused_moe operation from FlashInfer |
|
||||
| NaiveBatchedExperts | Batched | Reference Batched Experts implementation. Primarily used in unit tests. |
|
||||
|
@ -1,119 +0,0 @@
|
||||
# Fused MoE Kernel features
|
||||
|
||||
The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels.
|
||||
|
||||
## Fused MoE Modular All2All backends
|
||||
|
||||
There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend.
|
||||
|
||||
The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support.
|
||||
|
||||
The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.
|
||||
|
||||
The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16.
|
||||
|
||||
Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step).
|
||||
|
||||
Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag.
|
||||
|
||||
unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP.
|
||||
|
||||
<style>
|
||||
td {
|
||||
padding: 0.5rem !important;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
th {
|
||||
padding: 0.5rem !important;
|
||||
min-width: 0 !important;
|
||||
}
|
||||
</style>
|
||||
|
||||
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Sub-class |
|
||||
|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
|
||||
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
|
||||
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
||||
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
||||
| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
|
||||
| flashinfer<sup>4</sup> | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
|
||||
| flashinfer<sup>4</sup> | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
|
||||
| MoEPrepareAndFinalizeNoEP<sup>5</sup> | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
|
||||
| BatchedPrepareAndFinalize<sup>5</sup> | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |
|
||||
|
||||
!!! info "Table key"
|
||||
1. All types: mxfp4, nvfp4, int4, int8, fp8
|
||||
2. A,T quantization occurs after dispatch.
|
||||
3. All quantization happens after dispatch.
|
||||
4. Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency")
|
||||
5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API.
|
||||
6. This depends on the experts implementation.
|
||||
|
||||
---
|
||||
|
||||
- G - Grouped
|
||||
- G(N) - Grouped w/block size N
|
||||
- A - Per activation token
|
||||
- T - Per tensor
|
||||
|
||||
Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
|
||||
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
|
||||
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
|
||||
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
|
||||
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
|
||||
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
|
||||
|
||||
## Fused MoE Experts Kernels
|
||||
|
||||
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
|
||||
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
|
||||
|
||||
Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
|
||||
|
||||
Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results.
|
||||
|
||||
As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts.
|
||||
|
||||
Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`.
|
||||
|
||||
To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats.
|
||||
|
||||
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|
||||
|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
||||
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
|
||||
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||
| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_moe_impl] |
|
||||
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
|
||||
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
|
||||
|
||||
!!! info "Table key"
|
||||
1. All types: mxfp4, nvfp4, int4, int8, fp8
|
||||
2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params
|
||||
3. uint4, uint8, fp8, fp4
|
||||
4. This is a naive implementation of experts that supports batched format. Mainly used for testing.
|
||||
5. The `activation` parameter is ignored and SwiGlu is used by default instead.
|
||||
6. Only handled by or supported when used with modular kernels.
|
||||
|
||||
## Modular Kernel "families"
|
||||
|
||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
||||
|
||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||
| deepep_high_throughput,</br>pplx | `DeepEPHTPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8` |
|
||||
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8` |
|
||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
@ -16,7 +16,7 @@ vLLM will take all the available factors into consideration, and decide a direct
|
||||
|
||||
The factors considered include:
|
||||
|
||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
|
||||
- All the related configs (see the `compute_hash` functions in the [config.py](gh-file:vllm/config.py))
|
||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||
|
||||
|
@ -52,7 +52,7 @@ th:not(:first-child) {
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
|
@ -6,10 +6,6 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup
|
||||
We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
|
||||
and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
|
||||
|
||||
!!! tip
|
||||
When serving multi-modal models, consider setting `--allowed-media-domains` to restrict domain that vLLM can access to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`
|
||||
This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
|
||||
@ -428,7 +424,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||
|
||||
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
|
||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
|
@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
|
||||
Install the NIXL library: `uv pip install nixl`, as a quick start.
|
||||
|
||||
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
|
||||
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
|
||||
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
|
||||
|
||||
### Transport Configuration
|
||||
|
||||
@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
||||
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
|
||||
|
||||
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||
- Default: 480
|
||||
- Default: 120
|
||||
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
|
||||
|
||||
## Multi-Instance Setup
|
||||
@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
|
||||
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
|
||||
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
|
||||
|
@ -6,11 +6,7 @@ This quantization method is particularly useful for reducing model size while ma
|
||||
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
|
||||
|
||||
!!! note
|
||||
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
|
||||
|
||||
!!! warning
|
||||
**Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
|
||||
Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.
|
||||
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
|
@ -6,17 +6,6 @@ vLLM supports the generation of structured outputs using
|
||||
This document shows you some examples of the different options that are
|
||||
available to generate structured outputs.
|
||||
|
||||
!!! warning
|
||||
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
|
||||
|
||||
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
|
||||
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
|
||||
- `guided_choice` -> `{"structured_outputs": {"choice": ...}}` or `StructuredOutputsParams(choice=...)`
|
||||
- `guided_grammar` -> `{"structured_outputs": {"grammar": ...}}` or `StructuredOutputsParams(grammar=...)`
|
||||
- `guided_whitespace_pattern` -> `{"structured_outputs": {"whitespace_pattern": ...}}` or `StructuredOutputsParams(whitespace_pattern=...)`
|
||||
- `structural_tag` -> `{"structured_outputs": {"structural_tag": ...}}` or `StructuredOutputsParams(structural_tag=...)`
|
||||
- `guided_decoding_backend` -> Remove this field from your request
|
||||
|
||||
## Online Serving (OpenAI API)
|
||||
|
||||
You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
|
||||
|
@ -310,23 +310,12 @@ Flags:
|
||||
* For non-reasoning: `--tool-call-parser hunyuan_a13b`
|
||||
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
|
||||
|
||||
### LongCat-Flash-Chat Models (`longcat`)
|
||||
|
||||
Supported models:
|
||||
|
||||
* `meituan-longcat/LongCat-Flash-Chat`
|
||||
* `meituan-longcat/LongCat-Flash-Chat-FP8`
|
||||
|
||||
Flags: `--tool-call-parser longcat`
|
||||
|
||||
### GLM-4.5 Models (`glm45`)
|
||||
|
||||
Supported models:
|
||||
|
||||
* `zai-org/GLM-4.5`
|
||||
* `zai-org/GLM-4.5-Air`
|
||||
* `zai-org/GLM-4.6`
|
||||
* `zai-org/GLM-4.6-Air`
|
||||
* `ZhipuAI/GLM-4.5`
|
||||
* `ZhipuAI/GLM-4.5-Air`
|
||||
|
||||
Flags: `--tool-call-parser glm45`
|
||||
|
||||
|
@ -25,4 +25,3 @@ The backends below live **outside** the main `vllm` repository and follow the
|
||||
| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
|
||||
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
|
||||
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
|
||||
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
|
||||
|
@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source.
|
||||
Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
|
||||
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
uv pip install -v \
|
||||
--torch-backend auto \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
uv pip install dist/*.whl
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
pip install -v \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
pip install dist/*.whl
|
||||
```
|
||||
|
@ -20,80 +20,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
# --8<-- [start:build-wheel-from-source]
|
||||
|
||||
Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
|
||||
|
||||
```bash
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
```
|
||||
|
||||
Clone the vLLM project:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git vllm_source
|
||||
cd vllm_source
|
||||
```
|
||||
|
||||
Install the required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
|
||||
uv pip install -r requirements/cpu.txt --torch-backend cpu
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Build and install vLLM:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
|
||||
```
|
||||
|
||||
If you want to develop vLLM, install it in editable mode instead.
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
Optionally, build a portable wheel which you can then install elsewhere:
|
||||
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu uv build --wheel
|
||||
```
|
||||
|
||||
```bash
|
||||
uv pip install dist/*.whl
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
|
||||
```
|
||||
|
||||
```bash
|
||||
pip install dist/*.whl
|
||||
```
|
||||
|
||||
!!! example "Troubleshooting"
|
||||
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
|
||||
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
|
||||
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
|
||||
```toml title="pyproject.toml"
|
||||
[build-system]
|
||||
requires = [
|
||||
"cmake>=3.26.1",
|
||||
...
|
||||
"torch==X.Y.Z+cpu" # <-------
|
||||
]
|
||||
```
|
||||
- If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
|
||||
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
|
||||
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
@ -130,4 +57,4 @@ docker run --rm \
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
|
@ -32,9 +32,8 @@ def auto_mock(module, attr, max_mocks=50):
|
||||
for _ in range(max_mocks):
|
||||
try:
|
||||
# First treat attr as an attr, then as a submodule
|
||||
with patch("importlib.metadata.version", return_value="0.0.0"):
|
||||
return getattr(importlib.import_module(module), attr,
|
||||
importlib.import_module(f"{module}.{attr}"))
|
||||
return getattr(importlib.import_module(module), attr,
|
||||
importlib.import_module(f"{module}.{attr}"))
|
||||
except importlib.metadata.PackageNotFoundError as e:
|
||||
raise e
|
||||
except ModuleNotFoundError as e:
|
||||
@ -168,5 +167,5 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
|
||||
# Specify encoding for building on Windows
|
||||
with open(doc_path, "w", encoding="utf-8") as f:
|
||||
f.write(super(type(parser), parser).format_help())
|
||||
f.write(parser.format_help())
|
||||
logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
|
||||
|
@ -24,13 +24,6 @@ vllm serve s3://core-llm/Llama-3-8b \
|
||||
--load-format runai_streamer
|
||||
```
|
||||
|
||||
To run model from Google Cloud Storage run:
|
||||
|
||||
```bash
|
||||
vllm serve gs://core-llm/Llama-3-8b \
|
||||
--load-format runai_streamer
|
||||
```
|
||||
|
||||
To run model from a S3 compatible object store run:
|
||||
|
||||
```bash
|
||||
|
@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
|
||||
|
||||
In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
|
||||
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
|
||||
which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
|
||||
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
|
||||
|
||||
## Configuration
|
||||
|
||||
|
@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs. Support for video i
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
|
||||
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
- Pipeline parallel
|
||||
- Tensor parallel
|
||||
@ -367,7 +367,7 @@ th {
|
||||
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -396,6 +396,7 @@ th {
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -403,11 +404,12 @@ th {
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -426,7 +428,6 @@ th {
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
@ -626,29 +627,7 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
|
||||
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
|
||||
|
||||
!!! note
|
||||
vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
|
||||
If you wish to use a model with LoRA in the multi-modal encoder,
|
||||
please merge the weights into the base model first before running it in vLLM like a regular model.
|
||||
|
||||
```python
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor
|
||||
|
||||
def merge_and_save(model_id: str, output_dir: str):
|
||||
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
model_id,
|
||||
config=PeftConfig.from_pretrained(model_id),
|
||||
)
|
||||
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
|
||||
model._hf_peft_config_loaded = False # Needed to save the merged model
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
|
||||
model.save_pretrained(output_dir)
|
||||
processor.save_pretrained(output_dir)
|
||||
```
|
||||
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
|
||||
|
||||
### Generative Models
|
||||
|
||||
@ -827,8 +806,8 @@ The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
|
@ -236,32 +236,10 @@ The following extra parameters are supported:
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
|
||||
|
||||
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
which will be treated as a single prompt to the model.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
)
|
||||
```
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
@ -276,7 +254,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
--chat-template examples/template_vlm2vec.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
@ -284,36 +262,34 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: <gh-file:examples/template_vlm2vec_phi3v.jinja>
|
||||
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="EMPTY",
|
||||
)
|
||||
import requests
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
messages=[
|
||||
{
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
encoding_format="float",
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
```
|
||||
|
||||
=== "DSE-Qwen2-MRL"
|
||||
@ -375,92 +351,13 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
||||
<!-- TODO: api enforced limits + uploading audios -->
|
||||
|
||||
#### API Enforced Limits
|
||||
|
||||
Set the maximum audio file size (in MB) that VLLM will accept, via the
|
||||
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB.
|
||||
|
||||
#### Uploading Audio Files
|
||||
|
||||
The Transcriptions API supports uploading audio files in various formats including FLAC, MP3, MP4, MPEG, MPGA, M4A, OGG, WAV, and WEBM.
|
||||
|
||||
**Using OpenAI Python Client:**
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
|
||||
# Upload audio file from disk
|
||||
with open("audio.mp3", "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
model="openai/whisper-large-v3-turbo",
|
||||
file=audio_file,
|
||||
language="en",
|
||||
response_format="verbose_json"
|
||||
)
|
||||
|
||||
print(transcription.text)
|
||||
```
|
||||
|
||||
**Using curl with multipart/form-data:**
|
||||
|
||||
??? code
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
|
||||
-H "Authorization: Bearer token-abc123" \
|
||||
-F "file=@audio.mp3" \
|
||||
-F "model=openai/whisper-large-v3-turbo" \
|
||||
-F "language=en" \
|
||||
-F "response_format=verbose_json"
|
||||
```
|
||||
|
||||
**Supported Parameters:**
|
||||
|
||||
- `file`: The audio file to transcribe (required)
|
||||
- `model`: The model to use for transcription (required)
|
||||
- `language`: The language code (e.g., "en", "zh") (optional)
|
||||
- `prompt`: Optional text to guide the transcription style (optional)
|
||||
- `response_format`: Format of the response ("json", "text") (optional)
|
||||
- `temperature`: Sampling temperature between 0 and 1 (optional)
|
||||
|
||||
For the complete list of supported parameters including sampling parameters and vLLM extensions, see the [protocol definitions](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L2182).
|
||||
|
||||
**Response Format:**
|
||||
|
||||
For `verbose_json` response format:
|
||||
|
||||
??? code
|
||||
|
||||
```json
|
||||
{
|
||||
"text": "Hello, this is a transcription of the audio file.",
|
||||
"language": "en",
|
||||
"duration": 5.42,
|
||||
"segments": [
|
||||
{
|
||||
"id": 0,
|
||||
"seek": 0,
|
||||
"start": 0.0,
|
||||
"end": 2.5,
|
||||
"text": "Hello, this is a transcription",
|
||||
"tokens": [50364, 938, 428, 307, 275, 28347],
|
||||
"temperature": 0.0,
|
||||
"avg_logprob": -0.245,
|
||||
"compression_ratio": 1.235,
|
||||
"no_speech_prob": 0.012
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Extra Parameters
|
||||
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
@ -1,54 +1,12 @@
|
||||
# Transformers Reinforcement Learning
|
||||
|
||||
[Transformers Reinforcement Learning](https://huggingface.co/docs/trl) (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
|
||||
Transformers Reinforcement Learning (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
|
||||
|
||||
Online methods such as GRPO or Online DPO require the model to generate completions. vLLM can be used to generate these completions!
|
||||
|
||||
See the [vLLM integration guide](https://huggingface.co/docs/trl/main/en/vllm_integration) in the TRL documentation for more information.
|
||||
|
||||
TRL currently supports the following online trainers with vLLM:
|
||||
|
||||
- [GRPO](https://huggingface.co/docs/trl/main/en/grpo_trainer)
|
||||
- [Online DPO](https://huggingface.co/docs/trl/main/en/online_dpo_trainer)
|
||||
- [RLOO](https://huggingface.co/docs/trl/main/en/rloo_trainer)
|
||||
- [Nash-MD](https://huggingface.co/docs/trl/main/en/nash_md_trainer)
|
||||
- [XPO](https://huggingface.co/docs/trl/main/en/xpo_trainer)
|
||||
|
||||
To enable vLLM in TRL, set the `use_vllm` flag in the trainer configuration to `True`.
|
||||
|
||||
## Modes of Using vLLM During Training
|
||||
|
||||
TRL supports **two modes** for integrating vLLM during training: **server mode** and **colocate mode**. You can control how vLLM operates during training with the `vllm_mode` parameter.
|
||||
|
||||
### Server mode
|
||||
|
||||
In **server mode**, vLLM runs as an independent process on dedicated GPUs and communicates with the trainer through HTTP requests. This configuration is ideal when you have separate GPUs for inference, as it isolates generation workloads from training, ensuring stable performance and easier scaling.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="server", # default value, can be omitted
|
||||
)
|
||||
```
|
||||
|
||||
### Colocate mode
|
||||
|
||||
In **colocate mode**, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="colocate",
|
||||
)
|
||||
```
|
||||
|
||||
Some trainers also support **vLLM sleep mode**, which offloads parameters and caches to GPU RAM during training, helping reduce memory usage. Learn more in the [memory optimization docs](https://huggingface.co/docs/trl/main/en/reducing_memory_usage#vllm-sleep-mode).
|
||||
See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information.
|
||||
|
||||
!!! info
|
||||
For detailed configuration options and flags, refer to the documentation of the specific trainer you are using.
|
||||
For more information on the `use_vllm` flag you can provide to the configs of these online methods, see:
|
||||
- [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm)
|
||||
- [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm)
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Using vLLM
|
||||
|
||||
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
|
||||
First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
|
||||
|
||||
Then, vLLM supports the following usage patterns:
|
||||
|
||||
|
@ -60,12 +60,6 @@ Key points from the PyTorch security guide:
|
||||
- Implement proper authentication and authorization for management interfaces
|
||||
- Follow the principle of least privilege for all system components
|
||||
|
||||
### 4. **Restrict Domains Access for Media URLs:**
|
||||
|
||||
Restrict domains that vLLM can access for media URLs by setting
|
||||
`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks.
|
||||
(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`)
|
||||
|
||||
## Security and Firewalls: Protecting Exposed vLLM Systems
|
||||
|
||||
While vLLM is designed to allow unsafe network services to be isolated to
|
||||
|
@ -87,7 +87,6 @@ def main(args: dict):
|
||||
use_tqdm=False,
|
||||
chat_template=chat_template,
|
||||
)
|
||||
print_outputs(outputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,30 +0,0 @@
|
||||
# KV Load Failure Recovery Test
|
||||
|
||||
This example builds upon the `disaggregated-prefill-v1` example in `examples/offline_inference`.
|
||||
|
||||
It demonstrates vLLM's ability to recover from KV load failures in both synchronous and asynchronous loading modes. The goal is to verify that vLLM correctly identifies invalid KV blocks, reschedules the affected requests, and ensures successful and consistent output.
|
||||
|
||||
## Files
|
||||
|
||||
- `prefill_example.py` – performs the prefill stage and saves KV data (same as in `disaggregated-prefill-v1`).
|
||||
- `decode_example.py` – performs the decode stage. Accepts:
|
||||
- `--simulate-failure`: simulates KV load failure using a custom connector.
|
||||
- `--async-load`: enables asynchronous KV loading mode.
|
||||
- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
|
||||
- `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
|
||||
1. Normal decode (baseline).
|
||||
2. Decode with simulated sync KV load failure.
|
||||
3. Decode with simulated async KV load failure.
|
||||
|
||||
Finally, it compares the output of the baseline with the recovered outputs to verify correctness.
|
||||
|
||||
## How It Works
|
||||
|
||||
- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
|
||||
- The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
|
||||
- If recovery fails, the script prints a unified diff of the output mismatch and exits with error.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
./run.sh
|
@ -1,85 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
|
||||
def read_prompts():
|
||||
"""Read prompts from prefill_output.txt"""
|
||||
prompts = []
|
||||
try:
|
||||
with open("prefill_output.txt") as f:
|
||||
for line in f:
|
||||
prompts.append(line.strip())
|
||||
print(f"Loaded {len(prompts)} prompts from prefill_output.txt")
|
||||
return prompts
|
||||
except FileNotFoundError:
|
||||
print("Error: prefill_output.txt file not found")
|
||||
exit(-1)
|
||||
|
||||
|
||||
def main():
|
||||
prompts = read_prompts()
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--simulate-failure", action="store_true", help="Simulate KV load failure."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--async-load", action="store_true", help="Simulate async KV load"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.simulate_failure:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="RogueSharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
"async_load": args.async_load,
|
||||
},
|
||||
kv_connector_module_path="rogue_shared_storage_connector",
|
||||
)
|
||||
out_file = (
|
||||
"async_decode_recovered_output.txt"
|
||||
if args.async_load
|
||||
else "sync_decode_recovered_output.txt"
|
||||
)
|
||||
else:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
},
|
||||
)
|
||||
out_file = "decode_output.txt"
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=16,
|
||||
kv_transfer_config=ktc,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
sep_str = "-" * 30
|
||||
with open(out_file, "w", encoding="utf-8") as f:
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
out_str = f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}"
|
||||
print(out_str)
|
||||
print(sep_str)
|
||||
f.write(out_str)
|
||||
f.write(sep_str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -1,58 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
|
||||
def read_prompts():
|
||||
context = "Hi " * 1000
|
||||
context2 = "Hey " * 500
|
||||
return [
|
||||
context + "Hello, my name is",
|
||||
context + "The capital of France is",
|
||||
context2 + "Your name is",
|
||||
context2 + "The capital of China is",
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
prompts = read_prompts()
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
kv_transfer_config=KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": "local_storage"},
|
||||
),
|
||||
) # , max_model_len=2048, max_num_batched_tokens=2048)
|
||||
|
||||
# 1ST generation (prefill instance)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
)
|
||||
|
||||
new_prompts = []
|
||||
print("-" * 30)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
new_prompts.append(prompt + generated_text)
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 30)
|
||||
|
||||
# Write new_prompts to prefill_output.txt
|
||||
with open("prefill_output.txt", "w") as f:
|
||||
for prompt in new_prompts:
|
||||
f.write(prompt + "\n")
|
||||
print(f"Saved {len(new_prompts)} prompts to prefill_output.txt")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -1,145 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
|
||||
SharedStorageConnector,
|
||||
SharedStorageConnectorMetadata,
|
||||
)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.request import Request
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
logger = logging.getLogger()
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
|
||||
req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
|
||||
|
||||
@classmethod
|
||||
def from_base(cls, base: SharedStorageConnectorMetadata):
|
||||
return cls(requests=base.requests)
|
||||
|
||||
|
||||
class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
|
||||
super().__init__(vllm_config=vllm_config, role=role)
|
||||
self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
|
||||
"async_load", False
|
||||
)
|
||||
self._invalid_block_ids: set = None
|
||||
self._seen_requests: set = set()
|
||||
self._req_to_block_ids: dict[str, list[int]] = dict()
|
||||
|
||||
def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
|
||||
assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
|
||||
index, failed_request = next(
|
||||
(
|
||||
(i, x)
|
||||
for i, x in enumerate(connector_metadata.requests)
|
||||
if not x.is_store
|
||||
),
|
||||
(None, None),
|
||||
)
|
||||
if index is not None:
|
||||
del connector_metadata.requests[index]
|
||||
self._invalid_block_ids = set(
|
||||
(
|
||||
failed_request.slot_mapping[:: self._block_size] // self._block_size
|
||||
).tolist()
|
||||
)
|
||||
logger.info(
|
||||
"Simulating failure to load all KV blocks for the "
|
||||
"first load request. Total blocks: %d",
|
||||
len(self._invalid_block_ids),
|
||||
)
|
||||
super().bind_connector_metadata(connector_metadata)
|
||||
|
||||
def clear_connector_metadata(self) -> None:
|
||||
self._invalid_block_ids = None
|
||||
super().clear_connector_metadata()
|
||||
|
||||
def start_load_kv(self, forward_context: ForwardContext, **kwargs) -> None:
|
||||
if self._async_load and forward_context.attn_metadata is None:
|
||||
# Bypass sanity check in super().start_load_kv
|
||||
forward_context.attn_metadata = "None"
|
||||
|
||||
super().start_load_kv(forward_context, **kwargs)
|
||||
|
||||
def get_finished(
|
||||
self, finished_req_ids: set[str]
|
||||
) -> tuple[Optional[set[str]], Optional[set[str]]]:
|
||||
if self._async_load:
|
||||
meta = self._get_connector_metadata()
|
||||
assert isinstance(meta, RogueSharedStorageConnectorMetadata)
|
||||
if meta.req_to_block_ids:
|
||||
return None, set(meta.req_to_block_ids)
|
||||
|
||||
return None, None
|
||||
|
||||
def get_block_ids_with_load_errors(self) -> set[int]:
|
||||
return self._invalid_block_ids
|
||||
|
||||
def get_num_new_matched_tokens(
|
||||
self,
|
||||
request: Request,
|
||||
num_computed_tokens: int,
|
||||
) -> tuple[int, bool]:
|
||||
if request.request_id in self._seen_requests:
|
||||
return 0, False
|
||||
|
||||
self._seen_requests.add(request.request_id)
|
||||
|
||||
num_tokens, _ = super().get_num_new_matched_tokens(request, num_computed_tokens)
|
||||
return num_tokens, self._async_load and num_tokens > 0
|
||||
|
||||
def update_state_after_alloc(
|
||||
self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
|
||||
):
|
||||
"""
|
||||
Update KVConnector state after block allocation.
|
||||
|
||||
If blocks were allocated, add to _requests_need_load,
|
||||
such that we load the KVs in the next forward pass.
|
||||
"""
|
||||
super().update_state_after_alloc(request, blocks, num_external_tokens)
|
||||
|
||||
if num_external_tokens > 0:
|
||||
self._req_to_block_ids[request.request_id] = blocks.get_block_ids()[0]
|
||||
|
||||
def build_connector_meta(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
) -> KVConnectorMetadata:
|
||||
if not self._async_load:
|
||||
base = super().build_connector_meta(scheduler_output)
|
||||
meta = RogueSharedStorageConnectorMetadata.from_base(base)
|
||||
else:
|
||||
meta = RogueSharedStorageConnectorMetadata()
|
||||
if self._requests_need_load:
|
||||
for req_id, request in self._requests_need_load.items():
|
||||
meta.add_request(
|
||||
token_ids=request.prompt_token_ids,
|
||||
block_ids=self._req_to_block_ids[req_id],
|
||||
block_size=self._block_size,
|
||||
is_store=False,
|
||||
mm_hashes=[],
|
||||
)
|
||||
# Clear state
|
||||
self._requests_need_load.clear()
|
||||
meta.req_to_block_ids = self._req_to_block_ids
|
||||
self._req_to_block_ids = dict()
|
||||
return meta
|
@ -1,33 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Constants
|
||||
SHARED_STORAGE_DIR="local_storage"
|
||||
PREFILL_OUTPUT="prefill_output.txt"
|
||||
DECODE_OUTPUT="decode_output.txt"
|
||||
SYNC_DECODE_RECOVERED_OUTPUT="sync_decode_recovered_output.txt"
|
||||
ASYNC_DECODE_RECOVERED_OUTPUT="async_decode_recovered_output.txt"
|
||||
|
||||
# Cleanup
|
||||
rm -rf "$SHARED_STORAGE_DIR"
|
||||
rm -f "$PREFILL_OUTPUT" "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
|
||||
|
||||
# Run inference examples
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure --async-load
|
||||
|
||||
# Compare outputs
|
||||
if ! cmp -s "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"; then
|
||||
echo "❌ Outputs differ: sync recovery failed."
|
||||
diff -u "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! cmp -s "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"; then
|
||||
echo "❌ Outputs differ: async recovery failed."
|
||||
diff -u "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✅ Outputs match: recovery successful."
|
@ -54,7 +54,6 @@ def parse_args():
|
||||
"--method",
|
||||
type=str,
|
||||
default="eagle",
|
||||
choices=["ngram", "eagle", "eagle3", "mtp"],
|
||||
)
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
@ -119,9 +118,9 @@ def main(args):
|
||||
"prompt_lookup_max": args.prompt_lookup_max,
|
||||
"prompt_lookup_min": args.prompt_lookup_min,
|
||||
}
|
||||
elif args.method == "mtp":
|
||||
elif args.method.endswith("mtp"):
|
||||
speculative_config = {
|
||||
"method": "mtp",
|
||||
"method": args.method,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
}
|
||||
else:
|
||||
|
@ -4,11 +4,6 @@
|
||||
experimental support for data-parallel inference with torchrun
|
||||
Note the data load balancing and distribution is done out of the vllm engine,
|
||||
no internal lb supported in external_launcher mode.
|
||||
|
||||
To run this example:
|
||||
```bash
|
||||
$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
|
||||
```
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -19,7 +14,7 @@ prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
] * 50
|
||||
|
||||
# Create sampling parameters, the same across all ranks
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
@ -50,13 +45,14 @@ prompts = [
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
|
||||
# all ranks will have the same outputs
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(
|
||||
f"DP Rank: {dp_rank} Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n"
|
||||
)
|
||||
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
|
||||
print("-" * 50)
|
||||
"""
|
||||
Further tips:
|
||||
|
||||
|
@ -10,7 +10,6 @@ on HuggingFace model repository.
|
||||
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
@ -20,9 +19,6 @@ from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent
|
||||
EXAMPLES_DIR = ROOT_DIR / "examples"
|
||||
|
||||
|
||||
class TextQuery(TypedDict):
|
||||
modality: Literal["text"]
|
||||
@ -86,27 +82,23 @@ def run_e5_v(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
|
||||
def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
image = query["image"]
|
||||
elif query["modality"] == "text+image":
|
||||
text = query["text"]
|
||||
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
|
||||
prompt = (
|
||||
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
|
||||
)
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: {modality!r}")
|
||||
|
||||
return prompt, image
|
||||
|
||||
|
||||
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
|
||||
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
@ -124,66 +116,6 @@ def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
|
||||
# vLLM does not support LoRA adapters on multi-modal encoder,
|
||||
# so we merge the weights first
|
||||
from huggingface_hub.constants import HF_HUB_CACHE
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor
|
||||
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
|
||||
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
|
||||
|
||||
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
model_id,
|
||||
config=PeftConfig.from_pretrained(model_id),
|
||||
)
|
||||
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
|
||||
model._hf_peft_config_loaded = False # Needed to save the merged model
|
||||
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_id,
|
||||
# `min_pixels` and `max_pixels` are deprecated
|
||||
size={"shortest_edge": 3136, "longest_edge": 12845056},
|
||||
)
|
||||
processor.chat_template = load_chat_template(
|
||||
# The original chat template is not correct
|
||||
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
|
||||
)
|
||||
|
||||
merged_path = str(
|
||||
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
|
||||
)
|
||||
print(f"Saving merged model to {merged_path}...")
|
||||
print(
|
||||
"NOTE: This directory is not tracked by `huggingface_hub` "
|
||||
"so you have to delete this manually if you don't want it anymore."
|
||||
)
|
||||
model.save_pretrained(merged_path)
|
||||
processor.save_pretrained(merged_path)
|
||||
print("Done!")
|
||||
|
||||
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=merged_path,
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
if query["modality"] != "text+images":
|
||||
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
|
||||
@ -300,8 +232,7 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
|
||||
|
||||
model_example_map = {
|
||||
"e5_v": run_e5_v,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
}
|
||||
|
||||
@ -315,7 +246,7 @@ def parse_args():
|
||||
"--model-name",
|
||||
"-m",
|
||||
type=str,
|
||||
default="vlm2vec_phi3v",
|
||||
default="vlm2vec",
|
||||
choices=model_example_map.keys(),
|
||||
help="The name of the embedding model.",
|
||||
)
|
||||
|
@ -11,9 +11,9 @@ vLLM performance and metrics.
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **performance_statistics.json**: Tracks performance metrics including latency and
|
||||
- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
|
||||
throughput for your vLLM service.
|
||||
- **query_statistics.json**: Tracks query performance, request volume, and key
|
||||
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
|
||||
performance indicators for your vLLM service.
|
||||
|
||||
## Deployment Options
|
||||
|
@ -21,9 +21,9 @@ deployment methods:
|
||||
|
||||
## Dashboard Descriptions
|
||||
|
||||
- **performance_statistics.yaml**: Performance metrics with aggregated latency
|
||||
- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
|
||||
statistics
|
||||
- **query_statistics.yaml**: Query performance and deployment metrics
|
||||
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
|
||||
|
||||
## Deployment Options
|
||||
|
||||
|
@ -38,13 +38,11 @@ client = OpenAI(
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
headers = {"User-Agent": "vLLM Example Client"}
|
||||
|
||||
|
||||
def encode_base64_content_from_url(content_url: str) -> str:
|
||||
"""Encode a content retrieved from a remote url to base64 format."""
|
||||
|
||||
with requests.get(content_url, headers=headers) as response:
|
||||
with requests.get(content_url) as response:
|
||||
response.raise_for_status()
|
||||
result = base64.b64encode(response.content).decode("utf-8")
|
||||
|
||||
@ -52,19 +50,19 @@ def encode_base64_content_from_url(content_url: str) -> str:
|
||||
|
||||
|
||||
# Text-only inference
|
||||
def run_text_only(model: str, max_completion_tokens: int) -> None:
|
||||
def run_text_only(model: str) -> None:
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[{"role": "user", "content": "What's the capital of France?"}],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion.choices[0].message.content
|
||||
print("Chat completion output:\n", result)
|
||||
print("Chat completion output:", result)
|
||||
|
||||
|
||||
# Single-image input inference
|
||||
def run_single_image(model: str, max_completion_tokens: int) -> None:
|
||||
def run_single_image(model: str) -> None:
|
||||
## Use image url in the payload
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -81,11 +79,11 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from image url:\n", result)
|
||||
print("Chat completion output from image url:", result)
|
||||
|
||||
## Use base64 encoded image in the payload
|
||||
image_base64 = encode_base64_content_from_url(image_url)
|
||||
@ -103,7 +101,7 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
@ -111,7 +109,7 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
|
||||
|
||||
|
||||
# Multi-image input inference
|
||||
def run_multi_image(model: str, max_completion_tokens: int) -> None:
|
||||
def run_multi_image(model: str) -> None:
|
||||
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -132,15 +130,15 @@ def run_multi_image(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output:\n", result)
|
||||
print("Chat completion output:", result)
|
||||
|
||||
|
||||
# Video input inference
|
||||
def run_video(model: str, max_completion_tokens: int) -> None:
|
||||
def run_video(model: str) -> None:
|
||||
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||
video_base64 = encode_base64_content_from_url(video_url)
|
||||
|
||||
@ -159,11 +157,11 @@ def run_video(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from video url:\n", result)
|
||||
print("Chat completion output from image url:", result)
|
||||
|
||||
## Use base64 encoded video in the payload
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
@ -180,15 +178,15 @@ def run_video(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from base64 encoded video:\n", result)
|
||||
print("Chat completion output from base64 encoded image:", result)
|
||||
|
||||
|
||||
# Audio input inference
|
||||
def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||
def run_audio(model: str) -> None:
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
audio_url = AudioAsset("winning_call").url
|
||||
@ -213,11 +211,11 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from input audio:\n", result)
|
||||
print("Chat completion output from input audio:", result)
|
||||
|
||||
# HTTP URL
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -237,11 +235,11 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from audio url:\n", result)
|
||||
print("Chat completion output from audio url:", result)
|
||||
|
||||
# base64 URL
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
@ -261,14 +259,14 @@ def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from base64 encoded audio:\n", result)
|
||||
print("Chat completion output from base64 encoded audio:", result)
|
||||
|
||||
|
||||
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
|
||||
def run_multi_audio(model: str) -> None:
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
# Two different audios to showcase batched inference.
|
||||
@ -302,11 +300,11 @@ def run_multi_audio(model: str, max_completion_tokens: int) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from input audio:\n", result)
|
||||
print("Chat completion output from input audio:", result)
|
||||
|
||||
|
||||
example_function_map = {
|
||||
@ -332,20 +330,13 @@ def parse_args():
|
||||
choices=list(example_function_map.keys()),
|
||||
help="Conversation type with multimodal data.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-completion-tokens",
|
||||
"-n",
|
||||
type=int,
|
||||
default=128,
|
||||
help="Maximum number of tokens to generate for each completion.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args) -> None:
|
||||
chat_type = args.chat_type
|
||||
model = get_first_model(client)
|
||||
example_function_map[chat_type](model, args.max_completion_tokens)
|
||||
example_function_map[chat_type](model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -4,137 +4,69 @@
|
||||
"""Example Python client for multimodal embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported multimodal embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
from typing import Literal, Union
|
||||
|
||||
from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
"""
|
||||
Convenience function for accessing vLLM's Chat Embeddings API,
|
||||
which is an extension of OpenAI's existing Embeddings API.
|
||||
"""
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
def vlm2vec():
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
|
||||
|
||||
def run_vlm2vec(client: OpenAI, model: str):
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Represent the given image with the following question: What is in the image.",
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image+Text embedding output:", response.data[0].embedding)
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "A cat and a dog"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
def dse_qwen2_vl(inp: dict):
|
||||
# Embedding an Image
|
||||
if inp["type"] == "image":
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
"url": inp["image_url"],
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What is shown in this image?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
]
|
||||
# Embedding a Text Query
|
||||
else:
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
@ -144,21 +76,23 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
"url": f"data:image/jpeg;base64,{image_placeholder}",
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "Query: What is the weather like today?"},
|
||||
{"type": "text", "text": f"Query: {inp['content']}"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "MrLight/dse-qwen2-2b-mrl-v1",
|
||||
"messages": messages,
|
||||
"encoding_format": "float",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"dse_qwen2_vl": run_dse_qwen2_vl,
|
||||
}
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
|
||||
|
||||
def parse_args():
|
||||
@ -169,24 +103,29 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
choices=model_example_map.keys(),
|
||||
choices=["vlm2vec", "dse_qwen2_vl"],
|
||||
required=True,
|
||||
help="The name of the embedding model.",
|
||||
help="Which model to call.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model_id = models.data[0].id
|
||||
|
||||
model_example_map[args.model](client, model_id)
|
||||
if args.model == "vlm2vec":
|
||||
vlm2vec()
|
||||
elif args.model == "dse_qwen2_vl":
|
||||
dse_qwen2_vl(
|
||||
{
|
||||
"type": "image",
|
||||
"image_url": image_url,
|
||||
}
|
||||
)
|
||||
dse_qwen2_vl(
|
||||
{
|
||||
"type": "text",
|
||||
"content": "What is the weather like today?",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,15 +0,0 @@
|
||||
{%- if messages | length > 1 -%}
|
||||
{{ raise_exception('Embedding models should only embed one message at a time') }}
|
||||
{%- endif -%}
|
||||
|
||||
{% set vars = namespace(parts=[]) %}
|
||||
{%- for message in messages -%}
|
||||
{%- for content in message['content'] -%}
|
||||
{%- if content['type'] == 'text' -%}
|
||||
{%- set vars.parts = vars.parts + [content['text']] %}
|
||||
{%- elif content['type'] == 'image' -%}
|
||||
{%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endfor -%}
|
||||
{{ vars.parts | join(' ') }}
|
@ -102,6 +102,7 @@ plugins:
|
||||
- https://numpy.org/doc/stable/objects.inv
|
||||
- https://pytorch.org/docs/stable/objects.inv
|
||||
- https://psutil.readthedocs.io/en/stable/objects.inv
|
||||
- https://huggingface.co/docs/transformers/main/en/objects.inv
|
||||
|
||||
markdown_extensions:
|
||||
- attr_list
|
||||
|
@ -126,7 +126,6 @@ markers = [
|
||||
"core_model: enable this model test in each PR instead of only nightly",
|
||||
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
|
||||
"cpu_model: enable this model test in CPU tests",
|
||||
"cpu_test: mark test as CPU-only test",
|
||||
"split: run this test as part of a split",
|
||||
"distributed: run this test only in distributed GPU tests",
|
||||
"skip_v1: do not run this test with v1",
|
||||
|
@ -1,11 +1,12 @@
|
||||
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
|
||||
# see https://github.com/pytorch/pytorch/pull/151218
|
||||
cmake>=3.26.1
|
||||
ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
||||
torch==2.6.0+cpu
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
|
@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||
torch==2.8.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
|
||||
@ -23,7 +23,7 @@ datasets # for benchmark scripts
|
||||
|
||||
# Intel Extension for PyTorch, only for x86_64 CPUs
|
||||
intel-openmp==2024.2.1; platform_machine == "x86_64"
|
||||
intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64"
|
||||
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
|
||||
|
||||
# Use this to gather CPU info and optimize based on ARM Neoverse cores
|
||||
|
@ -29,8 +29,8 @@ opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
mteb>=1.38.11, <2 # required for mteb test
|
||||
transformers==4.56.2
|
||||
tokenizers==0.22.0
|
||||
transformers==4.52.4
|
||||
tokenizers==0.21.1
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
bitsandbytes>=0.46.1
|
||||
@ -43,6 +43,7 @@ tritonclient==2.51.0
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer[s3,gcs]==0.14.0
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
|