Compare commits

...

22 Commits

Author SHA1 Message Date
a5dd03c1eb Revert "[V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)"
This reverts commit e202dd2736bc575b11250b15311512d19d3225d5.
2025-07-06 14:02:36 -07:00
c18b3b8e8b [Bugfix] Add use_cross_encoder flag to use correct activation in ClassifierPooler (#20527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 14:01:48 -07:00
9528e3a05e [BugFix][Spec Decode] Fix spec token ids in model runner (#20530)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-07-06 19:44:52 +00:00
9fb52e523a [V1] Support any head size for FlexAttention backend (#20467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-06 09:54:36 -07:00
e202dd2736 [V0 deprecation] Remove V0 CPU/XPU/TPU backends (#20412)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-07-06 08:48:13 -07:00
43813e6361 [Misc] call the pre-defined func (#20518)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-06 10:25:29 +00:00
cede942b87 [Benchmark] Add support for multiple batch size benchmark through CLI in benchmark_moe.py (#20516)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-07-06 09:20:11 +00:00
fe1e924811 [Frontend] Support image object in llm.chat (#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
2025-07-06 06:47:13 +00:00
4548c03c50 [TPU][Bugfix] fix the MoE OOM issue (#20339)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-07-05 21:19:09 -07:00
40b86aa05e [BugFix] Fix: ImportError when building on hopper systems (#20513)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-06 12:17:30 +08:00
432870829d [Bugfix] Fix missing per_act_token parameter in compressed_tensors_moe (#20509)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-07-06 12:08:30 +08:00
f73d02aadc [BUG] Fix #20484. Support empty sequence in cuda penalty kernel (#20491)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-07-05 19:38:02 -07:00
c5ebe040ac test_attention compat with coming xformers change (#20487)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-05 19:37:59 -07:00
8d763cb891 [Misc] remove unused import (#20517)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 19:17:06 -07:00
cf4cd53982 [Misc] Add logger.exception for TPU information collection failures (#20510)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-05 07:24:32 -07:00
32c9be2200 [v1] Re-add fp32 support to v1 engine through FlexAttention (#19754)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-05 09:41:10 +00:00
8aeaa910a2 Fix unknown attribute of topk_indices_dtype in CompressedTensorsW8A8Fp8MoECutlassMethod (#20507)
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-07-05 14:03:20 +08:00
906e05d840 [Misc] Remove the unused LoRA test code (#20494)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-05 13:48:16 +08:00
ef9a2990ae [doc] small fix (#20506)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:56:39 -07:00
7e90870491 [Misc] Add security warning for development mode endpoints (#20508)
Signed-off-by: reidliu41 <reid201711@gmail.com>
2025-07-04 20:52:13 -07:00
d3f05c9248 [Doc] fix mutltimodal_inputs.md gh examples link (#20497)
Signed-off-by: Guy Stone <guys@spotify.com>
2025-07-04 16:41:35 -07:00
c108781c85 [CI Bugfix] Fix pre-commit failures on main (#20502)
2025-07-04 14:17:30 -07:00
58 changed files with 659 additions and 228 deletions

View File

@ -107,10 +107,9 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \

.github/CODEOWNERS
View File

@ -16,7 +16,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people

View File

@ -68,7 +68,7 @@ jobs:
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |

View File

@ -620,7 +620,7 @@ def main(args: argparse.Namespace):
4096,
]
else:
batch_sizes = [args.batch_size]
batch_sizes = args.batch_size
use_deep_gemm = bool(args.use_deep_gemm)
@ -728,7 +728,7 @@ if __name__ == "__main__":
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--model-prefix", type=str, required=False)

View File

@ -239,11 +239,6 @@ void cutlass_moe_mm(
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets);
void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,

View File

@ -1,3 +1,5 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
@ -364,4 +366,9 @@ void cutlass_blockwise_scaled_grouped_mm(
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}

View File

@ -59,6 +59,8 @@ void apply_repetition_penalties_(
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);
if (num_seqs == 0) return;
// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,

View File

@ -399,8 +399,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()",
{stride_tag});
ops.impl("cutlass_blockwise_scaled_grouped_mm", torch::kCUDA,
&cutlass_blockwise_scaled_grouped_mm);
// conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM
ops.def(

View File

@ -14,7 +14,7 @@ Before setting up the incremental build:
VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
```
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.

View File

@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python
import torch

from vllm import LLM
from vllm.assets.image import ImageAsset
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)
conversation = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hello! How can I assist you today?"},
{
"role": "user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
},{
"type": "image_pil",
"image_pil": image_pil
}, {
"type": "image_embeds",
"image_embeds": image_embeds
}, {
"type": "text",
"text": "What's in these images?"
}],
},
]
# Perform inference and log output.
outputs = llm.chat(conversation)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code
@ -228,7 +271,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@ -626,9 +626,6 @@ Specified using `--task generate`.
!!! note
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
!!! note
`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
@ -671,11 +668,8 @@ Specified using `--task generate`.
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
!!! note
To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers.git`.
Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
For Qwen2.5-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`)
is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
#### Transcription

View File

@ -6,6 +6,7 @@ import argparse
from vllm import LLM
from vllm.sampling_params import SamplingParams
from vllm.assets.image import ImageAsset
# This script is an offline demo for running Mistral-Small-3.1
#
@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
)
prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "image_pil",
"image_pil": ImageAsset("cherry_blossom").pil_image,
},
],
},
]

View File

@ -98,7 +98,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
model="Salesforce/blip2-opt-2.7b",
limit_mm_per_prompt={modality: 1},
)
@ -971,7 +971,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
)
# Qwen
# Qwen-VL
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

View File

@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"
@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
"type": "image_pil",
"image_pil": ImageAsset('cherry_blossom').pil_image
}, {
"type": "text",
"text": "What's in these images?"

View File

@ -450,7 +450,8 @@ def test_multi_query_kv_attention(
start += seq_len
# xformers.AttentionBias to Tensor for use in reference impl.
alibi_bias = [
b.materialize(b.shape, device=device).squeeze() for b in attn_bias
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
for b, i in zip(attn_bias, seq_lens)
]
else:
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)

View File

@ -171,7 +171,7 @@ def test_env(
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
backend = get_attn_backend(32,
torch.float16,
torch.float16,
block_size,
@ -180,6 +180,45 @@ def test_env(
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
torch.float16,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
if device == "cpu":
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
if use_v1 else "TORCH_SDPA")
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py

View File

@ -75,3 +75,51 @@ def test_apply_repetition_penalties(
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test for checking CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties_zero_seqs() -> None:
"""
Test the apply_repetition_penalties custom op with num_seqs=0
against a reference implementation.
"""
num_seqs = 0
vocab_size = 17
repetition_penalty = 1.05
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
torch.set_default_device("cuda:0")
# Create test data
logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
# Create masks with some random tokens marked as repeated
prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
# No tokens to mark as repeated since num_seqs=0
# Create repetition penalties tensor
repetition_penalties = torch.full((num_seqs, ),
repetition_penalty,
dtype=dtype)
# Run all three implementations
logits_torch = logits.clone()
logits_cuda = logits.clone()
apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
repetition_penalties)
apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
repetition_penalties)
# Compare all outputs to reference
torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
# Test the operator by applying the opcheck utility
opcheck(torch.ops._C.apply_repetition_penalties_,
(logits.clone(), prompt_mask, output_mask, repetition_penalties))

View File

@ -249,23 +249,6 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
model_runner.model)
@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture
def reset_default_device():
"""

View File

@ -33,9 +33,6 @@ if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
# V1 Test: not enough KV cache space in C1.
"fuyu",
]
@ -221,8 +218,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)],
),
"blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported
models=["Salesforce/blip2-opt-6.7b"],
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "",
@ -340,8 +336,7 @@ VLM_TEST_SETTINGS = {
"h2ovl": VLMTestInfo(
models = [
"h2oai/h2ovl-mississippi-800m",
# TODO: Re-enable once head_dim = 80 is supported
# "h2oai/h2ovl-mississippi-2b",
"h2oai/h2ovl-mississippi-2b",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501

View File

@ -83,7 +83,7 @@ MODELS = [
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
# STABLELM_CONFIG, # enable this when v1 support head_size=80
STABLELM_CONFIG,
DOLPHIN_CONFIG,
# STARCODER_CONFIG, # broken
]

View File

@ -240,8 +240,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
trust_remote_code=True),
"PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
"PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
"Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
trust_remote_code=True,
v0_only=True),
@ -258,10 +259,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
"Qwen3ForSequenceClassification": _HfExamplesInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls"), # noqa: E501
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
v0_only=True),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
v0_only=True),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
"Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@ -330,8 +329,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501
v0_only=True),
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501
@ -359,8 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True,
v0_only=True),
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
max_model_len=10240),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",

View File

@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
model_info.check_transformers_version(on_fail="skip")
# FIXME: Possible memory leak in the previous tests?
if model_arch == "GraniteSpeechForConditionalGeneration":
if model_arch in ("GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
# Avoid OOM and reduce initialization time by only using 1 layer

View File

@ -450,6 +450,7 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} must come before the current layer"
@ -478,6 +479,7 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
invalid_layer = "model.layers.0.cross_attn.attn"
@ -506,6 +508,7 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
error_msg = f"{layer_1} cannot be the same as the current layer"
@ -534,6 +537,7 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():
def test_init_kv_cache_without_kv_sharing():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()
@ -601,6 +605,7 @@ def test_init_kv_cache_without_kv_sharing():
def test_init_kv_cache_with_kv_sharing_valid():
torch.set_default_dtype(torch.float16)
layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
vllm_config = get_vllm_config()

View File

@ -310,7 +310,8 @@ class MultiHeadAttention(nn.Module):
# currently, only torch_sdpa is supported on rocm
self.attn_backend = _Backend.TORCH_SDPA
else:
if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
_Backend.FLEX_ATTENTION):
backend = _Backend.XFORMERS
self.attn_backend = backend if backend in {

View File

@ -4,7 +4,7 @@
import os
from contextlib import contextmanager
from functools import cache
from typing import Generator, Optional, Type
from typing import Generator, Optional, Union
import torch
@ -79,6 +79,33 @@ def get_global_forced_attn_backend() -> Optional[_Backend]:
return forced_attn_backend
def supports_head_size(
attn_backend: Union[str, type[AttentionBackend]],
head_size: int,
) -> bool:
if isinstance(attn_backend, str):
try:
attn_backend = resolve_obj_by_qualname(attn_backend)
except ImportError:
return False
assert isinstance(attn_backend, type)
# TODO: Update the interface once V0 is removed
if get_supported_head_sizes := getattr(attn_backend,
"get_supported_head_sizes", None):
return head_size in get_supported_head_sizes()
if validate_head_size := getattr(attn_backend, "validate_head_size", None):
try:
validate_head_size(head_size)
return True
except Exception:
return False
raise NotImplementedError(f"{attn_backend.__name__} does not support "
"head size validation")
def get_attn_backend(
head_size: int,
dtype: torch.dtype,
@ -87,7 +114,7 @@ def get_attn_backend(
is_attention_free: bool,
is_blocksparse: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
# Accessing envs.* behind an @lru_cache decorator can cause the wrong
# value to be returned from the cache if the value changes between calls.
@ -115,7 +142,7 @@ def _cached_get_attn_backend(
is_blocksparse: bool = False,
use_v1: bool = False,
use_mla: bool = False,
) -> Type[AttentionBackend]:
) -> type[AttentionBackend]:
if is_blocksparse:
logger.info("Using BlocksparseFlashAttention backend.")
from vllm.attention.backends.blocksparse_attn import (
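
A hedged usage sketch for the new `supports_head_size` helper; the backend path matches the constant introduced in the CUDA platform hunk further down, and `head_size=80` is just an illustrative value that FlashAttention's supported list does not include:

```python
# Query whether a backend (given as a qualified class path or a class) can
# handle a head size; unsupported sizes fall back to FlexAttention elsewhere.
from vllm.attention.selector import supports_head_size

FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"

if supports_head_size(FLASH_ATTN_V1, head_size=80):
    print("FlashAttention handles head_size=80")
else:
    print("head_size=80 not supported; FlexAttention accepts any head size")
```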

View File

@ -2319,7 +2319,7 @@ class SchedulerConfig:
if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
logger.warning(
"max_num_batched_tokens (%d) exceeds max_num_seqs"
"max_num_batched_tokens (%d) exceeds max_num_seqs "
"* max_model_len (%d). This may lead to unexpected behavior.",
self.max_num_batched_tokens,
self.max_num_seqs * self.max_model_len)

View File

@ -1393,13 +1393,6 @@ class EngineArgs:
recommend_to_remove=False)
return False
# Only Fp16 and Bf16 dtypes since we only support FA.
V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
if model_config.dtype not in V1_SUPPORTED_DTYPES:
_raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
recommend_to_remove=False)
return False
# No Mamba or Encoder-Decoder so far.
if not model_config.is_v1_compatible:
_raise_or_fallback(feature_name=model_config.architectures,

View File

@ -28,7 +28,8 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam)
from openai.types.chat.chat_completion_content_part_input_audio_param import (
InputAudio)
from pydantic import TypeAdapter
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
ProcessorMixin)
@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
"""The type of the content part."""
class PILImage(BaseModel):
"""
A PIL.Image.Image object.
"""
image_pil: Image.Image
model_config = ConfigDict(arbitrary_types_allowed=True)
class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a PIL image.
Example:
{
"image_pil": ImageAsset('cherry_blossom').pil_image
}
"""
image_pil: Required[PILImage]
class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
"""A simpler version of the param that only accepts a plain image_url.
This is supported by OpenAI API, although it is not documented.
@ -129,6 +149,7 @@ ChatCompletionContentPartParam: TypeAlias = Union[
OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
ChatCompletionContentPartInputAudioParam,
ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
CustomChatCompletionContentPILImageParam,
CustomChatCompletionContentSimpleImageParam,
ChatCompletionContentPartImageEmbedsParam,
CustomChatCompletionContentSimpleAudioParam,
@ -631,6 +652,10 @@ class BaseMultiModalContentParser(ABC):
image_embeds: Union[str, dict[str, str]]) -> None:
raise NotImplementedError
@abstractmethod
def parse_image_pil(self, image_pil: Image.Image) -> None:
raise NotImplementedError
@abstractmethod
def parse_audio(self, audio_url: str) -> None:
raise NotImplementedError
@ -677,6 +702,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
placeholder = self._tracker.add("image", image_pil)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio = self._connector.fetch_audio(audio_url)
@ -733,6 +762,13 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
placeholder = self._tracker.add("image_embeds", future)
self._add_placeholder(placeholder)
def parse_image_pil(self, image_pil: Image.Image) -> None:
future: asyncio.Future[Image.Image] = asyncio.Future()
future.set_result(image_pil)
placeholder = self._tracker.add("image", future)
self._add_placeholder(placeholder)
def parse_audio(self, audio_url: str) -> None:
audio_coro = self._connector.fetch_audio_async(audio_url)
@ -851,12 +887,13 @@ _TextParser = partial(cast, ChatCompletionContentPartTextParam)
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
# Need to validate url objects
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
# Define a mapping from part types to their corresponding parsing functions.
MM_PARSER_MAP: dict[
@ -869,6 +906,7 @@ MM_PARSER_MAP: dict[
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
"image_embeds":
lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
"image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
"audio_url":
lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
"input_audio":
@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
"image_embeds",
"image_embeds", "image_pil",
"audio_url", "input_audio", "video_url")
@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
else:
return str_content
if part_type == "image_pil":
image_content = cast(Image.Image, content)
mm_parser.parse_image_pil(image_content)
return {'type': 'image'} if wrap_dicts else None
if part_type == "image_url":
str_content = cast(str, content)
mm_parser.parse_image(str_content)

View File

@ -1204,7 +1204,7 @@ class LLM:
input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
pooling_params = PoolingParams()
pooling_params = PoolingParams(use_cross_encoder=True)
tokenization_kwargs: dict[str, Any] = {}
_validate_truncation_size(self.llm_engine.model_config.max_model_len,

View File

@ -910,6 +910,8 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
}
if envs.VLLM_SERVER_DEV_MODE:
logger.warning("SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!")
@router.get("/server_info")
async def show_server_info(raw_request: Request):

View File

@ -1156,8 +1156,9 @@ class ScoreRequest(OpenAIBaseModel):
# --8<-- [end:score-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankRequest(OpenAIBaseModel):
@ -1182,8 +1183,9 @@ class RerankRequest(OpenAIBaseModel):
# --8<-- [end:rerank-extra-params]
def to_pooling_params(self):
return PoolingParams(additional_data=self.additional_data)
def to_pooling_params(self, *, use_cross_encoder: bool = False):
return PoolingParams(use_cross_encoder=use_cross_encoder,
additional_data=self.additional_data)
class RerankDocument(BaseModel):

View File

@ -25,9 +25,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerFast)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import make_async, merge_async_iterators
logger = init_logger(__name__)
@ -50,7 +48,7 @@ class ServingScores(OpenAIServing):
async def _embedding_score(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -141,7 +139,7 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score(
self,
tokenizer: Union[AnyTokenizer],
tokenizer: AnyTokenizer,
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
@ -190,7 +188,7 @@ class ServingScores(OpenAIServing):
# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params()
pooling_params = request.to_pooling_params(use_cross_encoder=True)
for i, engine_prompt in enumerate(engine_prompts):
request_id_item = f"{request_id}-{i}"

View File

@ -6,6 +6,7 @@ from typing import Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -15,7 +16,6 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser, ToolParserManager)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
logger = init_logger(__name__)
@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser):
DeltaToolCall(
index=self.current_tool_id,
type="function",
id=f"chatcmpl-tool-{random_uuid()}",
id=random_tool_call_id(),
function=DeltaFunctionCall(
name=function_name).model_dump(
exclude_none=True),

View File

@ -3,10 +3,11 @@
# ruff: noqa
import json
from collections.abc import Sequence
from typing import Any, Dict, List, Optional, Union
from typing import Any, Optional, Union
import regex as re
from vllm.entrypoints.chat_utils import random_tool_call_id
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaFunctionCall, DeltaMessage,
DeltaToolCall,
@ -225,7 +226,7 @@ class xLAMToolParser(ToolParser):
function_name = name_match.group(1)
# The test expects us to send just the name first
tool_id = f"chatcmpl-tool-{random_uuid()}"
tool_id = random_tool_call_id()
delta = DeltaMessage(tool_calls=[
DeltaToolCall(
index=0,

View File

@ -0,0 +1,147 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -322,7 +322,7 @@ def cutlass_moe_fp8(
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
per_act_token: bool,
per_act_token: Optional[bool] = None,
activation: str = "silu",
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
@ -366,6 +366,9 @@ def cutlass_moe_fp8(
Returns:
- torch.Tensor: The fp16 output tensor after applying the MoE layer.
"""
if per_act_token is None:
per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
a2_scale.numel() != 1 if a2_scale is not None else False)
per_out_ch = w1_scale.numel() != w1_q.size(0)
num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(
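
The change above makes `per_act_token` optional and infers it from the activation scales when the caller omits it. A small standalone sketch of that inference (dummy tensors, not the real kernel inputs):

```python
# Per-token scales have one entry per token (numel != 1); per-tensor scales
# have a single element, so per_act_token defaults to False in that case.
import torch

def infer_per_act_token(per_act_token=None, a1_scale=None, a2_scale=None):
    if per_act_token is None:
        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
            a2_scale.numel() != 1 if a2_scale is not None else False)
    return per_act_token

assert infer_per_act_token(a1_scale=torch.ones(128, 1)) is True
assert infer_per_act_token(a1_scale=torch.ones(1)) is False
assert infer_per_act_token() is False
```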

View File

@ -1320,8 +1320,13 @@ class FusedMoE(torch.nn.Module):
def forward(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
return torch.ops.vllm.moe_forward(hidden_states, router_logits,
self.layer_name)
# TODO: Once the OOM issue for the TPU backend is resolved, we will
# switch to using the moe_forward custom op.
if current_platform.is_tpu():
return self.forward_impl(hidden_states, router_logits)
else:
return torch.ops.vllm.moe_forward(hidden_states, router_logits,
self.layer_name)
def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
full_router_logits: torch.Tensor):

View File

@ -15,6 +15,7 @@ from vllm.model_executor.pooling_metadata import ( # noqa: E501
from vllm.model_executor.pooling_metadata import PoolingTensors
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
from vllm.transformers_utils.config import (
get_classification_activation_function,
get_cross_encoder_activation_function)
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
@ -388,15 +389,14 @@ class ClassifierPooler(nn.Module):
self.classifier = classifier
self.pooler = pooler
if config.task == "score":
self.default_activation_function = \
get_cross_encoder_activation_function(config.hf_config)
elif config.task == "classify":
self.default_activation_function = nn.Sigmoid() \
if config.hf_config.num_labels == 1 else nn.Softmax()
else:
raise NotImplementedError(f"task={config.task!r} is not supported"
" with the classification pooler")
self.classification_act_fn = get_classification_activation_function(
config.hf_config)
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
config.hf_config)
def _get_act_fn(self, use_cross_encoder: bool):
return (self.cross_encoder_act_fn
if use_cross_encoder else self.classification_act_fn)
def get_prompt_lens(
self,
@ -446,8 +446,28 @@ class ClassifierPooler(nn.Module):
# apply classifier once on the full batch if possible
pooled_output = self.classifier(pooled_output)
# shape: (batch_size, num_labels)
scores = self.default_activation_function(pooled_output)
if isinstance(pooling_metadata, V0PoolingMetadata):
use_cross_encoder_list = [
pooling_param.use_cross_encoder
for _, pooling_param in pooling_metadata.seq_groups
]
else:
use_cross_encoder_list = [
pooling_param.use_cross_encoder
for pooling_param in pooling_metadata.pooling_params
]
# shape of scores: (batch_size, num_labels)
if all(use_cross_encoder == use_cross_encoder_list[0]
for use_cross_encoder in use_cross_encoder_list):
act_fn = self._get_act_fn(use_cross_encoder_list[0])
scores = act_fn(pooled_output)
else:
scores = torch.stack([
self._get_act_fn(use_cross_encoder)(vecs)
for use_cross_encoder, vecs in zip(use_cross_encoder_list,
pooled_output)
])
pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
return PoolerOutput(outputs=pooled_outputs)
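
A hedged, self-contained sketch of the activation dispatch the hunk above introduces: score (cross-encoder) requests and plain classification requests in the same batch pick different activation functions, applied in one batched call when the flags agree and row by row otherwise. The activations below are placeholders for what `get_cross_encoder_activation_function` / `get_classification_activation_function` would return:

```python
import torch
import torch.nn as nn

cross_encoder_act = nn.Sigmoid()         # stand-in for the sentence-transformers fn
classification_act = nn.Softmax(dim=-1)  # stand-in for the classification fn

def apply_acts(pooled: torch.Tensor, use_cross_encoder: list[bool]) -> torch.Tensor:
    if all(flag == use_cross_encoder[0] for flag in use_cross_encoder):
        act = cross_encoder_act if use_cross_encoder[0] else classification_act
        return act(pooled)  # one batched call when all requests agree
    return torch.stack([
        (cross_encoder_act if flag else classification_act)(row)
        for flag, row in zip(use_cross_encoder, pooled)
    ])

scores = apply_acts(torch.randn(4, 2), [True, True, False, True])
print(scores.shape)  # torch.Size([4, 2])
```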

View File

@ -368,6 +368,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
"weights")
self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
"input_activations")
self.topk_indices_dtype = None
per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR
and self.input_quant.strategy
@ -738,6 +739,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp8)
self.topk_indices_dtype = None
self.fused_experts = cutlass_moe_fp8 # type: ignore
self.disable_expert_map = False

View File

@ -104,8 +104,12 @@ class TensorizerLoader(BaseModelLoader):
if is_vllm_tensorized(self.tensorizer_config):
tensorizer_config = self._patch_tensorizer_config(model_config)
model = init_tensorizer_model(tensorizer_config=tensorizer_config,
vllm_config=vllm_config)
device_config = vllm_config.device_config
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = init_tensorizer_model(
tensorizer_config=tensorizer_config,
vllm_config=vllm_config)
self.load_weights(model, model_config)
return model
return self._load_model_serialized_cpu(vllm_config=vllm_config)
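
The loader change above wraps model construction in dtype and device contexts so the tensorized model is materialized with the intended precision and placement. A hedged illustration of that pattern with a toy module (the context manager mirrors the name of vLLM's helper but is reimplemented here):

```python
import torch
from contextlib import contextmanager

@contextmanager
def set_default_torch_dtype(dtype):
    # simplified stand-in for vLLM's helper of the same name
    old = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(old)

with set_default_torch_dtype(torch.float16):
    with torch.device("cpu"):  # device_config.device in the real loader
        layer = torch.nn.Linear(8, 8)

assert layer.weight.dtype == torch.float16
```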

View File

@ -25,8 +25,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import (
get_cross_encoder_activation_function)
from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only
from .utils import WeightsMapper, maybe_prefix
@ -462,9 +460,6 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
super().__init__()
config = vllm_config.model_config.hf_config
self.default_activation_function = \
get_cross_encoder_activation_function(config)
self.num_labels = config.num_labels
self.bert = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),

View File

@ -18,8 +18,6 @@ from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel
from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import (
get_cross_encoder_activation_function)
from .bert_with_rope import BertWithRope, JinaRobertaModel
from .interfaces import SupportsCrossEncoding, SupportsV0Only
@ -178,9 +176,6 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
super().__init__()
config = vllm_config.model_config.hf_config
self.default_activation_function = \
get_cross_encoder_activation_function(config)
self.num_labels = config.num_labels
self.roberta = BertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "bert"),

View File

@ -48,9 +48,10 @@ class MediaConnector:
media_io_kwargs: Additional args passed to process media
inputs, keyed by modalities. For example,
to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`
`--media-io-kwargs '{"video":{"num_frames":40}}'`
connection: HTTP connection client to download media contents.
allowed_local_media_path: A local directory to load media files from.
allowed_local_media_path: A local directory to load media files
from.
"""
super().__init__()
@ -488,4 +489,4 @@ def fetch_video(
"video": video_io_kwargs
}
media_connector = MediaConnector(media_io_kwargs=media_io_kwargs)
return media_connector.fetch_video(video_url)
return media_connector.fetch_video(video_url)

View File

@ -234,31 +234,44 @@ class CudaPlatformBase(Platform):
return ("vllm.attention.backends."
"flashmla.FlashMLABackend")
if use_v1:
FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501
FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501
FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501
if selected_backend == _Backend.FLASHINFER:
logger.info_once("Using FlashInfer backend on V1 engine.")
return "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
return FLASHINFER_V1
elif selected_backend == _Backend.FLEX_ATTENTION:
logger.info("Using FlexAttenion backend on V1 engine.")
return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501
logger.info_once("Using FlexAttention backend on V1 engine.")
return FLEX_ATTENTION_V1
elif selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
logger.info_once("Using Triton backend on V1 engine.")
return ("vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend")
return TRITON_ATTN_VLLM_V1
elif selected_backend == _Backend.FLASH_ATTN:
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend")
return FLASH_ATTN_V1
from vllm.attention.selector import supports_head_size
# Default backends for V1 engine
# FP32 is only supported by FlexAttention
if dtype not in (torch.float16, torch.bfloat16):
logger.info_once(
"Using FlexAttention backend for %s on V1 engine.",
dtype,
)
return FLEX_ATTENTION_V1
# Prefer FlashInfer for Blackwell GPUs if installed
if cls.is_device_capability(100):
if cls.is_device_capability(100) and \
supports_head_size(FLASHINFER_V1, head_size):
try:
import flashinfer # noqa: F401
logger.info_once(
"Using FlashInfer backend on V1 engine by default for "
"Blackwell (SM 10.0) GPUs.")
return ("vllm.v1.attention.backends."
"flashinfer.FlashInferBackend")
return FLASHINFER_V1
except ImportError:
logger.info_once(
"FlashInfer failed to import for V1 engine on "
@ -266,10 +279,13 @@ class CudaPlatformBase(Platform):
"install FlashInfer for better performance.")
pass
# FlashAttention is the default for SM 8.0+ GPUs
if cls.has_device_capability(80):
if cls.has_device_capability(80) and \
supports_head_size(FLASH_ATTN_V1, head_size):
logger.info_once("Using Flash Attention backend on V1 engine.")
return ("vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend")
return FLASH_ATTN_V1
logger.info_once("Using FlexAttention backend on V1 engine.")
return FLEX_ATTENTION_V1
# Backends for V0 engine
if selected_backend == _Backend.FLASHINFER:
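
Summarizing the V1 defaults after this hunk, as a hedged standalone sketch (not the real `get_attn_backend_cls`): non-fp16/bf16 dtypes go straight to FlexAttention, Blackwell prefers FlashInfer when it is installed and supports the head size, SM 8.0+ uses FlashAttention when the head size fits, and everything else falls back to FlexAttention:

```python
import torch

def pick_v1_backend(dtype, is_blackwell, has_flashinfer,
                    fi_ok_head, fa_ok_head, sm80_plus=True):
    # simplified decision order mirroring the diff above
    if dtype not in (torch.float16, torch.bfloat16):
        return "FLEX_ATTENTION"            # fp32 path
    if is_blackwell and has_flashinfer and fi_ok_head:
        return "FLASHINFER"
    if sm80_plus and fa_ok_head:
        return "FLASH_ATTN"
    return "FLEX_ATTENTION"                # e.g. unsupported head sizes

assert pick_v1_backend(torch.float32, False, False, True, True) == "FLEX_ATTENTION"
assert pick_v1_backend(torch.bfloat16, True, True, True, True) == "FLASHINFER"
assert pick_v1_backend(torch.float16, False, False, False, False) == "FLEX_ATTENTION"
```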

View File

@ -24,12 +24,14 @@ class PoolingParams(
"""
dimensions: Optional[int] = None
use_cross_encoder: bool = False
additional_data: Optional[Any] = None
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
return PoolingParams(dimensions=self.dimensions,
use_cross_encoder=self.use_cross_encoder,
additional_data=self.additional_data)
def verify(self, model_config: "ModelConfig") -> None:
@ -54,6 +56,7 @@ class PoolingParams(
def __repr__(self) -> str:
return (f"PoolingParams("
f"dimensions={self.dimensions}, "
f"use_cross_encoder={self.use_cross_encoder}, "
f"additional_metadata={self.additional_data})")
def __post_init__(self) -> None:

View File

@ -866,24 +866,26 @@ def try_get_generation_config(
return None
def get_classification_activation_function(config: PretrainedConfig):
return nn.Sigmoid() if config.num_labels == 1 else nn.Softmax()
def get_cross_encoder_activation_function(config: PretrainedConfig):
function_name: Optional[str] = None
if hasattr(config, "sentence_transformers") and "activation_fn" in \
config.sentence_transformers:
if (hasattr(config, "sentence_transformers")
and "activation_fn" in config.sentence_transformers):
function_name = config.sentence_transformers["activation_fn"]
elif (hasattr(config, "sbert_ce_default_activation_function")
and config.sbert_ce_default_activation_function is not None):
function_name = config.sbert_ce_default_activation_function
if function_name is not None:
assert function_name.startswith("torch.nn.modules."), \
"Loading of activation functions is restricted to " \
"torch.nn.modules for security reasons"
assert function_name.startswith("torch.nn.modules."), (
"Loading of activation functions is restricted to "
"torch.nn.modules for security reasons")
return resolve_obj_by_qualname(function_name)()
else:
return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
return nn.Sigmoid() if config.num_labels == 1 else nn.Identity()
def try_get_safetensors_metadata(

View File

@ -20,9 +20,12 @@ import torch
import vllm.envs as envs
from vllm.connections import global_http_connection
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
_config_home = envs.VLLM_CONFIG_ROOT
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
@ -183,7 +186,7 @@ class UsageMessage:
self.gpu_memory_per_device = (
torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
except Exception:
pass
logger.exception("Failed to collect TPU information")
self.provider = _detect_cloud_provider()
self.architecture = platform.machine()
self.platform = platform.platform()

View File

@ -3,7 +3,8 @@
import numpy as np
import torch
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata)
from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl,
TorchSDPAMetadata)
from vllm.attention.backends.utils import CommonAttentionState
@ -17,9 +18,24 @@ from vllm.v1.worker.cpu_model_runner import CPUModelRunner
from vllm.v1.worker.gpu_input_batch import InputBatch
class TorchSDPABackend:
class TorchSDPABackend(AttentionBackend):
accept_output_buffer: bool = False
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return PagedAttention.get_supported_head_sizes()
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "TORCH_SDPA_VLLM_V1"

View File

@ -44,10 +44,21 @@ class FlashAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASH_ATTN_VLLM_V1"
@ -416,12 +427,7 @@ class FlashAttentionImpl(AttentionImpl):
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = FlashAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by FlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
FlashAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@ -38,10 +38,22 @@ class FlashInferBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
# https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
return [64, 128, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASHINFER_VLLM_V1"
@ -207,14 +219,8 @@ class FlashInferMetadata:
return self.qo_indptr
def __post_init__(self):
# Refer to
# https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157
supported_head_sizes = FlashInferBackend.get_supported_head_sizes()
if self.head_dim is not None and self.head_dim \
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f" received {self.head_dim}.")
if self.head_dim is not None:
FlashInferBackend.validate_head_size(self.head_dim)
class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
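
The metadata dataclasses reuse the same hook from __post_init__ (here for FlashInferMetadata, and for MLACommonMetadata further down), guarding the optional head_dim instead of rebuilding the error string. A small sketch of that guard-and-delegate shape, with hypothetical names:

# Hypothetical sketch of the __post_init__ delegation shown above.
from dataclasses import dataclass
from typing import Optional


class ExampleBackend:

    @classmethod
    def validate_head_size(cls, head_size: int) -> None:
        if head_size not in (64, 128, 256):
            raise ValueError(f"Head size {head_size} is not supported.")


@dataclass
class ExampleMetadata:
    head_dim: Optional[int] = None

    def __post_init__(self):
        # head_dim may legitimately be unset; only validate when present.
        if self.head_dim is not None:
            ExampleBackend.validate_head_size(self.head_dim)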

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with FlashAttention."""
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
@@ -21,9 +21,6 @@ from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
from vllm.v1.kv_cache_interface import AttentionSpec
from vllm.v1.worker.block_table import BlockTable
if current_platform.is_cuda():
pass
logger = init_logger(__name__)
if TYPE_CHECKING:
@@ -45,9 +42,9 @@ def _offsets_to_doc_ids_tensor(offsets: torch.Tensor) -> torch.Tensor:
class FlexAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
return [16, 32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
return # FlexAttention supports any head size
@staticmethod
def get_name() -> str:
@@ -384,12 +381,8 @@ class FlexAttentionImpl(AttentionImpl):
raise NotImplementedError(
"FlexAttention does not support kv sharing yet.")
support_head_sizes = FlexAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by FlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
FlexAttentionBackend.validate_head_size(head_size)
if is_quantized_kv_cache(self.kv_cache_dtype):
raise NotImplementedError(
"FlexAttention does not support quantized kv-cache. Yet")
@@ -463,6 +456,21 @@ class FlexAttentionImpl(AttentionImpl):
query = query[:, :, :num_actual_tokens, :]
# Doesn't work for now -> constraint violation
# torch._dynamo.try_mark_dynamic(query, 2)
# default M=64, N=64 may run out of shared memory on some GPUs
# TODO: Explicit configs for each GPU?
# Not sure how to calculate the shared memory requirement
extra_kernel_options = defaultdict[str, int](lambda: 64)
if query.dtype == torch.float32:
extra_kernel_options["BLOCK_M"] //= 2
extra_kernel_options["BLOCK_N"] //= 2
if current_platform.is_cuda():
device_props = torch.cuda.get_device_properties()
max_shared_memory = device_props.shared_memory_per_block_optin
if max_shared_memory < 144 * 1024:
extra_kernel_options["BLOCK_M"] //= 2
extra_kernel_options["BLOCK_N"] //= 2
out = flex_attention_compiled(
query,
key_cache,
@@ -471,7 +479,10 @@ class FlexAttentionImpl(AttentionImpl):
attn_metadata.block_mask,
self.scale,
enable_gqa=enable_gqa,
kernel_options={"FORCE_USE_FLEX_ATTENTION": True},
kernel_options={
"FORCE_USE_FLEX_ATTENTION": True,
**extra_kernel_options
},
)
# Flex doesn't have an out variant today, rely on epilogue fusion
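
The second FlexAttention hunk deserves a note: flex_attention's default 64x64 tiles can overflow shared memory, so the change halves BLOCK_M and BLOCK_N for fp32 queries and halves them again on CUDA devices with less than 144 KiB of opt-in shared memory per block, then merges the result into kernel_options. A standalone approximation is below; the helper name and the explicit current-device lookup are assumptions, not vLLM code.

# Approximate sketch of the tile-size heuristic above; the function name and
# the explicit current-device argument are assumptions for illustration.
from collections import defaultdict

import torch


def flex_attention_kernel_options(query: torch.Tensor) -> dict[str, int]:
    # Start from flex_attention's default tile size of 64 for every option.
    options = defaultdict[str, int](lambda: 64)
    if query.dtype == torch.float32:
        # fp32 tiles need roughly twice the shared memory of fp16/bf16.
        options["BLOCK_M"] //= 2
        options["BLOCK_N"] //= 2
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(torch.cuda.current_device())
        # 144 KiB is the threshold used in the diff; smaller budgets get
        # another halving.
        if props.shared_memory_per_block_optin < 144 * 1024:
            options["BLOCK_M"] //= 2
            options["BLOCK_N"] //= 2
    return {"FORCE_USE_FLEX_ATTENTION": True, **options}

As the in-line TODO admits, this is a halving heuristic rather than a real shared-memory model, so explicit per-GPU configs remain future work.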

View File

@@ -254,10 +254,21 @@ class MLACommonBackend(AttentionBackend):
) -> tuple[int, ...]:
return (num_blocks, block_size, head_size)
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [576]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@dataclass
class MLACommonPrefillMetadata:
@@ -320,12 +331,8 @@ class MLACommonMetadata(Generic[D]):
prefill: Optional[MLACommonPrefillMetadata] = None
def __post_init__(self):
supported_head_sizes = MLACommonBackend.get_supported_head_sizes()
if self.head_dim is not None and self.head_dim \
not in supported_head_sizes:
raise ValueError(
f"Only {supported_head_sizes} are supported for head_dim,",
f"received {self.head_dim}.")
if self.head_dim is not None:
MLACommonBackend.validate_head_size(self.head_dim)
M = TypeVar("M", bound=MLACommonMetadata)

View File

@@ -314,10 +314,21 @@ class AiterFlashAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "FLASH_ATTN_VLLM_V1"
@@ -428,14 +439,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = \
AiterFlashAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by "
"AiterFlashAttention. "
f"Supported head sizes are: {support_head_sizes}. "
"Set VLLM_USE_V1=0 to use another attention backend.")
AiterFlashAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@@ -190,10 +190,21 @@ class TritonAttentionBackend(AttentionBackend):
accept_output_buffer: bool = True
@staticmethod
def get_supported_head_sizes() -> list[int]:
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
return [32, 64, 96, 128, 160, 192, 224, 256]
@classmethod
def validate_head_size(cls, head_size: int) -> None:
supported_head_sizes = cls.get_supported_head_sizes()
if head_size not in supported_head_sizes:
attn_type = cls.__name__.removesuffix("Backend")
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {supported_head_sizes}. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes.")
@staticmethod
def get_name() -> str:
return "TRITON_ATTN_VLLM_V1"
@@ -268,11 +279,7 @@ class TritonAttentionImpl(AttentionImpl):
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
support_head_sizes = TritonAttentionBackend.get_supported_head_sizes()
if head_size not in support_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by TritonAttention. "
f"Supported head sizes are: {support_head_sizes}.")
TritonAttentionBackend.validate_head_size(head_size)
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@@ -101,7 +101,10 @@ class TopKTopPSampler(nn.Module):
"per-request generators. Falling back to "
"PyTorch-native implementation.")
return self.forward_native(logits, generators, k, p)
return flashinfer_sample(logits, k, p, generators)
# flashinfer sampling functions expect contiguous logits.
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
# because of the slicing operation in logits_processor.
return flashinfer_sample(logits.contiguous(), k, p, generators)
def forward_tpu(
self,
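
The one-line sampler change above is easy to miss: with the FlexAttention or Triton backends in fp32, the logits processor can hand the sampler a sliced view, and the flashinfer sampling kernels expect contiguous memory, so the tensor is made contiguous before the call. A toy illustration of why a column slice stops being contiguous; the shapes and the slice are invented for the demo.

# Toy demo of the non-contiguity fixed above; shapes are made up.
import torch

logits = torch.randn(4, 32000)               # [num_seqs, vocab_size]
sliced = logits[:, :16000]                   # a view over part of the vocab
print(sliced.is_contiguous())                # False: row stride is still 32000
print(sliced.contiguous().is_contiguous())   # True: copied into dense memory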

View File

@@ -12,8 +12,8 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models import supports_multimodal
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata,
FlashAttentionMetadata)
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.spec_decode.utils import prepare_eagle_input_kernel

View File

@@ -528,19 +528,20 @@ class GPUModelRunner(LoRAModelRunnerMixin):
start_token_index:end_token_index] = new_token_ids
self.input_batch.num_tokens_no_spec[
req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(
req_id, ()))
if spec_token_ids:
start_index = end_token_index
end_token_index += len(spec_token_ids)
self.input_batch.token_ids_cpu[
req_index,
start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec tokens.
self.input_batch.num_tokens[req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if spec_token_ids:
num_spec_tokens = len(spec_token_ids)
start_index = self.input_batch.num_tokens_no_spec[req_index]
end_token_index = start_index + num_spec_tokens
self.input_batch.token_ids_cpu[
req_index, start_index:end_token_index] = spec_token_ids
# NOTE(woosuk): `num_tokens` here may include spec tokens.
self.input_batch.num_tokens[req_index] += num_spec_tokens
# Add the new or resumed requests to the persistent batch.
# The smaller empty indices are filled first.
for req_id in req_ids_to_add:
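
The model-runner hunk above is the spec-decode fix: the spec-token write now takes its start offset from num_tokens_no_spec and increments num_tokens by the number of spec tokens, rather than reusing an end index computed inside the preceding new-token branch. A toy reproduction of the corrected indexing follows; the array sizes and token values are invented, and only the offset arithmetic mirrors the diff.

# Toy reproduction of the fixed spec-token indexing; shapes and values are
# illustrative only.
import numpy as np

max_model_len = 16
token_ids_cpu = np.zeros((1, max_model_len), dtype=np.int64)
num_tokens = np.zeros(1, dtype=np.int64)
num_tokens_no_spec = np.zeros(1, dtype=np.int64)

req_index = 0
new_token_ids = [101, 102, 103]   # verified tokens written first
spec_token_ids = [7, 8]           # draft tokens scheduled on top of them

# Write the verified tokens and record the spec-free length.
end_token_index = len(new_token_ids)
token_ids_cpu[req_index, :end_token_index] = new_token_ids
num_tokens_no_spec[req_index] = end_token_index
num_tokens[req_index] = end_token_index

# Fixed behaviour: append spec tokens right after the spec-free prefix and
# grow num_tokens by the spec count instead of overwriting it.
if spec_token_ids:
    num_spec_tokens = len(spec_token_ids)
    start_index = num_tokens_no_spec[req_index]
    end_token_index = start_index + num_spec_tokens
    token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids
    num_tokens[req_index] += num_spec_tokens

assert token_ids_cpu[req_index, :num_tokens[req_index]].tolist() == [
    101, 102, 103, 7, 8
]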