Compare commits

1 Commit

Author: Robert Shaw <robshaw@redhat.com>
SHA1: b6381ced9c
Message: updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Date: 2025-07-15 13:50:42 +00:00
118 changed files with 1189 additions and 3200 deletions

View File

@ -6,17 +6,19 @@ set -exuo pipefail
# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
FROM 1.22-413-pt2.7.1:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements/hpu.txt
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

View File

@ -645,7 +645,7 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s models/test_initialization.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py

View File

@ -1,6 +0,0 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

View File

@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.34.0
rev: v1.32.0
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort

View File

@ -30,11 +30,17 @@ from datasets import load_dataset
from PIL import Image
from transformers import PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
try:
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
except:
MultiModalDataDict = None
AnyTokenizer = None
LoRARequest = None
print("Install vLLM to use LoRA or Multimodal benchmarking.")
logger = logging.getLogger(__name__)

View File

@ -24,7 +24,6 @@
#include "attention_dtypes.h"
#include "attention_utils.cuh"
#include "cuda_compat.h"
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@ -34,6 +33,12 @@ typedef __hip_bfloat16 __nv_bfloat16;
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@ -665,6 +670,7 @@ __global__ void paged_attention_v2_reduce_kernel(
} // namespace vllm
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -18,7 +18,6 @@ limitations under the License.
* Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
* by Alcanderian JieXin Liang
*/
#include "core/registration.h"
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
@ -271,13 +270,4 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
}
#endif
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
}
// clang-format on

View File

@ -18,7 +18,12 @@
*/
#include "attention_kernels.cuh"
#include "cuda_compat.h"
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -182,6 +187,7 @@ void paged_attention_v1(
CALL_V1_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -18,7 +18,12 @@
*/
#include "attention_kernels.cuh"
#include "cuda_compat.h"
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -192,6 +197,7 @@ void paged_attention_v2(
CALL_V2_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP

View File

@ -58,7 +58,7 @@ namespace {
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
#define CHECK_INPUT(x) \
CHECK_CPU(x); \

View File

@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
int64_t topk,
int64_t num_tokens_post_pad);
// shared expert implementation for int8 w8a8
// shared expert implememntation for int8 w8a8
template <typename scalar_t>
void shared_expert_int8_kernel_impl(
scalar_t* __restrict__ output,

View File

@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
__m512 vd0;
__m512 vd1[COLS];
// oops! 4x4 spills but luckily we use 4x2
// oops! 4x4 spills but luckly we use 4x2
__m512 vbias[COLS];
// [NOTE]: s8s8 igemm compensation in avx512-vnni

View File

@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
#define CVT_FP16_TO_FP32(a) \
_mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
// this doesn't handle NaN.
// this doesn't hanel NaN.
inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);

View File

@ -4,10 +4,10 @@
#include <hip/hip_runtime.h>
#endif
#if defined(USE_ROCM) && defined(__GFX9__)
#define WARP_SIZE 64
#else
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#ifndef USE_ROCM

View File

@ -167,6 +167,19 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);
void sm100_cutlass_mla_decode(
torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens, torch::Tensor const& page_table,
torch::Tensor const& workspace, double sm_scale,
int64_t num_kv_splits =
1 /* Set to 1 to avoid cuda_graph issue by default. */);
int64_t sm100_cutlass_mla_get_workspace_size(
int64_t max_seq_len, int64_t num_batches, int64_t sm_count = 0,
int64_t num_kv_splits =
1 /* Set to 1 to avoid cuda_graph issue by default. */);
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
#ifndef USE_ROCM

View File

@ -521,14 +521,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor page_table, Tensor workspace, float "
"scale,"
" int num_kv_splits) -> ()");
// conditionally compiled so impl in source file
ops.impl("sm100_cutlass_mla_decode", torch::kCUDA, &sm100_cutlass_mla_decode);
// SM100 CUTLASS MLA workspace
ops.def(
"sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
" int sm_count, int num_kv_splits) "
"-> int");
// conditionally compiled so impl in source file
ops.impl("sm100_cutlass_mla_get_workspace_size",
&sm100_cutlass_mla_get_workspace_size);
// Compute NVFP4 block quantized tensor.
ops.def(

View File

@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
# Flag enables built-in KV-connector dependency libs into docker images
# Flag enables build-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE ####################
@ -207,19 +207,6 @@ ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi
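The TODO above points at a general Python pitfall rather than anything Dockerfile-specific: any non-empty environment variable string, including "0", is truthy. A minimal sketch of that distinction; the stricter check at the end is an illustrative assumption, not vLLM's actual setup.py logic:

```python
import os

# Any non-empty string is truthy in Python, so "0" still counts as "set".
flag = os.environ.get("VLLM_USE_PRECOMPILED", "")
print(bool(flag))  # "" -> False, "0" -> True, "1" -> True

# A stricter interpretation (illustrative only):
use_precompiled = flag.strip().lower() in ("1", "true", "yes")
```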
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \

View File

@ -8,6 +8,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.ModelConfig][]
- [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
- [vllm.config.LoadConfig][]
- [vllm.config.ParallelConfig][]
- [vllm.config.SchedulerConfig][]

Binary file not shown (image asset updated: 57 KiB before, 68 KiB after).

View File

@ -1,7 +1,3 @@
---
toc_depth: 4
---
# vLLM CLI Guide
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -46,10 +42,6 @@ Start the vLLM OpenAI Compatible API server.
vllm serve --help=page
```
### Options
--8<-- "docs/argparse/serve.md"
## chat
Generate chat completions via the running API server.

View File

@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
## CLI Arguments
The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
To see the available CLI arguments, run `vllm serve --help`!
## Configuration file

View File

@ -3,15 +3,6 @@
[](){ #deployment-anyscale }
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
## Production-ready vLLM on Anyscale quickstarts
- [Offline batch inference](https://console.anyscale.com/template-preview/llm_batch_inference?utm_source=vllm_docs)
- [Deploy vLLM services](https://console.anyscale.com/template-preview/llm_serving?utm_source=vllm_docs)
- [Curate a dataset](https://console.anyscale.com/template-preview/audio-dataset-curation-llm-judge?utm_source=vllm_docs)
- [Finetune an LLM](https://console.anyscale.com/template-preview/entity-recognition-with-llms?utm_source=vllm_docs)

View File

@ -1,42 +1,26 @@
# Open WebUI
[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
and user-friendly self-hosted AI platform designed to operate entirely offline.
It supports various LLM runners like Ollama and OpenAI-compatible APIs,
with built-in RAG capabilities, making it a powerful AI deployment solution.
1. Install the [Docker](https://docs.docker.com/engine/install/)
To get started with Open WebUI using vLLM, follow these steps:
2. Start the vLLM server with the supported chat completion model, e.g.
1. Install the [Docker](https://docs.docker.com/engine/install/).
```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```
2. Start the vLLM server with a supported chat completion model:
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```console
vllm serve Qwen/Qwen3-0.6B-Chat
```
```bash
docker run -d -p 3000:8080 \
--name open-webui \
-v open-webui:/app/backend/data \
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
--restart always \
ghcr.io/open-webui/open-webui:main
```
!!! note
When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
For example:
1. Open it in the browser: <http://open-webui-host:3000/>
```console
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
```
On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
3. Start the Open WebUI Docker container:
```console
docker run -d \
--name open-webui \
-p 3000:8080 \
-v open-webui:/app/backend/data \
-e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
--restart always \
ghcr.io/open-webui/open-webui:main
```
4. Open it in the browser: <http://open-webui-host:3000/>
At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.
![Web portal of model Qwen/Qwen3-0.6B-Chat](../../assets/deployment/open_webui.png)
![](../../assets/deployment/open_webui.png)
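Before wiring Open WebUI to the server, it can help to confirm that the base URL it will use actually answers OpenAI-style requests. A minimal sketch using the official `openai` client; the host, port, and served model name are assumptions matching the steps above:

```python
from openai import OpenAI

# Same base URL that OPENAI_API_BASE_URL points at in the docker run command above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

print([m.id for m in client.models.list()])  # should list the served model

reply = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B-Chat",  # assumption: the model started with `vllm serve`
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=16,
)
print(reply.choices[0].message.content)
```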

View File

@ -10,7 +10,6 @@ Contents:
- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)
- [GPTQModel](gptqmodel.md)
- [INC](inc.md)
- [INT4 W4A16](int4.md)
- [INT8 W8A8](int8.md)
- [FP8 W8A8](fp8.md)

View File

@ -1,56 +0,0 @@
---
title: FP8 INC
---
[](){ #inc }
vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
Currently, quantization is validated only in Llama models.
Intel Gaudi supports quantization of various modules and functions, including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).
!!! note
Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
!!! note
`QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
## Run Online Inference Using FP8
Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
```bash
export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
```
!!! tip
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
!!! tip
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
`VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in microseconds, e.g., 600000 equals 10 minutes.
## Run Offline Inference Using FP8
To run offline inference (after completing the model calibration process):
* Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
* Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
* Call shutdown method of the model_executor at the end of the run.
```python
from vllm import LLM
llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
...
# Call llm.generate on the required prompts and sampling params.
...
llm.llm_engine.model_executor.shutdown()
```
## Device for the Model's Weights Uploading
The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.

View File

@ -2,19 +2,18 @@
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU |
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | | ✅︎ | ❌ | ❌ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ |
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ |
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | | ❌ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ |
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- ✅︎ indicates that the quantization method is supported on the specified hardware.
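To show how a method from this table is picked in practice, here is a minimal sketch using the offline `LLM` API; the model names are illustrative assumptions, and the checkpoint must actually ship weights in the chosen format (or be quantizable on the fly, as with FP8):

```python
from vllm import LLM

# AWQ: the checkpoint is pre-quantized, so the model id alone is usually enough;
# vLLM detects the quantization config, but it can also be forced explicitly.
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")

# FP8 (W8A8): on supported hardware, dynamic quantization of an
# unquantized checkpoint can be requested directly.
# llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", quantization="fp8")
```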

View File

@ -103,7 +103,9 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m
vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.
However, when `tool_choice='none'` is specified, vLLM includes tool definitions from the prompt.
By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.
Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.
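A minimal sketch of the behavior described above, using the OpenAI-compatible API; the server URL, model name, and tool definition are illustrative assumptions:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # illustrative tool, not part of vLLM
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]

# With tool_choice="none", the model answers in plain text and never emits
# tool calls, even though tools are present in the request.
resp = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumption: any tool-capable served model
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="none",
)
print(resp.choices[0].message.content)
```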
## Automatic Function Calling

View File

@ -28,7 +28,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
pip list | grep neural # verify that neural_compressor_pt is installed
pip list | grep neural # verify that neural_compressor is installed
```
Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
@ -120,13 +120,12 @@ docker run \
- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
for accelerating low-batch latency and throughput
- Attention with Linear Biases (ALiBi)
- INC quantization
### Unsupported features
- Beam search
- LoRA adapters
- AWQ quantization
- Quantization
- Prefill chunking (mixed-batch inferencing)
### Supported configurations

View File

@ -16,7 +16,6 @@ sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.utils import FlexibleArgumentParser # noqa: E402
logger = logging.getLogger("mkdocs")
@ -25,18 +24,15 @@ logger = logging.getLogger("mkdocs")
class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups."""
def __init__(self, prog, starting_heading_level=3):
def __init__(self, prog):
super().__init__(prog,
max_help_position=float('inf'),
width=float('inf'))
self._section_heading_prefix = "#" * starting_heading_level
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
self._markdown_output = []
def start_section(self, heading):
if heading not in {"positional arguments", "options"}:
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
self._markdown_output.append(heading_md)
self._markdown_output.append(f"\n### {heading}\n\n")
def end_section(self):
pass
@ -50,13 +46,9 @@ class MarkdownFormatter(HelpFormatter):
def add_arguments(self, actions):
for action in actions:
if (len(action.option_strings) == 0
or "--help" in action.option_strings):
continue
option_strings = f'`{"`, `".join(action.option_strings)}`'
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
self._markdown_output.append(heading_md)
self._markdown_output.append(f"#### {option_strings}\n\n")
if choices := action.choices:
choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -89,14 +81,6 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
return cls.add_cli_args(parser, **kwargs)
def create_serve_parser() -> FlexibleArgumentParser:
"""Create a parser for the serve command with markdown formatting."""
parser = FlexibleArgumentParser()
parser.formatter_class = lambda prog: MarkdownFormatter(
prog, starting_heading_level=4)
return make_arg_parser(parser)
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
logger.info("Generating argparse documentation")
logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -111,7 +95,6 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"engine_args": create_parser(EngineArgs),
"async_engine_args": create_parser(AsyncEngineArgs,
async_args_only=True),
"serve": create_serve_parser(),
}
# Generate documentation for each parser

View File

@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i
Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require
enforcing eager mode and disabling prefix caching in V1.
Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
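To make the eager-mode and prefix-caching constraints above concrete, a minimal offline-inference sketch for a Mamba-2 based model; the model id is an illustrative assumption:

```python
from vllm import LLM, SamplingParams

# Mamba-2 models currently need eager mode and prefix caching disabled in V1,
# as noted above.
llm = LLM(
    model="mistralai/Mamba-Codestral-7B-v0.1",  # illustrative Mamba-2 checkpoint
    enforce_eager=True,
    enable_prefix_caching=False,
)
out = llm.generate(["def fib(n):"], SamplingParams(max_tokens=32))
print(out[0].outputs[0].text)
```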

View File

@ -10,7 +10,7 @@ on HuggingFace model repository.
import os
from dataclasses import asdict
from typing import Any, NamedTuple, Optional
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
@ -30,9 +30,7 @@ question_per_audio_count = {
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: Optional[str] = None
prompt_token_ids: Optional[dict[str, list[int]]] = None
multi_modal_data: Optional[dict[str, Any]] = None
prompt: str
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
@ -42,60 +40,6 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4.
# Voxtral
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (
AudioChunk,
RawAudio,
TextChunk,
UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
enable_chunked_prefill=False,
)
text_chunk = TextChunk(text=question)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(audio_count)
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
req = ChatCompletionRequest(messages=messages, model=model_name)
tokens = tokenizer.encode_chat_completion(req)
prompt_ids, audios = tokens.tokens, tokens.audios
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
multi_modal_data = {"audio": audios_and_sr}
return ModelRequestData(
engine_args=engine_args,
prompt_token_ids=prompt_ids,
multi_modal_data=multi_modal_data,
)
# Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the setting in this example are somehat different than what is
@ -299,7 +243,6 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = {
"voxtral": run_voxtral,
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
@ -368,24 +311,16 @@ def main(args):
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
)
mm_data = req_data.multi_modal_data
if not mm_data:
mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}
mm_data = {}
if audio_count > 0:
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}
assert args.num_prompts > 0
inputs = {"multi_modal_data": mm_data}
if req_data.prompt:
inputs["prompt"] = req_data.prompt
else:
inputs["prompt_token_ids"] = req_data.prompt_token_ids
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts

View File

@ -84,7 +84,6 @@ def main():
gpu_memory_utilization=0.8,
speculative_config=speculative_config,
disable_log_stats=False,
max_model_len=16384,
)
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)

View File

@ -174,186 +174,3 @@ respect-ignore-files = true
[tool.ty.environment]
python = "./.venv"
[tool.typos.files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[tool.typos.default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
# splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
[tool.typos.default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[tool.typos.type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
fo = "fo"
ba = "ba"
[tool.typos.type.py.extend-words]
[tool.typos.type.cpp]
extend-glob = ["*.cu"]
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cpp.extend-identifiers]
countr_one = "countr_one"
k_ot = "k_ot"
ot = "ot"
[tool.typos.type.cpp.extend-words]
[tool.typos.type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.rust.extend-identifiers]
flate2 = "flate2"
[tool.typos.type.rust.extend-words]
ser = "ser"
[tool.typos.type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.lock.extend-identifiers]
[tool.typos.type.lock.extend-words]
[tool.typos.type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.jl.extend-identifiers]
[tool.typos.type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[tool.typos.type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.go.extend-identifiers]
flate = "flate"
[tool.typos.type.go.extend-words]
[tool.typos.type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.css.extend-identifiers]
nd = "nd"
[tool.typos.type.css.extend-words]
[tool.typos.type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.man.extend-identifiers]
Nd = "Nd"
[tool.typos.type.man.extend-words]
[tool.typos.type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cert.extend-identifiers]
[tool.typos.type.cert.extend-words]
[tool.typos.type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.sh.extend-identifiers]
ot = "ot"
[tool.typos.type.sh.extend-words]
[tool.typos.type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.vimscript.extend-identifiers]
windo = "windo"
[tool.typos.type.vimscript.extend-words]

View File

@ -25,7 +25,7 @@ outlines_core == 0.2.10
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata; python_version < '3.10'
mistral_common[opencv] >= 1.8.0
mistral_common[opencv] >= 1.6.2
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

View File

@ -17,7 +17,6 @@ cloudpickle
fastapi
msgspec
openai
partial-json-parser
pillow
psutil
pybase64

View File

@ -23,7 +23,7 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.8.0 # required for voxtral test
mistral_common[opencv] >= 1.6.2 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test

View File

@ -28,7 +28,7 @@ torchvision==0.22.0
transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.8.0 # required for voxtral test
mistral_common[opencv] >= 1.7.0 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test

View File

@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.8.0
mistral-common==1.7.0
# via -r requirements/test.in
more-itertools==10.5.0
# via lm-eval
@ -518,8 +518,6 @@ pyasn1-modules==0.4.2
# via google-auth
pybind11==2.13.6
# via lm-eval
pycountry==24.6.1
# via pydantic-extra-types
pycparser==2.22
# via cffi
pycryptodomex==3.22.0
@ -530,12 +528,9 @@ pydantic==2.11.5
# datamodel-code-generator
# mistral-common
# mteb
# pydantic-extra-types
# ray
pydantic-core==2.33.2
# via pydantic
pydantic-extra-types==2.10.5
# via mistral-common
pygments==2.18.0
# via rich
pyparsing==3.2.0
@ -840,7 +835,6 @@ typing-extensions==4.12.2
# pqdm
# pydantic
# pydantic-core
# pydantic-extra-types
# torch
# typer
# typing-inspection

View File

@ -692,8 +692,7 @@ setup(
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile",
"mistral_common[audio]"], # Required for audio processing
"audio": ["librosa", "soundfile"], # Required for audio processing
"video": [] # Kept for backwards compatibility
},
cmdclass=cmdclass,

View File

@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture
def api_server(distributed_executor_backend: str):
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
@ -40,6 +40,8 @@ def api_server(distributed_executor_backend: str):
"facebook/opt-125m",
"--host",
"127.0.0.1",
"--tokenizer-pool-size",
str(tokenizer_pool_size),
"--distributed-executor-backend",
distributed_executor_backend,
]
@ -52,8 +54,10 @@ def api_server(distributed_executor_backend: str):
uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, distributed_executor_backend: str):
def test_api_server(api_server, tokenizer_pool_size: int,
distributed_executor_backend: str):
"""
Run the API server and test it.

View File

@ -804,7 +804,7 @@ class VllmRunner:
def get_inputs(
self,
prompts: Union[list[str], list[torch.Tensor], list[int]],
prompts: Union[list[str], list[torch.Tensor]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
@ -826,16 +826,11 @@ class VllmRunner:
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
text_prompt_kwargs: dict[str, Any] = {
text_prompt_kwargs = {
("prompt" if isinstance(prompt, str) else "prompt_embeds"):
prompt,
"multi_modal_data": multi_modal_data or None
}
if isinstance(prompt, str):
text_prompt_kwargs["prompt"] = prompt
elif isinstance(prompt, list):
text_prompt_kwargs["prompt_token_ids"] = prompt
else:
text_prompt_kwargs["prompt_embeds"] = prompt
inputs.append(TextPrompt(**text_prompt_kwargs))
return inputs

View File

@ -14,9 +14,8 @@ from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
from vllm.config import TaskOption
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
@ -159,7 +158,7 @@ TEXT_GENERATION_MODELS = {
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-1.1-2b-it": PPTestSettings.fast(),
@ -211,11 +210,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
load_format="dummy", task="embed"
),
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
}
MULTIMODAL_MODELS = {
@ -251,7 +248,6 @@ TEST_MODELS = [
"meta-llama/Llama-3.2-1B-Instruct",
"ArthurZ/Ilama-3.2-1B",
"ibm/PowerLM-3b",
"deepseek-ai/DeepSeek-V2-Lite-Chat",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
@ -291,11 +287,6 @@ def _compare_tp(
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
hf_config = get_config(model_id, trust_remote_code)
dtype = "float16"
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
dtype = "bfloat16"
if load_format == "dummy":
# Avoid OOM
@ -325,7 +316,7 @@ def _compare_tp(
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
dtype,
"float16",
"--max-model-len",
"2048",
"--max-num-seqs",
@ -347,7 +338,6 @@ def _compare_tp(
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
testing_ray_compiled_graph = False
if distributed_backend == "ray" and (vllm_major_version == "1"
or specific_case):
# For V1, test Ray Compiled Graph for all the tests
@ -361,7 +351,6 @@ def _compare_tp(
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
testing_ray_compiled_graph = True
elif distributed_backend == "mp":
# Both V0/V1 of multiprocessing executor support PP
pp_env = {
@ -405,6 +394,7 @@ def _compare_tp(
tp_env,
method=method)
except Exception:
testing_ray_compiled_graph = pp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from argparse import ArgumentError
from argparse import ArgumentError, ArgumentTypeError
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal, Optional
@ -12,8 +12,8 @@ import pytest
from vllm.config import CompilationConfig, config
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
get_type, get_type_hints, is_not_builtin,
is_type, literal_to_kwargs, optional_type,
parse_type)
is_type, literal_to_kwargs, nullable_kvs,
optional_type, parse_type)
from vllm.utils import FlexibleArgumentParser
@ -25,10 +25,18 @@ from vllm.utils import FlexibleArgumentParser
"foo": 1,
"bar": 2
}),
(json.loads, "foo=1,bar=2", {
"foo": 1,
"bar": 2
}),
])
def test_parse_type(type, value, expected):
parse_type_func = parse_type(type)
assert parse_type_func(value) == expected
context = nullcontext()
if value == "foo=1,bar=2":
context = pytest.warns(DeprecationWarning)
with context:
assert parse_type_func(value) == expected
def test_optional_type():
@ -195,6 +203,34 @@ def test_get_kwargs():
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
@pytest.mark.parametrize(("arg", "expected"), [
(None, dict()),
("image=16", {
"image": 16
}),
("image=16,video=2", {
"image": 16,
"video": 2
}),
("Image=16, Video=2", {
"image": 16,
"video": 2
}),
])
def test_limit_mm_per_prompt_parser(arg, expected):
"""This functionality is deprecated and will be removed in the future.
This argument should be passed as JSON string instead.
TODO: Remove with nullable_kvs."""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
if arg is None:
args = parser.parse_args([])
else:
args = parser.parse_args(["--limit-mm-per-prompt", arg])
assert args.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
("arg", "expected"),
[
@ -290,6 +326,18 @@ def test_prefix_cache_default():
assert not engine_args.enable_prefix_caching
@pytest.mark.parametrize(
("arg"),
[
"image", # Missing =
"image=4,image=5", # Conflicting values
"image=video=4" # Too many = in tokenized arg
])
def test_bad_nullable_kvs(arg):
with pytest.raises(ArgumentTypeError):
nullable_kvs(arg)
# yapf: disable
@pytest.mark.parametrize(("arg", "expected", "option"), [
(None, None, "mm-processor-kwargs"),

View File

@ -176,8 +176,4 @@ async def test_invocations(server: RemoteOpenAIServer):
invocation_output = invocation_response.json()
assert classification_output.keys() == invocation_output.keys()
for classification_data, invocation_data in zip(
classification_output["data"], invocation_output["data"]):
assert classification_data.keys() == invocation_data.keys()
assert classification_data["probs"] == pytest.approx(
invocation_data["probs"], rel=0.01)
assert classification_output["data"] == invocation_output["data"]

View File

@ -14,7 +14,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from ...models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from ...models.utils import check_embeddings_close
from ...utils import RemoteOpenAIServer
MODEL_NAME = "intfloat/multilingual-e5-small"
@ -322,13 +321,7 @@ async def test_invocations(server: RemoteOpenAIServer,
invocation_output = invocation_response.json()
assert completion_output.keys() == invocation_output.keys()
for completion_data, invocation_data in zip(completion_output["data"],
invocation_output["data"]):
assert completion_data.keys() == invocation_data.keys()
check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
embeddings_1_lst=[invocation_data["embedding"]],
name_0="completion",
name_1="invocation")
assert completion_output["data"] == invocation_output["data"]
@pytest.mark.asyncio
@ -362,10 +355,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
invocation_output = invocation_response.json()
assert chat_output.keys() == invocation_output.keys()
for chat_data, invocation_data in zip(chat_output["data"],
invocation_output["data"]):
assert chat_data.keys() == invocation_data.keys()
check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
embeddings_1_lst=[invocation_data["embedding"]],
name_0="chat",
name_1="invocation")
assert chat_output["data"] == invocation_output["data"]

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Final
import pytest
@ -30,7 +29,7 @@ def server():
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
json.dumps({"image": MAXIMUM_IMAGES}),
f"image={MAXIMUM_IMAGES}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

View File

@ -281,13 +281,7 @@ async def test_invocations(server: RemoteOpenAIServer):
invocation_output = invocation_response.json()
assert completion_output.keys() == invocation_output.keys()
for completion_data, invocation_data in zip(completion_output["data"],
invocation_output["data"]):
assert completion_data.keys() == invocation_data.keys()
check_embeddings_close(embeddings_0_lst=completion_data["data"],
embeddings_1_lst=invocation_data["data"],
name_0="completion",
name_1="invocation")
assert completion_output["data"] == invocation_output["data"]
@pytest.mark.asyncio
@ -320,10 +314,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
invocation_output = invocation_response.json()
assert chat_output.keys() == invocation_output.keys()
for chat_data, invocation_data in zip(chat_output["data"],
invocation_output["data"]):
assert chat_data.keys() == invocation_data.keys()
check_embeddings_close(embeddings_0_lst=chat_data["data"],
embeddings_1_lst=invocation_data["data"],
name_0="chat",
name_1="invocation")
assert chat_output["data"] == invocation_output["data"]

View File

@ -120,8 +120,4 @@ def test_invocations(server: RemoteOpenAIServer):
invocation_output = invocation_response.json()
assert rerank_output.keys() == invocation_output.keys()
for rerank_result, invocations_result in zip(rerank_output["results"],
invocation_output["results"]):
assert rerank_result.keys() == invocations_result.keys()
assert rerank_result["relevance_score"] == pytest.approx(
invocations_result["relevance_score"], rel=0.01)
assert rerank_output["results"] == invocation_output["results"]

View File

@ -215,8 +215,4 @@ class TestModel:
invocation_output = invocation_response.json()
assert score_output.keys() == invocation_output.keys()
for score_data, invocation_data in zip(score_output["data"],
invocation_output["data"]):
assert score_data.keys() == invocation_data.keys()
assert score_data["score"] == pytest.approx(
invocation_data["score"], rel=0.01)
assert score_output["data"] == invocation_output["data"]

View File

@ -32,7 +32,6 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
"--max-lora-rank",
"64",
"--enable-tokenizer-info-endpoint",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@ -284,106 +283,3 @@ async def test_detokenize(
response.raise_for_status()
assert response.json() == {"prompt": prompt}
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name,tokenizer_name",
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
"""Test basic tokenizer info endpoint functionality."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
assert "tokenizer_class" in result
assert isinstance(result["tokenizer_class"], str)
assert result["tokenizer_class"]
@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
"""Test that the response matches expected schema types."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
field_types = {
"add_bos_token": bool,
"add_prefix_space": bool,
"clean_up_tokenization_spaces": bool,
"split_special_tokens": bool,
"bos_token": str,
"eos_token": str,
"pad_token": str,
"unk_token": str,
"chat_template": str,
"errors": str,
"model_max_length": int,
"additional_special_tokens": list,
"added_tokens_decoder": dict,
}
for field, expected_type in field_types.items():
if field in result and result[field] is not None:
assert isinstance(
result[field],
expected_type), (f"{field} should be {expected_type.__name__}")
@pytest.mark.asyncio
async def test_tokenizer_info_added_tokens_structure(
server: RemoteOpenAIServer, ):
"""Test added_tokens_decoder structure if present."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
added_tokens = result.get("added_tokens_decoder")
if added_tokens:
for token_id, token_info in added_tokens.items():
assert isinstance(token_id, str), "Token IDs should be strings"
assert isinstance(token_info, dict), "Token info should be a dict"
assert "content" in token_info, "Token info should have content"
assert "special" in token_info, (
"Token info should have special flag")
assert isinstance(token_info["special"],
bool), ("Special flag should be boolean")
@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
server: RemoteOpenAIServer, ):
"""Test that tokenizer info is consistent with tokenization endpoint."""
info_response = requests.get(server.url_for("tokenizer_info"))
info_response.raise_for_status()
info = info_response.json()
tokenize_response = requests.post(
server.url_for("tokenize"),
json={
"model": MODEL_NAME,
"prompt": "Hello world!"
},
)
tokenize_response.raise_for_status()
tokenize_result = tokenize_response.json()
info_max_len = info.get("model_max_length")
tokenize_max_len = tokenize_result.get("max_model_len")
if info_max_len and tokenize_max_len:
assert info_max_len >= tokenize_max_len, (
"Info max length should be >= tokenize max length")
@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
"""Test chat template is properly included."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
chat_template = result.get("chat_template")
if chat_template:
assert isinstance(chat_template,
str), ("Chat template should be a string")
assert chat_template.strip(), "Chat template should not be empty"

View File

@ -17,11 +17,6 @@ from vllm.assets.audio import AudioAsset
from ...utils import RemoteOpenAIServer
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
]
@pytest.fixture
def mary_had_lamb():
@ -38,15 +33,9 @@ def winning_call():
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
async def test_basic_audio(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
@ -76,13 +65,10 @@ async def test_bad_requests(mary_had_lamb):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
async def test_long_audio_request(mary_had_lamb, model_name):
async def test_long_audio_request(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"]
if model_name.startswith("openai"):
return
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
@ -101,8 +87,7 @@ async def test_long_audio_request(mary_had_lamb, model_name):
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
counts = out.count("Mary had a little lamb")
assert counts == 10, counts
assert out.count("Mary had a little lamb") == 10
@pytest.mark.asyncio

View File

@ -416,7 +416,7 @@ class RankTensors:
# We dequant and use that as hidden_states so the tests are stable.
# quantizing and dequantizing yield slightly different results
# depending on the hardware. Here we, quantize and dequantize
# first - so further quantize and dequantize will yield the same
# first - so further quantize and dequantize will yeild the same
# values.
if config.is_per_tensor_act_quant:
a_q, a_scales = ops.scaled_fp8_quant(
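The comment above relies on the fact that a quantize/dequantize round trip is idempotent: once values have passed through it, further round trips reproduce them exactly. A toy illustration with uniform rounding standing in for the real fp8 kernels (the scale of 0.5 is a power of two, chosen so the arithmetic is exact in binary floating point):

```python
import numpy as np

SCALE = 0.5  # power of two, so the round trip is bit-exact

def quantize(x):
    return np.round(x / SCALE)

def dequantize(q):
    return q * SCALE

x = np.random.randn(16).astype(np.float32)
hidden_states = dequantize(quantize(x))          # start from dequantized values
roundtrip = dequantize(quantize(hidden_states))  # further round trips change nothing
assert np.array_equal(hidden_states, roundtrip)
```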

View File

@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
# triton reference
# triton referrence
out_triton = fused_experts(
hidden_states=tokens_bf16,
w1=w1,

View File

@ -1,115 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from vllm.transformers_utils.tokenizer import MistralTokenizer
from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
args = [
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
def _get_prompt(audio_assets, question):
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(len(audio_assets))
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
text_chunk = TextChunk(text=question)
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tokenizer_mode="mistral",
)
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
def asset_to_chunk(asset):
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
audio.format = "wav"
audio_dict = AudioChunk.from_audio(audio).to_openai()
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
messages = [{
"role":
"user",
"content": [
*audio_chunks,
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"

View File

@ -440,7 +440,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
trust_remote_code=True), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
# [Cross-encoder]
@ -465,11 +464,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
"EagleLlama4ForCausalLM": _HfExamplesInfo(
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
trust_remote_code=True,
speculative_model="morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
"EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
trust_remote_code=True,
is_available_online=False,
@ -519,4 +513,4 @@ class HfExampleModels:
raise ValueError(f"No example model defined for {model_id}")
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)

View File

@ -36,11 +36,6 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"KimiVLForConditionalGeneration"):
pytest.skip("Avoid OOM")
if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
from vllm.model_executor.models.llama4 import Llama4ForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
# Avoid OOM and reduce initialization time by only using 1 layer
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
hf_config.update(model_info.hf_overrides)
@ -48,7 +43,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config = hf_config.get_text_config()
# Ensure at least 2 expert per group
# Since `grouped_topk` assumes top-2
# Since `grouped_topk` assums top-2
n_group = getattr(text_config, 'n_group', None)
num_experts = n_group * 2 if n_group is not None else 2
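
For context, the shrinking trick this test relies on can be written as a standalone sketch; the attribute names (num_hidden_layers, num_experts) are illustrative and vary per architecture:

from transformers import PretrainedConfig

def shrink_for_init_test(hf_config: PretrainedConfig) -> PretrainedConfig:
    # Initialization-only test: a single layer is enough to exercise loading.
    text_config = hf_config.get_text_config()
    text_config.num_hidden_layers = 1
    # Keep at least 2 experts per group, since `grouped_topk` assumes top-2.
    n_group = getattr(text_config, "n_group", None)
    text_config.num_experts = n_group * 2 if n_group is not None else 2
    return hf_config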

View File

@ -6,10 +6,8 @@ import random
from typing import Any
import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
@pytest.fixture
@ -55,6 +53,14 @@ def model_name():
return "meta-llama/Llama-3.1-8B-Instruct"
def eagle_model_name():
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
def eagle3_model_name():
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
@ -71,8 +77,6 @@ def test_ngram_correctness(
ref_llm = LLM(model=model_name, max_model_len=1024)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
spec_llm = LLM(
model=model_name,
@ -99,50 +103,34 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs))
del spec_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
@pytest.mark.parametrize("model_setup", [
("eagle", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1),
("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1),
pytest.param(
("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
],
ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"])
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_setup: tuple[str, str, str, int],
model_name: str,
use_eagle3: bool,
):
'''
Compare the outputs of a original LLM and a speculative LLM
should be the same when using eagle speculative decoding.
model_setup: (method, model_name, eagle_model_name, tp_size)
'''
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
method, model_name, spec_model_name, tp_size = model_setup
ref_llm = LLM(model=model_name,
max_model_len=2048,
tensor_parallel_size=tp_size)
ref_llm = LLM(model=model_name, max_model_len=2048)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
spec_model_name = eagle3_model_name(
) if use_eagle3 else eagle_model_name()
spec_llm = LLM(
model=model_name,
trust_remote_code=True,
tensor_parallel_size=tp_size,
speculative_config={
"method": method,
"method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name,
"num_speculative_tokens": 3,
"max_model_len": 2048,
@ -164,5 +152,3 @@ def test_eagle_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))
del spec_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
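
Condensed from the test above into a standalone sketch, this is the essential shape of the speculative setup being compared (model names and values are the ones already used in this file):

from vllm import LLM

def build_spec_llm(use_eagle3: bool) -> LLM:
    spec_model = ("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
                  if use_eagle3 else "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
    return LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        max_model_len=2048,
        speculative_config={
            "method": "eagle3" if use_eagle3 else "eagle",
            "model": spec_model,
            "num_speculative_tokens": 3,
            "max_model_len": 2048,
        },
    )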

View File

@ -1,166 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64, fetch_image
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def default_image_server_args():
return [
"--enforce-eager",
"--max-model-len",
"6000",
"--max-num-seqs",
"128",
"--limit-mm-per-prompt",
json.dumps({"image": MAXIMUM_IMAGES}),
]
@pytest.fixture(scope="module")
def image_server(default_image_server_args):
with RemoteOpenAIServer(MODEL_NAME,
default_image_server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(image_server):
async with image_server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str):
content_text = "What's in this image?"
messages = [{
"role":
"user",
"content": [
{
"type": "input_image",
"image_url": image_url,
"detail": "auto",
},
{
"type": "input_text",
"text": content_text
},
],
}]
# test image url
response = await client.responses.create(
model=model_name,
input=messages,
)
assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
image_url: str,
base64_encoded_image: dict[str, str],
):
content_text = "What's in this image?"
messages = [{
"role":
"user",
"content": [
{
"type": "input_image",
"image_url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
"detail": "auto",
},
{
"type": "input_text",
"text": content_text
},
],
}]
# test image base64
response = await client.responses.create(
model=model_name,
input=messages,
)
assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
"image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_urls: list[str]):
messages = [{
"role":
"user",
"content": [
*({
"type": "input_image",
"image_url": image_url,
"detail": "auto",
} for image_url in image_urls),
{
"type": "input_text",
"text": "What's in this image?"
},
],
}]
if len(image_urls) > MAXIMUM_IMAGES:
with pytest.raises(openai.BadRequestError): # test multi-image input
await client.responses.create(
model=model_name,
input=messages,
)
# the server should still work afterwards
response = await client.responses.create(
model=model_name,
input=[{
"role": "user",
"content": "What's the weather like in Paris today?",
}],
)
assert len(response.output_text) > 0
else:
response = await client.responses.create(
model=model_name,
input=messages,
)
assert len(response.output_text) > 0

View File

@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for external LB testing
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallel size to use
# Default tensor parallell size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1"))

typos.toml Normal file (179 lines)
View File

@ -0,0 +1,179 @@
[files]
# these files may be written in non-English words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
[type.py.extend-words]
[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cpp.extend-identifiers]
countr_one = "countr_one"
[type.cpp.extend-words]
[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.rust.extend-identifiers]
flate2 = "flate2"
[type.rust.extend-words]
ser = "ser"
[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.lock.extend-identifiers]
[type.lock.extend-words]
[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.jl.extend-identifiers]
[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.go.extend-identifiers]
flate = "flate"
[type.go.extend-words]
[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.css.extend-identifiers]
nd = "nd"
[type.css.extend-words]
[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.man.extend-identifiers]
Nd = "Nd"
[type.man.extend-words]
[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cert.extend-identifiers]
[type.cert.extend-words]
[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"
[type.sh.extend-words]
[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.vimscript.extend-identifiers]
windo = "windo"
[type.vimscript.extend-words]
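
A quick local sanity check for the config above (a sketch; it assumes the `typos` binary is installed and on PATH, and relies on typos' usual discovery of a repo-root typos.toml):

import subprocess

# Run from the repository root; typos.toml is picked up automatically,
# so no extra flags should be needed.
result = subprocess.run(["typos"], capture_output=True, text=True)
print(result.stdout or "no typos found")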

View File

@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
"... H (two D) -> ... (H two) D",
two=2)
else: # reuse the kv cache, full attention
else: # re-use the kv cache, full attention
q = q.view(-1, self.num_heads, self.head_size)
q1, q2 = self.split_heads(q)
# kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501

View File

@ -96,30 +96,25 @@ DEFAULT_PIP_PATTERNS = {
def run(command):
"""Return (return-code, stdout, stderr)."""
shell = True if type(command) is str else False
try:
p = subprocess.Popen(command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=shell)
raw_output, raw_err = p.communicate()
rc = p.returncode
if get_platform() == 'win32':
enc = 'oem'
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == 'nvidia-smi topo -m':
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
except FileNotFoundError:
cmd_str = command if isinstance(command, str) else command[0]
return 127, '', f"Command not found: {cmd_str}"
p = subprocess.Popen(command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=shell)
raw_output, raw_err = p.communicate()
rc = p.returncode
if get_platform() == 'win32':
enc = 'oem'
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == 'nvidia-smi topo -m':
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
def run_and_read_all(run_lambda, command):
@ -153,7 +148,7 @@ def get_conda_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_CONDA_PATTERNS
conda = os.environ.get('CONDA_EXE', 'conda')
out = run_and_read_all(run_lambda, [conda, 'list'])
out = run_and_read_all(run_lambda, "{} list".format(conda))
if out is None:
return out
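
The `[conda, 'list']` vs `"{} list".format(conda)` change matters because the helper above sets `shell=True` only for string commands; the difference in isolation:

import subprocess

# A string command is interpreted by the shell (variables expand, words split);
# a list is executed directly with no shell involved.
subprocess.run("echo $HOME", shell=True)   # prints your home directory
subprocess.run(["echo", "$HOME"])          # prints the literal text "$HOME"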

View File

@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from torch.distributed import ProcessGroup, ReduceOp
from typing_extensions import Self, runtime_checkable
from typing_extensions import Self, deprecated, runtime_checkable
import vllm.envs as envs
from vllm import version
@ -963,7 +963,7 @@ class ModelConfig:
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
]
if self.quantization is not None:
self.quantization = cast(me_quant.QuantizationMethods,
@ -1563,7 +1563,7 @@ class ModelConfig:
BlockSize = Literal[1, 8, 16, 32, 64, 128]
CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"]
PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"]
@ -1593,7 +1593,7 @@ class CacheConfig:
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc)."""
fp8 (=fp8_e4m3)."""
is_attention_free: bool = False
"""Whether the model is attention-free. This is primarily set in
`ModelConfig` and that value should be manually duplicated here."""
@ -1691,7 +1691,7 @@ class CacheConfig:
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
"Meanwhile, it may cause accuracy drop without a proper "
"scaling factor.")
"scaling factor")
else:
raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
@ -1730,6 +1730,35 @@ class CacheConfig:
logger.warning("Possibly too large swap space. %s", msg)
@config
@dataclass
class TokenizerPoolConfig:
"""This config is deprecated and will be removed in a future release.
Passing these parameters will have no effect. Please remove them from your
configurations.
"""
pool_size: int = 0
"""This parameter is deprecated and will be removed in a future release.
Passing this parameter will have no effect. Please remove it from your
configurations."""
pool_type: str = "ray"
"""This parameter is deprecated and will be removed in a future release.
Passing this parameter will have no effect. Please remove it from your
configurations."""
extra_config: dict = field(default_factory=dict)
"""This parameter is deprecated and will be removed in a future release.
Passing this parameter will have no effect. Please remove it from your
configurations."""
def __post_init__(self) -> None:
logger.warning_once(
"TokenizerPoolConfig is deprecated and will be removed in a "
"future release. Passing this parameter will have no effect. "
"Please remove it from your configurations.")
class LoadFormat(str, enum.Enum):
AUTO = "auto"
PT = "pt"
@ -1781,9 +1810,6 @@ class LoadConfig:
default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format."""
device: Optional[str] = None
"""Device to which model weights will be loaded, default to
device_config.device"""
ignore_patterns: Optional[Union[list[str], str]] = None
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
@ -1896,6 +1922,10 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""This parameter is deprecated and will be removed in a future release.
Please remove it from your configs"""
ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
@ -1910,7 +1940,7 @@ class ParallelConfig:
or equal to the number of GPUs available, "mp" will be used to
keep processing on a single host. Otherwise, this will default
to "ray" if Ray is installed and fail otherwise. Note that tpu
only support Ray for distributed inference."""
and hpu only support Ray for distributed inference."""
worker_cls: str = "auto"
"""The full name of the worker class to use. If "auto", the worker class
@ -3662,6 +3692,18 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0,
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine."""
@property
@deprecated(
"`guided_decoding_backend` is deprecated and has been renamed to "
"`backend`. This will be removed in v0.10.0. Please use the "
"`backend` argument instead.")
def guided_decoding_backend(self) -> GuidedDecodingBackend:
return self.backend
@guided_decoding_backend.setter
def guided_decoding_backend(self, value: GuidedDecodingBackend):
self.backend = value
backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar"
"""Which engine will be used for guided decoding (JSON schema / regex etc)
by default. With "auto", we will make opinionated choices based on request
@ -3704,6 +3746,9 @@ class DecodingConfig:
return hash_str
def __post_init__(self):
if ":" in self.backend:
self._extract_backend_options()
if envs.VLLM_USE_V1:
valid_guided_backends = get_args(GuidedDecodingBackendV1)
else:
@ -3719,6 +3764,24 @@ class DecodingConfig:
raise ValueError("disable_additional_properties is only supported "
"for the guidance backend.")
@deprecated(
"Passing guided decoding backend options inside backend in the format "
"'backend:...' is deprecated. This will be removed in v0.10.0. Please "
"use the dedicated arguments '--disable-fallback', "
"'--disable-any-whitespace' and '--disable-additional-properties' "
"instead.")
def _extract_backend_options(self):
"""Extract backend options from the backend string."""
backend, options = self.backend.split(":")
self.backend = cast(GuidedDecodingBackend, backend)
options_set = set(options.strip().split(","))
if "no-fallback" in options_set:
self.disable_fallback = True
if "disable-any-whitespace" in options_set:
self.disable_any_whitespace = True
if "no-additional-properties" in options_set:
self.disable_additional_properties = True
DetailedTraceModules = Literal["model", "worker", "all"]
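
The deprecated colon syntax handled by `_extract_backend_options` reduces to a string split; a self-contained sketch of the mapping (option names are the ones listed above):

def split_backend(value: str) -> tuple[str, set[str]]:
    backend, _, options = value.partition(":")
    return backend, {opt.strip() for opt in options.split(",") if opt.strip()}

backend, opts = split_backend("xgrammar:no-fallback,disable-any-whitespace")
assert backend == "xgrammar"
assert opts == {"no-fallback", "disable-any-whitespace"}  # map onto the disable_* flags above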

View File

@ -9,6 +9,7 @@ import functools
import json
import sys
import threading
import warnings
from dataclasses import MISSING, dataclass, fields, is_dataclass
from itertools import permutations
from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
@ -18,7 +19,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from typing_extensions import TypeIs
from typing_extensions import TypeIs, deprecated
import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
@ -31,8 +32,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, PromptAdapterConfig,
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
get_field)
TaskOption, TokenizerMode, TokenizerPoolConfig,
VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform
from vllm.plugins import load_general_plugins
@ -65,6 +66,9 @@ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
def _parse_type(val: str) -> T:
try:
if return_type is json.loads and not re.match(
r"(?s)^\s*{.*}\s*$", val):
return cast(T, nullable_kvs(val))
return return_type(val)
except ValueError as e:
raise argparse.ArgumentTypeError(
@ -90,6 +94,42 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
return optional_type(json.loads)(val)
@deprecated(
"Passing a JSON argument as a string containing comma separated key=value "
"pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
"string instead.")
def nullable_kvs(val: str) -> dict[str, int]:
"""Parses a string containing comma separate key [str] to value [int]
pairs into a dictionary.
Args:
val: String value to be parsed.
Returns:
Dictionary with parsed values.
"""
out_dict: dict[str, int] = {}
for item in val.split(","):
kv_parts = [part.lower().strip() for part in item.split("=")]
if len(kv_parts) != 2:
raise argparse.ArgumentTypeError(
"Each item should be in the form KEY=VALUE")
key, value = kv_parts
try:
parsed_value = int(value)
except ValueError as exc:
msg = f"Failed to parse value of item {key}={value}"
raise argparse.ArgumentTypeError(msg) from exc
if key in out_dict and out_dict[key] != parsed_value:
raise argparse.ArgumentTypeError(
f"Conflicting values specified for key: {key}")
out_dict[key] = parsed_value
return out_dict
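
For readers who have not seen the comma syntax, these are the two accepted spellings of a dict-valued flag such as --limit-mm-per-prompt (values illustrative):

import json

legacy_form = "image=16,video=2"           # parsed by nullable_kvs above
json_form = '{"image": 16, "video": 2}'    # the non-deprecated JSON spelling

assert json.loads(json_form) == {"image": 16, "video": 2}
# nullable_kvs(legacy_form) yields the same dictionary.
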
def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
"""Check if the type hint is a specific type."""
return type_hint is type or get_origin(type_hint) is type
@ -139,10 +179,6 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
return type_hints
def is_online_quantization(quantization: Any) -> bool:
return quantization in ["inc"]
@functools.lru_cache(maxsize=30)
def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
cls_docs = get_attr_docs(cls)
@ -337,6 +373,13 @@ class EngineArgs:
enforce_eager: bool = ModelConfig.enforce_eager
max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
# The following three fields are deprecated and will be removed in a future
# release. Setting them will have no effect. Please remove them from your
# configurations.
tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
tokenizer_pool_extra_config: dict = \
get_field(TokenizerPoolConfig, "extra_config")
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
@ -398,6 +441,7 @@ class EngineArgs:
speculative_config: Optional[Dict[str, Any]] = None
qlora_adapter_name_or_path: Optional[str] = None
show_hidden_metrics_for_version: Optional[str] = \
ObservabilityConfig.show_hidden_metrics_for_version
otlp_traces_endpoint: Optional[str] = \
@ -431,6 +475,7 @@ class EngineArgs:
additional_config: dict[str, Any] = \
get_field(VllmConfig, "additional_config")
enable_reasoning: Optional[bool] = None # DEPRECATED
reasoning_parser: str = DecodingConfig.reasoning_backend
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
@ -448,6 +493,13 @@ class EngineArgs:
if isinstance(self.compilation_config, (int, dict)):
self.compilation_config = CompilationConfig.from_cli(
str(self.compilation_config))
if self.qlora_adapter_name_or_path is not None:
warnings.warn(
"The `qlora_adapter_name_or_path` is deprecated "
"and will be removed in v0.10.0. ",
DeprecationWarning,
stacklevel=2,
)
# Setup plugins
from vllm.plugins import load_general_plugins
load_general_plugins()
@ -560,6 +612,14 @@ class EngineArgs:
**load_kwargs["ignore_patterns"])
load_group.add_argument("--use-tqdm-on-load",
**load_kwargs["use_tqdm_on_load"])
load_group.add_argument(
"--qlora-adapter-name-or-path",
type=str,
default=None,
help="The `--qlora-adapter-name-or-path` has no effect, do not set"
" it, and it will be removed in v0.10.0.",
deprecated=True,
)
load_group.add_argument('--pt-load-map-location',
**load_kwargs["pt_load_map_location"])
@ -580,6 +640,15 @@ class EngineArgs:
guided_decoding_group.add_argument(
"--guided-decoding-disable-additional-properties",
**guided_decoding_kwargs["disable_additional_properties"])
guided_decoding_group.add_argument(
"--enable-reasoning",
action=argparse.BooleanOptionalAction,
deprecated=True,
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
"of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
"parser backend instead. This flag (`--enable-reasoning`) will be "
"removed in v0.10.0. When `--reasoning-parser` is specified, "
"reasoning mode is automatically enabled.")
guided_decoding_group.add_argument(
"--reasoning-parser",
# This choices is a special case because it's not static
@ -682,6 +751,19 @@ class EngineArgs:
cache_group.add_argument("--calculate-kv-scales",
**cache_kwargs["calculate_kv_scales"])
# Tokenizer arguments
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
tokenizer_group = parser.add_argument_group(
title="TokenizerPoolConfig",
description=TokenizerPoolConfig.__doc__,
)
tokenizer_group.add_argument("--tokenizer-pool-size",
**tokenizer_kwargs["pool_size"])
tokenizer_group.add_argument("--tokenizer-pool-type",
**tokenizer_kwargs["pool_type"])
tokenizer_group.add_argument("--tokenizer-pool-extra-config",
**tokenizer_kwargs["extra_config"])
# Multimodal related configs
multimodal_kwargs = get_kwargs(MultiModalConfig)
multimodal_group = parser.add_argument_group(
@ -964,8 +1046,6 @@ class EngineArgs:
return LoadConfig(
load_format=self.load_format,
download_dir=self.download_dir,
device="cpu"
if is_online_quantization(self.quantization) else None,
model_loader_extra_config=self.model_loader_extra_config,
ignore_patterns=self.ignore_patterns,
use_tqdm_on_load=self.use_tqdm_on_load,
@ -1365,9 +1445,7 @@ class EngineArgs:
supported = False
if current_platform.is_rocm() or (
current_platform.is_cuda()
and current_platform.is_device_capability(100)) or (
current_platform.device_name
== "hpu"): # handle hpu also for OOT platform
and current_platform.is_device_capability(100)):
supported = True
elif fp8_attention and will_use_fa:
from vllm.attention.utils.fa_utils import (
@ -1701,12 +1779,7 @@ class EngineArgs:
@dataclass
class AsyncEngineArgs(EngineArgs):
"""Arguments for asynchronous vLLM engine."""
# Request logging is disabled by default. ``--disable-log-requests`` is
# kept for backwards compatibility but has no effect. ``--enable-legacy-
# log-requests`` can be used to restore the previous behaviour of logging
# each request.
disable_log_requests: bool = True
enable_legacy_log_requests: bool = False
disable_log_requests: bool = False
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser,
@ -1719,34 +1792,10 @@ class AsyncEngineArgs(EngineArgs):
parser = EngineArgs.add_cli_args(parser)
parser.add_argument('--disable-log-requests',
action='store_true',
default=None,
help='[DEPRECATED] Request logging is disabled by '
'default.')
parser.add_argument('--enable-legacy-log-requests',
action='store_true',
help='Enable legacy request logging behavior.')
help='Disable logging requests.')
current_platform.pre_register_and_update(parser)
return parser
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
engine_args = super().from_cli_args(args)
if args.enable_legacy_log_requests:
engine_args.disable_log_requests = False
else:
if args.disable_log_requests is None:
logger.warning(
"Request logging is disabled by default. Use "
"--enable-legacy-log-requests to restore the previous "
"behaviour.")
engine_args.disable_log_requests = True
else:
engine_args.disable_log_requests = True
engine_args.enable_legacy_log_requests = args.enable_legacy_log_requests
return engine_args
def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:

View File

@ -28,7 +28,6 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam)
from openai.types.chat.chat_completion_content_part_input_audio_param import (
InputAudio)
from openai.types.responses import ResponseInputImageParam
from PIL import Image
from pydantic import BaseModel, ConfigDict, TypeAdapter
# yapf: enable
@ -943,8 +942,6 @@ _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
_ResponsesInputImageParser = TypeAdapter(
ResponseInputImageParam).validate_python
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
# Define a mapping from part types to their corresponding parsing functions.
@ -956,8 +953,6 @@ MM_PARSER_MAP: dict[
lambda part: _TextParser(part).get("text", None),
"input_text":
lambda part: _TextParser(part).get("text", None),
"input_image":
lambda part: _ResponsesInputImageParser(part).get("image_url", None),
"image_url":
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
"image_embeds":
@ -1090,8 +1085,10 @@ def _parse_chat_message_content_part(
"""
if isinstance(part, str): # Handle plain text parts
return part
# Handle structured dictionary parts
part_type, content = _parse_chat_message_content_mm_part(part)
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
# content is None, log a warning and skip
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
@ -1112,7 +1109,7 @@ def _parse_chat_message_content_part(
image_content = cast(Image.Image, content)
mm_parser.parse_image_pil(image_content)
modality = "image"
elif part_type in ("image_url", "input_image"):
elif part_type == "image_url":
str_content = cast(str, content)
mm_parser.parse_image(str_content)
modality = "image"
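
For reference, the Responses-style part that the "input_image" handling in this hunk deals with looks like this (the URL is a placeholder):

part = {
    "type": "input_image",
    "image_url": "https://example.com/cat.png",  # placeholder URL
    "detail": "auto",
}
# The "input_image" parser returns the image_url string, and the part is then
# routed through mm_parser.parse_image() under the "image" modality.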

View File

@ -67,6 +67,37 @@ class ServeSubcommand(CLISubcommand):
help="Start the vLLM OpenAI Compatible API server.",
description="Start the vLLM OpenAI Compatible API server.",
usage="vllm serve [model_tag] [options]")
serve_parser.add_argument("model_tag",
type=str,
nargs='?',
help="The model tag to serve "
"(optional if specified in config)")
serve_parser.add_argument(
"--headless",
action='store_true',
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
serve_parser.add_argument(
'--data-parallel-start-rank',
'-dpr',
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
serve_parser.add_argument('--api-server-count',
'-asc',
type=int,
default=1,
help='How many API server processes to run.')
serve_parser.add_argument(
"--config",
type=str,
default='',
required=False,
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
serve_parser = make_arg_parser(serve_parser)
show_filtered_argument_or_group_from_help(serve_parser, ["serve"])

View File

@ -522,19 +522,6 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
assert_never(generator)
def maybe_register_tokenizer_info_endpoint(args):
"""Conditionally register the tokenizer info endpoint if enabled."""
if getattr(args, 'enable_tokenizer_info_endpoint', False):
@router.get("/tokenizer_info")
async def get_tokenizer_info(raw_request: Request):
"""Get comprehensive tokenizer information."""
result = await tokenization(raw_request).get_tokenizer_info()
return JSONResponse(content=result.model_dump(),
status_code=result.code if isinstance(
result, ErrorResponse) else 200)
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
handler = models(raw_request)
@ -1527,6 +1514,8 @@ async def init_app_state(
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
expand_tools_even_if_tool_choice_none=args.
expand_tools_even_if_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
@ -1542,6 +1531,8 @@ async def init_app_state(
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
expand_tools_even_if_tool_choice_none=args.
expand_tools_even_if_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
@ -1553,7 +1544,6 @@ async def init_app_state(
state.openai_serving_models,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
) if "generate" in model_config.supported_tasks else None
state.openai_serving_pooling = OpenAIServingPooling(
@ -1705,7 +1695,6 @@ async def run_server_worker(listen_address,
uvicorn_kwargs['log_config'] = log_config
async with build_async_engine_client(args, client_config) as engine_client:
maybe_register_tokenizer_info_endpoint(args)
app = build_app(args)
vllm_config = await engine_client.get_vllm_config()

View File

@ -182,9 +182,13 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the /get_tokenizer_info endpoint. May expose chat
templates and other tokenizer configuration."""
expand_tools_even_if_tool_choice_none: bool = False
"""Include tool definitions in prompts even when `tool_choice='none'`.
This is a transitional option that will be removed in v0.10.0. In
v0.10.0, tool definitions will always be included regardless of
`tool_choice` setting. Use this flag to test the upcoming behavior
before the breaking change."""
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@ -195,6 +199,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
# Special case: allowed_origins, allowed_methods, allowed_headers all
# need json.loads type
# Should also remove nargs
print(frontend_kwargs["allowed_origins"])
frontend_kwargs["allowed_origins"]["type"] = json.loads
frontend_kwargs["allowed_methods"]["type"] = json.loads
frontend_kwargs["allowed_headers"]["type"] = json.loads
@ -220,6 +225,11 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
frontend_kwargs["tool_call_parser"]["choices"] = valid_tool_parsers
# Special case for expand-tools-even-if-tool-choice-none because of
# the deprecation field
frontend_kwargs["expand_tools_even_if_tool_choice_none"]\
["deprecated"] = True
frontend_group = parser.add_argument_group(
title="Frontend",
description=FrontendArgs.__doc__,
@ -238,34 +248,6 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
register all arguments instead of manually enumerating them here. This
avoids code duplication and keeps the argument definitions in one place.
"""
parser.add_argument("model_tag",
type=str,
nargs="?",
help="The model tag to serve "
"(optional if specified in config)")
parser.add_argument(
"--headless",
action="store_true",
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
parser.add_argument(
"--data-parallel-start-rank",
"-dpr",
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
parser.add_argument("--api-server-count",
"-asc",
type=int,
default=1,
help="How many API server processes to run.")
parser.add_argument(
"--config",
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
parser = FrontendArgs.add_cli_args(parser)
parser = AsyncEngineArgs.add_cli_args(parser)

View File

@ -290,15 +290,6 @@ class ResponsesRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)
cache_salt: Optional[str] = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."))
# --8<-- [end:responses-extra-params]
_DEFAULT_SAMPLING_PARAMS = {
@ -360,19 +351,6 @@ class ResponsesRequest(OpenAIBaseModel):
raise ValueError("prompt template is not supported")
return data
@model_validator(mode="before")
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None:
if not envs.VLLM_USE_V1:
raise ValueError(
"Parameter 'cache_salt' is not supported with "
"this instance of vLLM, which uses engine V0.")
if not isinstance(data["cache_salt"],
str) or not data["cache_salt"]:
raise ValueError("Parameter 'cache_salt' must be a "
"non-empty string if provided.")
return data
class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
@ -1026,16 +1004,6 @@ class CompletionRequest(OpenAIBaseModel):
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."))
cache_salt: Optional[str] = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."))
kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.")
@ -1212,20 +1180,6 @@ class CompletionRequest(OpenAIBaseModel):
"At least one of `prompt` or `prompt_embeds` must be set.")
return data
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None:
if not envs.VLLM_USE_V1:
raise ValueError(
"Parameter 'cache_salt' is not supported with "
"this instance of vLLM, which uses engine V0.")
if not isinstance(data["cache_salt"],
str) or not data["cache_salt"]:
raise ValueError("Parameter 'cache_salt' must be a "
"non-empty string if provided.")
return data
class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
@ -1953,16 +1907,6 @@ class DetokenizeResponse(OpenAIBaseModel):
prompt: str
class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""
model_config = ConfigDict(extra="allow")
tokenizer_class: str
class LoadLoRAAdapterRequest(BaseModel):
lora_name: str
lora_path: str
@ -2027,7 +1971,7 @@ class TranscriptionRequest(OpenAIBaseModel):
"""
stream: Optional[bool] = False
"""When set, it will enable output to be streamed in a similar fashion
"""When set, it will enable output to be streamed in a similar fashion
as the Chat Completion endpoint.
"""
# --8<-- [start:transcription-extra-params]
@ -2289,9 +2233,9 @@ class TranslationRequest(OpenAIBaseModel):
"""
stream: Optional[bool] = False
"""Custom field not present in the original OpenAI definition. When set,
"""Custom field not present in the original OpenAI definition. When set,
it will enable output to be streamed in a similar fashion as the Chat
Completion endpoint.
Completion endpoint.
"""
# Flattened stream option to simplify form data.
stream_include_usage: Optional[bool] = False

View File

@ -63,6 +63,7 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
@ -111,6 +112,8 @@ class OpenAIServingChat(OpenAIServing):
raise TypeError("Error: --enable-auto-tool-choice requires "
f"tool_parser:'{tool_parser}' which has not "
"been registered") from e
self.expand_tools_even_if_tool_choice_none = (
expand_tools_even_if_tool_choice_none)
self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.enable_force_include_usage = enable_force_include_usage
@ -179,6 +182,20 @@ class OpenAIServingChat(OpenAIServing):
if request.tools is None:
tool_dicts = None
elif (request.tool_choice == "none"
and not self.expand_tools_even_if_tool_choice_none):
if len(request.tools) > 0:
logger.warning_once(
"Tools are specified but tool_choice is set to 'none' "
"and --expand-tools-even-if-tool-choice-none is not "
"enabled. Tool definitions will be excluded from the "
"prompt. This behavior will change in vLLM v0.10 where "
"tool definitions will be included by default even "
"with tool_choice='none'. To adopt the new behavior "
"now, use --expand-tools-even-if-tool-choice-none. "
"To suppress this warning, either remove tools from "
"the request or set tool_choice to a different value.")
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]
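
The request shape that hits the branch above, i.e. tools supplied together with tool_choice="none" (field values illustrative):

request_body = {
    "model": "some-model",  # placeholder model name
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {}},
        },
    }],
    "tool_choice": "none",  # tool definitions are dropped from the prompt unless
                            # --expand-tools-even-if-tool-choice-none is enabled
}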

View File

@ -23,7 +23,6 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
CompletionResponseStreamChoice,
CompletionStreamResponse,
ErrorResponse,
PromptTokenUsageInfo,
RequestResponseMetadata,
UsageInfo)
from vllm.entrypoints.openai.serving_engine import (
@ -57,7 +56,6 @@ class OpenAIServingCompletion(OpenAIServing):
*,
request_logger: Optional[RequestLogger],
return_tokens_as_token_ids: bool = False,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
):
super().__init__(engine_client=engine_client,
@ -66,7 +64,6 @@ class OpenAIServingCompletion(OpenAIServing):
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
enable_force_include_usage=enable_force_include_usage)
self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.default_sampling_params = (
self.model_config.get_diff_sampling_param())
if self.default_sampling_params:
@ -316,8 +313,6 @@ class OpenAIServingCompletion(OpenAIServing):
previous_num_tokens = [0] * num_choices * num_prompts
has_echoed = [False] * num_choices * num_prompts
num_prompt_tokens = [0] * num_prompts
num_cached_tokens = None
first_iteration = True
stream_options = request.stream_options
if stream_options:
@ -333,10 +328,6 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_token_ids = res.prompt_token_ids
prompt_logprobs = res.prompt_logprobs
if first_iteration:
num_cached_tokens = res.num_cached_tokens
first_iteration = False
if res.prompt is not None:
prompt_text = res.prompt
else:
@ -440,10 +431,6 @@ class OpenAIServingCompletion(OpenAIServing):
completion_tokens=total_completion_tokens,
total_tokens=total_prompt_tokens + total_completion_tokens)
if self.enable_prompt_tokens_details and num_cached_tokens:
final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
cached_tokens=num_cached_tokens)
if include_usage:
final_usage_chunk = CompletionStreamResponse(
id=request_id,
@ -548,10 +535,6 @@ class OpenAIServingCompletion(OpenAIServing):
total_tokens=num_prompt_tokens + num_generated_tokens,
)
if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
usage.prompt_tokens_details = PromptTokenUsageInfo(
cached_tokens=final_res.num_cached_tokens)
request_metadata.final_usage_info = usage
return CompletionResponse(
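
What the optional prompt-token detail adds to a completion response's usage block (token counts illustrative):

usage = {
    "prompt_tokens": 32,
    "completion_tokens": 16,
    "total_tokens": 48,
    "prompt_tokens_details": {"cached_tokens": 24},  # only set on a prefix-cache hit
}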

View File

@ -226,7 +226,7 @@ class OpenAIServing:
def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer:
"""
Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
given tokenizer.
"""
async_tokenizer = self._async_tokenizer_pool.get(tokenizer)
@ -811,12 +811,6 @@ class OpenAIServing:
prompt_token_ids=request_prompt_text["prompt_token_ids"])
for request_prompt_text in request_prompts_text
]
cache_salt = request.cache_salt if (
hasattr(request, "cache_salt")
and request.cache_salt is not None) else None
if cache_salt:
for prompt_text in engine_prompts_text:
prompt_text["cache_salt"] = cache_salt
# This check is equivalent to simply checking if
# `request_prompts_embeds` is empty, but it's difficult to propagate
@ -834,9 +828,6 @@ class OpenAIServing:
prompt_embeds=request_prompt_embeds["prompt_embeds"])
for request_prompt_embeds in request_prompts_embeds
]
if cache_salt:
for prompt_embed in engine_prompts_embeds:
prompt_embed["cache_salt"] = cache_salt
request_prompts = request_prompts_embeds + request_prompts_text
engine_prompts = engine_prompts_embeds + engine_prompts_text
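
Reduced to plain dictionaries, the salting flow touched by this hunk (the salt value is a placeholder; per the protocol docstring it should be random and carry roughly 256 bits of entropy):

request = {
    "prompt": "Hello",
    "cache_salt": "J3kU5VnH0mQ9rX2aB7cD4eF6gH8iJ0kL1mN3oP5qRsT",  # placeholder
}

engine_prompt = {"prompt_token_ids": [1, 2, 3]}
cache_salt = request.get("cache_salt")
if cache_salt:
    # Same propagation as above: every engine prompt built from this request
    # carries the salt, so its prefix-cache entries cannot be probed by others.
    engine_prompt["cache_salt"] = cache_salt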

View File

@ -51,6 +51,7 @@ class OpenAIServingResponses(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
@ -372,7 +373,7 @@ class OpenAIServingResponses(OpenAIServing):
})
# Append the new input.
# Responses API supports simple text inputs without chat format.
# Reponses API supports simple text inputs without chat format.
if isinstance(request.input, str):
messages.append({"role": "user", "content": request.input})
else:

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Any, Final, Optional, Union
from typing import Final, Optional, Union
import jinja2
from fastapi import Request
@ -17,13 +17,11 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
ErrorResponse,
TokenizeChatRequest,
TokenizeRequest,
TokenizeResponse,
TokenizerInfoResponse)
TokenizeResponse)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
logger = init_logger(__name__)
@ -157,49 +155,3 @@ class OpenAIServingTokenization(OpenAIServing):
input_text = prompt_input["prompt"]
return DetokenizeResponse(prompt=input_text)
async def get_tokenizer_info(
self, ) -> Union[TokenizerInfoResponse, ErrorResponse]:
"""Get comprehensive tokenizer information."""
try:
tokenizer = await self.engine_client.get_tokenizer()
info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
return TokenizerInfoResponse(**info)
except Exception as e:
return self.create_error_response(
f"Failed to get tokenizer info: {str(e)}")
@dataclass
class TokenizerInfo:
tokenizer: AnyTokenizer
chat_template: Optional[str]
def to_dict(self) -> dict[str, Any]:
"""Return the tokenizer configuration."""
return self._get_tokenizer_config()
def _get_tokenizer_config(self) -> dict[str, Any]:
"""Get tokenizer configuration directly from the tokenizer object."""
config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})
# Remove file path fields
config.pop("vocab_file", None)
config.pop("merges_file", None)
config = self._make_json_serializable(config)
config["tokenizer_class"] = type(self.tokenizer).__name__
if self.chat_template:
config["chat_template"] = self.chat_template
return config
def _make_json_serializable(self, obj):
"""Convert any non-JSON-serializable objects to serializable format."""
if hasattr(obj, "content"):
return obj.content
elif isinstance(obj, dict):
return {k: self._make_json_serializable(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [self._make_json_serializable(item) for item in obj]
else:
return obj
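
A sketch of how the endpoint backed by this class is queried (assumes a server started with --enable-tokenizer-info-endpoint and listening on the default port):

import requests

info = requests.get("http://localhost:8000/tokenizer_info").json()
print(info["tokenizer_class"])     # the tokenizer's Python class name
print("chat_template" in info)     # present only when a chat template is configured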

View File

@ -112,7 +112,6 @@ class OpenAISpeechToText(OpenAIServing):
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
model_config=self.model_config,
language=lang,
task_type=self.task_type,
request_prompt=request.prompt)

vllm/envs.py Executable file → Normal file (10 changed lines)
View File

@ -139,8 +139,6 @@ if TYPE_CHECKING:
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
VLLM_USE_CUDNN_PREFILL: bool = False
VLLM_LOOPBACK_IP: str = ""
def get_default_cache_root():
@ -963,17 +961,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),
# Controls whether or not to use cudnn prefill
"VLLM_USE_CUDNN_PREFILL":
lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))),
# If set to 1, use the TRTLLM Decode Attention backend in flashinfer.
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),
# Used to force set up loopback IP
"VLLM_LOOPBACK_IP":
lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
}
# --8<-- [end:env-vars-definition]
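
The two entries touched above are lazily evaluated lambdas; spelled out as plain code they behave like this:

import os

os.environ["VLLM_USE_CUDNN_PREFILL"] = "1"
use_cudnn_prefill = bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0")))  # True
loopback_ip = os.getenv("VLLM_LOOPBACK_IP", "")  # "" unless explicitly forced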

View File

@ -1172,7 +1172,7 @@ def fused_experts(
allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor:
# For now, disable DeepGemm for small N (<= 512) until better
# permute/unpermute ops are available.
# However, on B200, we use DeepGemm for all cases because they only support
# However, on B200, we use DeepGemm for all cases becuase they only support
# E8M0 scale, which means we requantize the weight and input to the specific
# scale. Fallen back to cutlass or triton for some cases would cause
# accuracy issue.

View File

@ -573,8 +573,8 @@ class MambaMixer2(MambaBase, CustomOp):
x = hidden_states_B_C_p.transpose(
0, 1) # this is the form that causal-conv see
if mamba2_metadata.cu_seqlen is None:
mamba2_metadata = update_metadata(x, query_start_loc_p,
mamba2_metadata)
mamba2_metadata = update_metadata(
x, attn_metadata.query_start_loc, mamba2_metadata)
hidden_states_B_C_p = causal_conv1d_fn(
x,
conv_weights,
@ -583,7 +583,6 @@ class MambaMixer2(MambaBase, CustomOp):
conv_states=conv_state,
has_initial_state=has_initial_states_p,
cache_indices=state_indices_tensor_p,
metadata=mamba2_metadata,
query_start_loc=query_start_loc_p).transpose(
0, 1)[:num_prefill_tokens]
@ -594,14 +593,9 @@ class MambaMixer2(MambaBase, CustomOp):
initial_states = None
if (has_initial_states_p is not None and prep_initial_states):
# making a copy of the states
if envs.VLLM_USE_V1:
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)
else:
initial_states = torch.where(
has_initial_states_p[:num_prefills, None, None, None],
ssm_state[state_indices_tensor_p], 0)
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)
scan_output, varlen_state = mamba_chunk_scan_combined(
hidden_states_p.view(1, num_prefill_tokens,
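
The torch.where masking used for initial_states above, shown in isolation:

import torch

has_initial = torch.tensor([True, False, True])   # one flag per prefill sequence
states = torch.randn(3, 2, 4, 4)                   # gathered ssm_state rows
initial_states = torch.where(has_initial[:, None, None, None], states, 0)
# Rows whose flag is False are zeroed; the [:, None, None, None] broadcast
# applies one boolean per sequence across all state dimensions.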

View File

@ -55,6 +55,7 @@ def _causal_conv1d_fwd_kernel( # continuous batching
IS_CONTINUOUS_BATCHING: tl.constexpr,
USE_PAD_SLOT: tl.constexpr,
NP2_STATELEN: tl.constexpr,
DECODE_SEQLEN: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
@ -415,7 +416,7 @@ def causal_conv1d_fn(
activation = "silu"
args = None
out = torch.empty_like(x)
out = torch.zeros_like(x)
if metadata is not None:
cu_seqlen = metadata.cu_seqlen
nums_dict = metadata.nums_dict
@ -606,6 +607,7 @@ def causal_conv1d_fn(
IS_CONTINUOUS_BATCHING=cache_indices is not None,
USE_PAD_SLOT=pad_slot_id is not None,
NP2_STATELEN=np2_statelen,
DECODE_SEQLEN=1,
#launch_cooperative_grid=True
BLOCK_M=8,
BLOCK_N=256,
@ -663,8 +665,7 @@ def _causal_conv1d_update_kernel(
if IS_CONTINUOUS_BATCHING:
# mask = idx_seq < batch
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(
tl.int64)
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq)
else:
conv_state_batch_coord = idx_seq
if USE_PAD_SLOT: # noqa

View File

@ -1,21 +1,22 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import IntEnum
from typing import Callable, Optional, TypeVar, Union
from typing import Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig
from typing_extensions import assert_never
from vllm.config import ModelConfig, PoolerConfig
from vllm.model_executor.pooling_metadata import ( # noqa: E501
PoolingMetadata as V0PoolingMetadata)
from vllm.model_executor.pooling_metadata import PoolingTensors
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
from vllm.utils import resolve_obj_by_qualname
from vllm.transformers_utils.config import (
get_classification_activation_function,
get_cross_encoder_activation_function)
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata]
@ -30,202 +31,140 @@ class PoolingType(IntEnum):
MEAN = 4
@dataclass(frozen=True)
class ResolvedPoolingConfig:
pooling_type: PoolingType
class SimplePooler(nn.Module):
"""A layer that pools specific information from hidden states.
normalize: bool
softmax: bool
step_tag_id: Optional[int]
returned_token_ids: Optional[list[int]]
This layer does the following:
1. Extracts specific tokens or aggregates data based on pooling method.
2. Normalizes output if specified.
3. Returns structured results as `PoolerOutput`.
@classmethod
def from_config_with_defaults(
cls,
pooler_config: PoolerConfig,
Attributes:
pooling_type: The type of pooling to use.
normalize: Whether to normalize the pooled data.
"""
@staticmethod
def from_pooling_type(
pooling_type: PoolingType,
*,
normalize: bool,
softmax: bool,
step_tag_id: Optional[int] = None,
returned_token_ids: Optional[list[int]] = None,
) -> "ResolvedPoolingConfig":
return cls(
pooling_type=PoolingType[pooler_config.pooling_type]
if pooler_config.pooling_type is not None else pooling_type,
normalize=pooler_config.normalize
if pooler_config.normalize is not None else normalize,
softmax=pooler_config.softmax
if pooler_config.softmax is not None else softmax,
step_tag_id=pooler_config.step_tag_id
if pooler_config.step_tag_id is not None else step_tag_id,
returned_token_ids=pooler_config.returned_token_ids
if pooler_config.returned_token_ids is not None else
returned_token_ids,
)
) -> "SimplePooler":
if pooling_type == PoolingType.LAST:
assert step_tag_id is None and returned_token_ids is None
return LastPool(normalize=normalize, softmax=softmax)
if pooling_type == PoolingType.ALL:
assert step_tag_id is None and returned_token_ids is None
return AllPool(normalize=normalize, softmax=softmax)
if pooling_type == PoolingType.CLS:
assert step_tag_id is None and returned_token_ids is None
return CLSPool(normalize=normalize, softmax=softmax)
if pooling_type == PoolingType.MEAN:
assert step_tag_id is None and returned_token_ids is None
return MeanPool(normalize=normalize, softmax=softmax)
if pooling_type == PoolingType.STEP:
return StepPool(normalize=normalize,
softmax=softmax,
step_tag_id=step_tag_id,
returned_token_ids=returned_token_ids)
assert_never(pooling_type)
def get_prompt_lens(
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> torch.Tensor:
if isinstance(pooling_metadata, V1PoolingMetadata):
return pooling_metadata.prompt_lens
def __init__(self, *, normalize: bool, softmax: bool) -> None:
super().__init__()
assert isinstance(hidden_states, torch.Tensor)
return PoolingTensors.from_pooling_metadata(
pooling_metadata, hidden_states.device).prompt_lens
self.head = PoolerHead(normalize=normalize, softmax=softmax)
def get_prompt_lens(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> torch.Tensor:
if isinstance(pooling_metadata, V1PoolingMetadata):
return pooling_metadata.prompt_lens
assert isinstance(hidden_states, torch.Tensor)
return PoolingTensors.from_pooling_metadata(
pooling_metadata, hidden_states.device).prompt_lens
def get_classification_activation_function(config: PretrainedConfig):
return PoolerClassify()
def extract_states(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
raise NotImplementedError
def build_output(self, data: torch.Tensor) -> PoolingSequenceGroupOutput:
return PoolingSequenceGroupOutput(data)
def get_cross_encoder_activation_function(config: PretrainedConfig):
function_name: Optional[str] = None
if (hasattr(config, "sentence_transformers")
and "activation_fn" in config.sentence_transformers):
function_name = config.sentence_transformers["activation_fn"]
elif (hasattr(config, "sbert_ce_default_activation_function")
and config.sbert_ce_default_activation_function is not None):
function_name = config.sbert_ce_default_activation_function
if function_name is not None:
assert function_name.startswith("torch.nn.modules."), (
"Loading of activation functions is restricted to "
"torch.nn.modules for security reasons")
fn = resolve_obj_by_qualname(function_name)()
return PoolerActivation.wraps(fn)
return PoolerScore()
def build_output(all_data: torch.Tensor) -> PoolerOutput:
all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
return PoolerOutput(outputs=all_outputs)
class BasePooler(nn.Module):
@abstractmethod
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
raise NotImplementedError
pooled_data = self.extract_states(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
pooled_outputs = [self.build_output(data) for data in pooled_data]
return PoolerOutput(outputs=pooled_outputs)
class PoolingMethod(nn.Module, ABC):
class CLSPool(SimplePooler):
@staticmethod
def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod":
if pooling_type == PoolingType.LAST:
return LastPool()
if pooling_type == PoolingType.ALL:
return AllPool()
if pooling_type == PoolingType.CLS:
return CLSPool()
if pooling_type == PoolingType.MEAN:
return MeanPool()
raise NotImplementedError(f"Unsupported method: {pooling_type}")
@abstractmethod
def forward_one(
self,
hidden_states: torch.Tensor,
prompt_len: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
Note:
`prompt_len=None` means `prompt_len=len(hidden_states)`.
"""
raise NotImplementedError
@abstractmethod
def forward_all(
self,
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
) -> Union[list[torch.Tensor], torch.Tensor]:
raise NotImplementedError
def forward(
def extract_states(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
if isinstance(hidden_states, list):
return [
self.forward_one(h, prompt_len)
for h, prompt_len in zip(hidden_states, prompt_lens)
]
result = []
for req_state, prompt_len in zip(hidden_states, prompt_lens):
assert prompt_len == req_state.shape[0], \
"partial prefill not supported with CLS pooling"
result.append(req_state[0])
return result
return self.forward_all(hidden_states, prompt_lens)
class CLSPool(PoolingMethod):
def forward_one(
self,
hidden_states: torch.Tensor,
prompt_len: Optional[torch.Tensor] = None,
) -> torch.Tensor:
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
"partial prefill not supported with CLS pooling"
return hidden_states[0]
def forward_all(
self,
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
) -> Union[list[torch.Tensor], torch.Tensor]:
first_token_flat_indices = torch.zeros_like(prompt_lens)
first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1]
return hidden_states[first_token_flat_indices]
class LastPool(PoolingMethod):
class LastPool(SimplePooler):
def forward_one(
def extract_states(
self,
hidden_states: torch.Tensor,
prompt_len: Optional[torch.Tensor] = None,
) -> torch.Tensor:
return hidden_states[-1]
def forward_all(
self,
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
if isinstance(hidden_states, list):
return [h[-1] for h in hidden_states]
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1
return hidden_states[last_token_flat_indices]
class AllPool(PoolingMethod):
class AllPool(SimplePooler):
def forward_one(
def extract_states(
self,
hidden_states: torch.Tensor,
prompt_len: Optional[torch.Tensor] = None,
) -> torch.Tensor:
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
"partial prefill not supported with ALL pooling"
return hidden_states
def forward_all(
self,
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
if isinstance(hidden_states, list):
for req_state, prompt_len in zip(hidden_states, prompt_lens):
assert prompt_len == req_state.shape[0], \
"partial prefill not supported with ALL pooling"
return hidden_states
offset = 0
pooled_data = list[torch.Tensor]()
for prompt_len in prompt_lens:
pooled_data.append(hidden_states[offset:offset + prompt_len])
offset += prompt_len
@ -233,23 +172,24 @@ class AllPool(PoolingMethod):
return pooled_data
class MeanPool(PoolingMethod):
class MeanPool(SimplePooler):
def forward_one(
def extract_states(
self,
hidden_states: torch.Tensor,
prompt_len: Optional[torch.Tensor] = None,
) -> torch.Tensor:
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
"partial prefill not supported with MEAN pooling"
return hidden_states.mean(dim=0, dtype=torch.float32)
def forward_all(
self,
hidden_states: torch.Tensor,
prompt_lens: torch.Tensor,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
if isinstance(hidden_states, list):
result = []
for req_state, prompt_len in zip(hidden_states, prompt_lens):
assert prompt_len == req_state.shape[0], \
"partial prefill not supported with mean pooling"
result.append(torch.mean(req_state, dim=0,
dtype=torch.float32))
return result
# Use float32 for torch.cumsum in MeanPool,
# otherwise precision will be lost significantly.
cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32)
@ -263,127 +203,78 @@ class MeanPool(PoolingMethod):
hidden_states[start_indices]) / prompt_lens.unsqueeze(1)
_T = TypeVar("_T", torch.Tensor, list[torch.Tensor])
class StepPool(SimplePooler):
def __init__(
self,
*,
normalize: bool,
softmax: bool,
step_tag_id: Optional[int] = None,
returned_token_ids: Optional[list[int]] = None,
):
super().__init__(normalize=normalize, softmax=softmax)
class BasePoolerActivation(nn.Module, ABC):
self.step_tag_id = step_tag_id
self.returned_token_ids = returned_token_ids
@abstractmethod
def forward(self, pooled_data: _T) -> _T:
# shape:
# classify (& score) -> (batch_size, num_classes)
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
# (batch_size, dimensions) or list(dimensions) if using MRL
raise NotImplementedError
def get_prompt_token_ids(
self,
pooling_metadata: PoolingMetadata,
) -> list[torch.Tensor]:
if isinstance(pooling_metadata, V1PoolingMetadata):
return [
pooling_metadata.prompt_token_ids[i, :num]
for i, num in enumerate(pooling_metadata.prompt_lens)
]
return [
torch.tensor(seq_data_i.prompt_token_ids)
for seq_data_i in pooling_metadata.seq_data.values()
]
def extract_states(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
prompt_token_ids = self.get_prompt_token_ids(pooling_metadata)
class PoolerActivation(BasePoolerActivation):
pooled_data_lst = list[torch.Tensor]()
if isinstance(hidden_states, list):
for req_state, prompt_len in zip(hidden_states, prompt_lens):
assert prompt_len == req_state.shape[0], \
"partial prefill not supported with step pooling"
pooled_data_lst = hidden_states
else:
offset = 0
for prompt_len in prompt_lens:
pooled_data_i = hidden_states[offset:offset + prompt_len]
offset += prompt_len
pooled_data_lst.append(pooled_data_i)
@staticmethod
def wraps(module: nn.Module):
if isinstance(module, nn.Identity):
return PoolerIdentity()
if isinstance(module, (nn.Sigmoid, nn.Softmax)):
return PoolerClassify()
pooled_data = list[torch.Tensor]()
returned_token_ids = self.returned_token_ids
step_tag_id = self.step_tag_id
return LambdaPoolerActivation(module)
for data, token_id in zip(pooled_data_lst, prompt_token_ids):
if returned_token_ids is not None and len(returned_token_ids) > 0:
data = data[:, returned_token_ids]
@abstractmethod
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
raise NotImplementedError
def forward(self, pooled_data: _T) -> _T:
if isinstance(pooled_data, list):
return [self.forward_chunk(data) for data in pooled_data]
return self.forward_chunk(pooled_data)
class PoolerIdentity(PoolerActivation):
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
return pooled_data
class PoolerNormalize(PoolerActivation):
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
x = F.normalize(pooled_data.float(), p=2, dim=-1)
return x.to(pooled_data.dtype)
class PoolerClassify(PoolerActivation):
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
num_labels = pooled_data.shape[-1]
if num_labels < 2:
return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
return F.softmax(pooled_data.float(), dim=-1).to(pooled_data.dtype)
class PoolerScore(PoolerActivation):
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
num_labels = pooled_data.shape[-1]
if num_labels < 2:
return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
if step_tag_id is not None:
data = data[token_id == step_tag_id]
pooled_data.append(data)
return pooled_data
class LambdaPoolerActivation(PoolerActivation):
def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
super().__init__()
self.fn = fn
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
return self.fn(pooled_data)
class PoolerHead(nn.Module):
@classmethod
def from_config_with_defaults(
cls,
pooler_config: PoolerConfig,
pooling_type: PoolingType,
normalize: bool,
softmax: bool,
) -> "PoolerHead":
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=pooling_type,
normalize=normalize,
softmax=softmax,
step_tag_id=None,
returned_token_ids=None,
)
return cls.from_config(resolved_config)
@classmethod
def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "PoolerHead":
if pooler_config.normalize and pooler_config.softmax:
raise ValueError("`normalize=True` and `softmax=True` should not "
"be set together")
activation: PoolerActivation
if pooler_config.normalize:
activation = PoolerNormalize()
elif pooler_config.softmax:
activation = PoolerClassify()
else:
activation = PoolerIdentity()
return cls(activation)
def __init__(self, activation: PoolerActivation) -> None:
def __init__(self, *, normalize: bool, softmax: bool) -> None:
super().__init__()
self.activation = activation
self.normalize = normalize
self.softmax = softmax
def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
pooling_metadata: PoolingMetadata):
@ -421,169 +312,60 @@ class PoolerHead(nn.Module):
for vecs, d in zip(pooled_data, dimensions_list)
]
return self.activation(pooled_data)
if self.normalize:
if isinstance(pooled_data, list):
pooled_data = [
F.normalize(data, p=2, dim=-1) for data in pooled_data
]
else:
pooled_data = F.normalize(pooled_data, p=2, dim=-1)
if self.softmax:
if isinstance(pooled_data, list):
pooled_data = [
F.softmax(data, dim=-1)
if data.shape[-1] >= 2 else F.sigmoid(data)
for data in pooled_data
]
else:
if pooled_data.shape[-1] >= 2:
pooled_data = F.softmax(pooled_data, dim=-1)
else:
pooled_data = F.sigmoid(pooled_data)
class SimplePooler(BasePooler):
"""A layer that pools specific information from hidden states.
This layer does the following:
1. Extracts specific tokens or aggregates data based on pooling method.
2. Normalizes output if specified.
3. Returns structured results as `PoolerOutput`.
Attributes:
pooling_type: The type of pooling to use.
normalize: Whether to normalize the pooled data.
"""
@classmethod
def from_config_with_defaults(
cls,
pooler_config: PoolerConfig,
pooling_type: PoolingType,
normalize: bool,
softmax: bool,
) -> "SimplePooler":
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=pooling_type,
normalize=normalize,
softmax=softmax,
)
assert resolved_config.pooling_type != PoolingType.STEP
return cls.from_config(resolved_config)
@classmethod
def from_config(
cls,
pooler_config: ResolvedPoolingConfig,
) -> "SimplePooler":
pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type)
head = PoolerHead.from_config(pooler_config)
return cls(pooling, head)
def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
super().__init__()
self.pooling = pooling
self.head = head
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.pooling(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
class StepPooler(BasePooler):
@classmethod
def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "StepPooler":
assert pooler_config.pooling_type == PoolingType.STEP
return cls(
PoolerHead.from_config(pooler_config),
step_tag_id=pooler_config.step_tag_id,
returned_token_ids=pooler_config.returned_token_ids,
)
def __init__(
self,
head: PoolerHead,
*,
step_tag_id: Optional[int] = None,
returned_token_ids: Optional[list[int]] = None,
) -> None:
super().__init__()
self.pooling = AllPool()
self.head = head
self.step_tag_id = step_tag_id
self.returned_token_ids = returned_token_ids
def get_prompt_token_ids(
self,
pooling_metadata: PoolingMetadata,
) -> list[torch.Tensor]:
if isinstance(pooling_metadata, V1PoolingMetadata):
return [
pooling_metadata.prompt_token_ids[i, :num]
for i, num in enumerate(pooling_metadata.prompt_lens)
]
return [
torch.tensor(seq_data_i.prompt_token_ids)
for seq_data_i in pooling_metadata.seq_data.values()
]
def extract_states(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[torch.Tensor], torch.Tensor]:
pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
prompt_token_ids = self.get_prompt_token_ids(pooling_metadata)
pooled_data = list[torch.Tensor]()
returned_token_ids = self.returned_token_ids
step_tag_id = self.step_tag_id
for data, token_id in zip(pooled_data_lst, prompt_token_ids):
if returned_token_ids is not None and len(returned_token_ids) > 0:
data = data[:, returned_token_ids]
if step_tag_id is not None:
data = data[token_id == step_tag_id]
pooled_data.append(data)
# shape:
# classify (& score) -> (batch_size, num_classes)
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
# (batch_size, dimensions) or list(dimensions) if using MRL
return pooled_data
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.extract_states(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
class Pooler(nn.Module):
@staticmethod
@classmethod
def from_config_with_defaults(
cls,
pooler_config: PoolerConfig,
pooling_type: PoolingType,
normalize: bool,
softmax: bool,
step_tag_id: Optional[int] = None,
returned_token_ids: Optional[list[int]] = None,
) -> BasePooler:
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
pooler_config=pooler_config,
pooling_type=pooling_type,
normalize=normalize,
softmax=softmax,
step_tag_id=step_tag_id,
returned_token_ids=returned_token_ids,
) -> SimplePooler:
return SimplePooler.from_pooling_type(
pooling_type=PoolingType[pooler_config.pooling_type]
if pooler_config.pooling_type is not None else pooling_type,
normalize=pooler_config.normalize
if pooler_config.normalize is not None else normalize,
softmax=pooler_config.softmax
if pooler_config.softmax is not None else softmax,
step_tag_id=pooler_config.step_tag_id
if pooler_config.step_tag_id is not None else step_tag_id,
returned_token_ids=pooler_config.returned_token_ids
if pooler_config.returned_token_ids is not None else
returned_token_ids,
)
if pooling_type == PoolingType.STEP:
return StepPooler.from_config(resolved_config)
return SimplePooler.from_config(resolved_config)
PoolingFn = Callable[
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
Union[torch.Tensor, list[torch.Tensor]]]
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
class ClassifierPooler(nn.Module):
"""A pooling layer for classification tasks.
@ -600,39 +382,69 @@ class ClassifierPooler(nn.Module):
def __init__(
self,
config: ModelConfig,
pooling: PoolingFn,
classifier: ClassifierFn,
act_fn: Optional[PoolerActivation] = None,
) -> None:
classifier: nn.Module,
pooler: Optional[nn.Module] = None,
):
super().__init__()
self.pooling = pooling
self.classifier = classifier
self.pooler = pooler
self.classification_act_fn = get_classification_activation_function(
config.hf_config) if act_fn is None else act_fn
config.hf_config)
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
config.hf_config) if act_fn is None else act_fn
config.hf_config)
def _get_act_fn(self, use_cross_encoder: bool):
return (self.cross_encoder_act_fn
if use_cross_encoder else self.classification_act_fn)
def get_prompt_lens(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> torch.Tensor:
if isinstance(pooling_metadata, V1PoolingMetadata):
return pooling_metadata.prompt_lens
assert isinstance(hidden_states, torch.Tensor)
return PoolingTensors.from_pooling_metadata(
pooling_metadata, hidden_states.device).prompt_lens
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
"""Pools sentence pair scores from the hidden_states."""
pooled_data = self.pooling(hidden_states, pooling_metadata)
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
# apply classifier once on the full batch if possible
if isinstance(pooled_data, torch.Tensor):
pooled_output = self.classifier(pooled_data)
elif len({data.shape for data in pooled_data}) <= 1:
pooled_output = self.classifier(torch.stack(pooled_data))
pooled_data = list[torch.Tensor]()
if isinstance(hidden_states, list):
for req_state, prompt_len in zip(hidden_states, prompt_lens):
assert prompt_len == req_state.shape[0], \
"partial prefill not supported with classifier"
pooled_data = hidden_states
else:
pooled_output = [self.classifier(data) for data in pooled_data]
offset = 0
for prompt_len in prompt_lens:
pooled_data_i = hidden_states[offset:offset + prompt_len]
offset += prompt_len
pooled_data.append(pooled_data_i)
pooled_data_lst = []
for pooled_data_i in pooled_data:
if self.pooler is not None:
final_shape_tensor = self.pooler(pooled_data_i)
else:
final_shape_tensor = self.classifier(pooled_data_i)
pooled_data_lst.append(final_shape_tensor)
pooled_output = torch.stack(pooled_data_lst)
if self.pooler is not None:
# apply classifier once on the full batch if possible
pooled_output = self.classifier(pooled_output)
if isinstance(pooling_metadata, V0PoolingMetadata):
use_cross_encoder_list = [
@ -657,4 +469,5 @@ class ClassifierPooler(nn.Module):
pooled_output)
])
return build_output(scores)
pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
return PoolerOutput(outputs=pooled_outputs)
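The MeanPool path above pins torch.cumsum to float32 (low-precision accumulation over long prompts loses accuracy) and recovers per-prompt sums from cumsum differences over the flattened batch. A self-contained sketch of that trick with illustrative names, not the vLLM classes:

import torch

def mean_pool_flat(hidden_states: torch.Tensor,
                   prompt_lens: torch.Tensor) -> torch.Tensor:
    # Hidden states of all requests are flattened along dim 0.
    # float32 cumsum: summing many bf16/fp16 rows in low precision
    # loses significant accuracy, hence the explicit dtype.
    cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32)
    end_indices = torch.cumsum(prompt_lens, dim=0) - 1   # last row of each prompt
    start_indices = end_indices - prompt_lens + 1        # first row of each prompt
    totals = (cumsum[end_indices] - cumsum[start_indices]
              + hidden_states[start_indices].float())
    return totals / prompt_lens.unsqueeze(1)

h = torch.randn(10, 4)
lens = torch.tensor([6, 4])
expected = torch.stack([h[:6].mean(0), h[6:].mean(0)])
assert torch.allclose(mean_pool_flat(h, lens), expected, atol=1e-5)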

View File

@ -36,7 +36,6 @@ QuantizationMethods = Literal[
"torchao",
"auto-round",
"rtn",
"inc",
]
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
@ -105,7 +104,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from .gptq_marlin import GPTQMarlinConfig
from .gptq_marlin_24 import GPTQMarlin24Config
from .hqq_marlin import HQQMarlinConfig
from .inc import INCConfig
from .ipex_quant import IPEXConfig
from .marlin import MarlinConfig
from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
@ -146,8 +144,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"moe_wna16": MoeWNA16Config,
"torchao": TorchAOConfig,
"auto-round": AutoRoundConfig,
"rtn": RTNConfig,
"inc": INCConfig,
"rtn": RTNConfig
}
# Update the `method_to_config` with customized quantization methods.
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
@ -160,4 +157,4 @@ __all__ = [
"QuantizationMethods",
"get_quantization_config",
"QUANTIZATION_METHODS",
]
]

View File

@ -929,8 +929,10 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias)
per_act_token = (
self.input_quant.strategy == QuantizationStrategy.TOKEN)
a1_scale = layer.w13_input_scale
a2_scale = layer.w2_input_scale
per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
a2_scale.numel() != 1 if a2_scale is not None else False)
if self.fused_experts is None:
# If no modular kernel is provided, use cutlass_moe_fp8
@ -948,8 +950,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
expert_map=None if self.disable_expert_map else expert_map,
w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale,
a1_scale=layer.w13_input_scale,
a2_scale=layer.w2_input_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
)
else:
return self.fused_experts(
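The hunk above now infers per_act_token from the shape of the input scales rather than from the quantization strategy: per-token activation quantization carries one scale per token, while per-tensor quantization has exactly one element. A hedged sketch of that check with an illustrative function name, not the vLLM method:

import torch

def infer_per_act_token(a1_scale, a2_scale) -> bool:
    # More than one scale element implies per-token scales; a single
    # element implies a per-tensor scale.  Falls back to False when no
    # input scales are present.
    if a1_scale is not None:
        return a1_scale.numel() != 1
    if a2_scale is not None:
        return a2_scale.numel() != 1
    return False

print(infer_per_act_token(torch.ones(128, 1), None))  # True  (per-token)
print(infer_per_act_token(torch.ones(()), None))      # False (per-tensor)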

View File

@ -1,61 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Intel Gaudi supports quantization of various modules and functions,
# including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`.
# During model loading,
# INC will patch layers with quantization/dequantization operators.
# Meanwhile, INC will convert the original weights to the target datatype
# and load them onto the target device.
# Static scaling should be provided through QUANT_CONFIG:
# `QUANT_CONFIG` is an environment variable,
# that points to the measurement or quantization JSON config file.
# The measurement configuration file is used during the calibration procedure,
# to collect measurements for a given model.
# The quantization configuration is used during inference.
# For more information, please refer to:
# https://docs.habana.ai/en/v1.21.1/PyTorch/vLLM_Inference/vLLM_FP8_Inference.html
from typing import Any, Optional
import torch
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
class INCConfig(QuantizationConfig):
"""Config class for FP8 using Intel Neural Compressor."""
@classmethod
def get_name(cls) -> QuantizationMethods:
return "inc"
@classmethod
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
return [torch.bfloat16]
@classmethod
def from_config(cls, config: dict[str, Any]) -> "INCConfig":
raise AssertionError
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]:
if isinstance(layer, LinearBase):
return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE):
return UnquantizedFusedMoEMethod(layer.moe_config)
return None
@classmethod
def get_min_capability(cls) -> int:
raise AssertionError
@staticmethod
def get_config_filenames() -> list[str]:
return []
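As the header comments of this now-removed file explain, INC is driven by the QUANT_CONFIG environment variable, which points at a measurement or quantization JSON file. A minimal sketch of wiring that up before constructing the engine; the path is hypothetical and the JSON schema comes from the Habana documentation linked above, not from this diff:

import os

# Hypothetical path; generate the JSON via the INC calibration flow first.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_quant.json"

# The engine would then be created with quantization="inc" on Gaudi
# (prior to this change, which drops the in-tree INC config).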

View File

@ -378,6 +378,8 @@ def per_token_group_quant_fp8(
is supported for now.
column_major_scales: Outputs scales in column major.
out_q: Optional output tensor. If not provided, function will create.
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
scaling factor for quantization.
Returns:
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
scaling factor.
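per_token_group_quant_fp8 returns the quantized tensor together with one scale per group of elements. A simplified, framework-free sketch of per-group quantization; illustrative names only, and the actual fp8 cast is omitted:

import torch

def per_group_quant(x: torch.Tensor, group_size: int,
                    max_repr: float = 448.0):
    # Each contiguous group of `group_size` elements along the last dim
    # shares one scale derived from its absolute maximum.
    xg = x.reshape(*x.shape[:-1], -1, group_size)
    scales = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / max_repr
    q = (xg / scales).reshape(x.shape)        # cast to fp8 would happen here
    return q, scales.squeeze(-1)

x = torch.randn(2, 8)
q, s = per_group_quant(x, group_size=4)
print(q.shape, s.shape)   # torch.Size([2, 8]) torch.Size([2, 2])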

View File

@ -6,12 +6,9 @@ import torch
import torch.nn as nn
from vllm.config import LoadConfig, ModelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader.utils import (
initialize_model, process_weights_after_loading, set_default_torch_dtype)
logger = init_logger(__name__)
class BaseModelLoader(ABC):
"""Base class for model loaders."""
@ -35,16 +32,11 @@ class BaseModelLoader(ABC):
model_config: ModelConfig) -> nn.Module:
"""Load a model with the given configurations."""
device_config = vllm_config.device_config
load_config = vllm_config.load_config
load_device = device_config.device if load_config.device is None else \
load_config.device
target_device = torch.device(load_device)
target_device = torch.device(device_config.device)
with set_default_torch_dtype(model_config.dtype):
with target_device:
model = initialize_model(vllm_config=vllm_config,
model_config=model_config)
logger.debug("Loading weights on %s ...", load_device)
# Quantization does not happen in `load_weights` but after it
self.load_weights(model, model_config)
process_weights_after_loading(model, model_config, target_device)

View File

@ -152,8 +152,8 @@ def get_quant_config(model_config: ModelConfig,
quant_cls = get_quantization_config(model_config.quantization)
# GGUF doesn't have a config file
if model_config.quantization in ("gguf", "inc"):
return quant_cls()
if model_config.quantization == "gguf":
return quant_cls.from_config({})
# Read the quantization config from the HF model config, if available.
hf_quant_config = getattr(model_config.hf_config, "quantization_config",

View File

@ -58,27 +58,22 @@ def _create_pooling_model_cls(
) -> None:
super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
self.vllm_config = vllm_config
# These are not used in pooling models
for attr in ("lm_head", "logits_processor"):
if hasattr(self, attr):
delattr(self, attr)
# If the model already defines a pooler instance, don't overwrite it
if not getattr(self, "_pooler", None):
self._init_pooler(vllm_config, prefix=prefix)
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self._pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
)
# If the model already defines a pooler instance, don't overwrite it
if not getattr(self, "_pooler", None):
self._pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=default_pooling_type,
normalize=default_normalize,
softmax=default_softmax,
)
def pooler(
self,
@ -170,9 +165,7 @@ def as_seq_cls_model(cls: _T) -> _T:
# Lazy import
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.pooler import (ClassifierPooler,
PoolerOutput, PoolingType,
SimplePooler)
from vllm.model_executor.layers.pooler import PoolerOutput, PoolingType
from vllm.model_executor.models.interfaces import SupportsCrossEncoding
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors
@ -189,40 +182,30 @@ def as_seq_cls_model(cls: _T) -> _T:
class ModelForSequenceClassification(ModelForPooling,
SupportsCrossEncoding):
def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
def __init__(
self,
*,
vllm_config: "VllmConfig",
prefix: str = "",
**kwargs: Any,
) -> None:
super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.score = RowParallelLinear(
config.hidden_size,
config.num_labels,
input_is_parallel=False,
bias=False,
params_dtype=torch.float32,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "score"),
)
self.vllm_config = vllm_config
self.task = vllm_config.model_config.task
self.pooling_type = (
vllm_config.model_config.pooler_config.pooling_type)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
pooler = SimplePooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=True,
)
self._pooler = ClassifierPooler(
vllm_config.model_config,
pooling=pooler.pooling,
classifier=self._classifier,
act_fn=pooler.head.activation,
)
def _classifier(self, x: torch.Tensor):
x, _ = self.score(x.float())
return x
self.score = RowParallelLinear(config.hidden_size,
config.num_labels,
quant_config=quant_config,
input_is_parallel=False,
bias=False,
prefix=maybe_prefix(
prefix, "score"))
def forward(
self,
@ -239,7 +222,27 @@ def as_seq_cls_model(cls: _T) -> _T:
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
return self._pooler(hidden_states, pooling_metadata)
def get_logits(hidden_states):
if isinstance(hidden_states, list):
logits = [self.score(state)[0] for state in hidden_states]
else:
logits, _ = self.score(hidden_states)
return logits
if self.pooling_type == PoolingType.ALL:
logits = get_logits(hidden_states)
return self._pooler(logits, pooling_metadata)
else:
hidden_states = self._pooler.extract_states(
hidden_states, pooling_metadata)
logits = get_logits(hidden_states)
pooled_data = self._pooler.head(logits, pooling_metadata)
pooled_outputs = [
self._pooler.build_output(data) for data in pooled_data
]
return PoolerOutput(outputs=pooled_outputs)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
tokens = getattr(self.config, "classifier_from_token", None)
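The new pooler above applies the score head differently depending on the pooling type: with ALL pooling every token's hidden state is scored, otherwise the sequence is pooled first and only the pooled state is scored. A simplified sketch under those assumptions, with illustrative names and last-token pooling hard-coded:

import torch

def classify(hidden_states: torch.Tensor, score: torch.nn.Linear,
             prompt_lens: torch.Tensor, pool_all: bool):
    if pool_all:
        # ALL pooling: score every token, keep per-token logits per request.
        return [score(h) for h in hidden_states.split(prompt_lens.tolist())]
    # Otherwise pool first (last token here), then score the pooled state.
    last_indices = torch.cumsum(prompt_lens, dim=0) - 1
    return score(hidden_states[last_indices])

score = torch.nn.Linear(16, 2)
h = torch.randn(10, 16)
lens = torch.tensor([6, 4])
print(classify(h, score, lens, pool_all=False).shape)  # torch.Size([2, 2])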

View File

@ -47,6 +47,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@ -484,6 +485,7 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
else:
self.lm_head = PPMissingLayer()
self.sampler = get_sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
@ -510,6 +512,14 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
sampling_metadata)
return logits
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Optional, Union
from typing import Optional
import torch
from torch import nn
@ -18,7 +18,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
PoolingMethod, PoolingType)
PoolingType)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
@ -84,18 +84,14 @@ class BertPooler(nn.Module):
def __init__(self, config: BertConfig):
super().__init__()
self.pooling = PoolingMethod.from_pooling_type(PoolingType.CLS)
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[torch.Tensor, list[torch.Tensor]]:
pooled_output = self.pooling(hidden_states, pooling_metadata)
pooled_output = self.dense(pooled_output)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[0, :]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
@ -476,11 +472,8 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
embedding_class=BertEmbedding,
add_pooling_layer=True)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self._pooler = ClassifierPooler(
vllm_config.model_config,
pooling=self.bert.pooler,
classifier=self.classifier,
)
self._pooler = ClassifierPooler(vllm_config.model_config,
self.classifier, self.bert.pooler)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader(self)
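BertPooler above pools by taking the hidden state of the first token and passing it through a dense layer with a tanh activation. A tiny standalone sketch of that head, not the vLLM class:

import torch
import torch.nn as nn

class TinyCLSPooler(nn.Module):
    # Dense + tanh over the first ([CLS]) token, mirroring BertPooler.

    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        first_token = hidden_states[0, :]
        return self.activation(self.dense(first_token))

pooled = TinyCLSPooler(16)(torch.randn(7, 16))
print(pooled.shape)  # torch.Size([16])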

View File

@ -205,19 +205,6 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
}
class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
config = vllm_config.model_config
config.max_seq_len_to_capture = config.max_model_len
logger.info(
"Setting max_seq_len_to_capture to %d "
"to ensure that CUDA graph capture "
"covers sequences of length up to max_model_len.",
config.max_model_len)
class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
@classmethod
@ -310,5 +297,4 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
"XLMRobertaModel": JinaRobertaModelConfig,
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
"GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig,
}

View File

@ -36,6 +36,7 @@ from vllm.config import CacheConfig, VllmConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -548,6 +549,7 @@ class GraniteSpeechForConditionalGeneration(
self.config = config
self.quant_config = quant_config
self.cache_config = cache_config
self.sampler = get_sampler()
# The language model is typically a Granite LLM
self.language_model = init_vllm_registered_model(

View File

@ -9,7 +9,7 @@ import torch.nn as nn
from vllm.config import ModelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.pooler import PoolerHead, PoolerNormalize
from vllm.model_executor.layers.pooler import PoolerHead
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.pooling_metadata import (PoolingMetadata,
PoolingTensors)
@ -49,7 +49,7 @@ class GritLMPooler(nn.Module):
self.embed_pattern_ids = tokens_to_ids(
["▁<", "|", "embed", "|", ">", "<0x0A>"])
self.head = PoolerHead(PoolerNormalize())
self.head = PoolerHead(normalize=True, softmax=False)
def _find_array(self, arr: array, target: array, start_idx: int) -> int:
"""

View File

@ -49,6 +49,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
@ -660,6 +661,7 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
config.vocab_size,
logit_scale)
self.sampler = get_sampler()
else:
self.lm_head = PPMissingLayer()
@ -683,6 +685,14 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
sampling_metadata)
return logits
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
def make_empty_intermediate_tensors(
self, batch_size: int, dtype: torch.dtype,
device: torch.device) -> IntermediateTensors:

View File

@ -22,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union
import torch
from torch import nn
from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
Idefics3Processor)
from transformers import (AddedToken, BatchFeature, Idefics3Config,
Idefics3ImageProcessor, Idefics3Processor)
from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import ReplicatedLinear
@ -199,14 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
return grid_w * grid_h + 1
# TODO: Remove after requiring transformers>=4.52
def _get_content(self, token: Union[AddedToken, str]) -> str:
if isinstance(token, str):
return token
return token.content
def _get_image_token(
self,
processor: Optional[Idefics3Processor]) -> tuple[str, str, str]:
if processor is None:
processor = self.get_hf_processor()
image_token = processor.image_token
fake_image_token = processor.fake_image_token
image_token = self._get_content(processor.image_token)
fake_image_token = self._get_content(processor.fake_image_token)
global_image_token = processor.global_image_tag
return image_token, fake_image_token, global_image_token
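The _get_content helper added above exists because, on some transformers versions, processor.image_token may be an AddedToken rather than a plain string. A small sketch of the same normalization:

from transformers import AddedToken

def get_content(token) -> str:
    # AddedToken wraps the literal string in `.content`; plain strings
    # pass through unchanged (same idea as the helper above).
    return token if isinstance(token, str) else token.content

print(get_content("<image>"))                                 # <image>
print(get_content(AddedToken("<image>", normalized=False)))   # <image>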

View File

@ -659,7 +659,7 @@ def supports_cross_encoding(
def has_step_pooler(model: Union[type[object], object]) -> bool:
"""Check if the model uses step pooler."""
return is_pooling_model(model) and any(
type(module).__name__ == "StepPooler" for module in model.modules())
type(module).__name__ == "StepPool" for module in model.modules())
class SupportsQuant:
@ -722,8 +722,7 @@ class SupportsTranscription(Protocol):
@classmethod
def get_generation_prompt(cls, audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig, language: str,
stt_config: SpeechToTextConfig, language: str,
task_type: str,
request_prompt: str) -> PromptType:
"""Get the prompt for the ASR model.

View File

@ -19,8 +19,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType,
SimplePooler)
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
@ -565,41 +564,29 @@ class JambaForSequenceClassification(JambaForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
config = vllm_config.model_config.hf_config
num_labels: int = config.num_labels
score_bias: bool = getattr(config, 'score_bias', False)
# TODO: The original reward weights have float32 accuracy data, we
# would like to load them in fp32 to get that extra precision.
# Currently weight_loader passes the weight which is already in bf16
self.score = nn.Linear(
config.hidden_size,
num_labels,
bias=score_bias,
dtype=torch.float32,
)
self.score = nn.Linear(config.hidden_size, num_labels, bias=score_bias)
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
pooler = SimplePooler.from_config_with_defaults(
self._pooler = Pooler.from_config_with_defaults(
pooler_config,
pooling_type=PoolingType.LAST,
normalize=False,
softmax=False,
)
self._pooler = ClassifierPooler(
vllm_config.model_config,
pooling=pooler.pooling,
classifier=self.score,
act_fn=pooler.head.activation,
)
softmax=False)
def pooler(
self,
hidden_states: torch.Tensor,
pooling_metadata: PoolingMetadata,
) -> Optional[PoolerOutput]:
return self._pooler(hidden_states, pooling_metadata)
hidden_states = hidden_states.float()
logits = self.score(hidden_states)
return self._pooler(logits, pooling_metadata)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
# TODO: The reward weights themselves have float32 accuracy data, we
# would like to load them in fp32 to get that extra precision.
super().load_weights(weights)
self.score = self.score.float()

View File

@ -1,214 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
# All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable
from typing import Optional
import torch
import torch.nn as nn
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.torchao import TorchAOConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama4 import (Llama4DecoderLayer,
Llama4ForCausalLM)
from vllm.model_executor.models.utils import extract_layer_index
from .utils import AutoWeightsLoader, maybe_prefix
logger = init_logger(__name__)
@support_torch_compile
class LlamaModel(nn.Module):
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
start_layer_id: int = 0,
quant_config: Optional[QuantizationConfig] = None,
) -> None:
super().__init__()
self.config = (
vllm_config.speculative_config.draft_model_config.hf_config)
self.validate_and_update_config(start_layer_id, quant_config)
self.vocab_size = self.config.vocab_size
self.embed_tokens = VocabParallelEmbedding(
self.config.vocab_size,
self.config.hidden_size,
prefix=maybe_prefix(prefix, "embed_tokens"),
)
self.layers = nn.ModuleList([
Llama4DecoderLayer(
self.config,
quant_config=quant_config,
prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
) for i in range(self.config.num_hidden_layers)
])
self.fc = torch.nn.Linear(self.config.hidden_size * 2,
self.config.hidden_size,
bias=False)
self.norm = RMSNorm(self.config.hidden_size,
eps=self.config.rms_norm_eps)
def forward(
self,
input_ids: Optional[torch.Tensor],
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
input_embeds = self.embed_tokens(input_ids)
hidden_states = self.fc(
torch.cat((input_embeds, hidden_states), dim=-1))
residual = None
for layer in self.layers:
hidden_states, residual = layer(
positions,
hidden_states,
residual,
)
hidden_states, _ = self.norm(hidden_states, residual)
return hidden_states, hidden_states
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
(".qkv_proj", ".q_proj", "q"),
(".qkv_proj", ".k_proj", "k"),
(".qkv_proj", ".v_proj", "v"),
(".gate_up_proj", ".gate_proj", 0),
(".gate_up_proj", ".up_proj", 1),
]
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
for name, loaded_weight in weights:
name = name.removeprefix("model.")
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# if PP disabled then draft will share embed with target
if get_pp_group().world_size == 1 and \
"embed_tokens." in name:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
for name in params_dict:
# if PP disabled then draft will share embed with target
if get_pp_group().world_size == 1 and \
"embed_tokens." in name:
continue
assert name in loaded_params, f"{name} is not loaded!"
return loaded_params
def validate_and_update_config(
self,
start_layer_id: int,
quant_config: Optional[QuantizationConfig] = None) -> None:
# yoco and moe are not supported by the draft model yet
assert self.config.yoco_global_kv_layer is None
assert self.config.yoco_local_kv_layer is None
assert len(self.config.moe_layers) == 0
# draft model layer index is increased by start_layer_id,
# so we need to pad relevant configs accordingly
self.config.no_rope_layers = [
0
] * start_layer_id + self.config.no_rope_layers
# currently only TorchAO quantization is supported
if isinstance(quant_config, TorchAOConfig):
def pad_layer_name(layer: str) -> str:
layer_index = extract_layer_index(layer)
return layer.replace(str(layer_index),
str(layer_index + start_layer_id))
quant_config.torchao_config.module_fqn_to_config = {
pad_layer_name(layer): quantization
for layer, quantization in
quant_config.torchao_config.module_fqn_to_config.items()
}
class EagleLlama4ForCausalLM(Llama4ForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
nn.Module.__init__(self)
self.config = (
vllm_config.speculative_config.draft_model_config.hf_config)
target_layer_num = vllm_config.model_config.get_num_layers(
vllm_config.parallel_config)
# draft model quantization config may differ from target model
quant_config = VllmConfig.get_quantization_config(
vllm_config.speculative_config.draft_model_config,
vllm_config.load_config)
self.model = LlamaModel(vllm_config=vllm_config,
prefix="model",
start_layer_id=target_layer_num,
quant_config=quant_config)
logit_scale = getattr(self.config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.config.vocab_size,
scale=logit_scale)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
hidden_states: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
return self.model(input_ids, positions, hidden_states)
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> None:
loader = AutoWeightsLoader(
self,
# lm_head is tied with target model (Llama4ForCausalLM)
skip_prefixes=(["lm_head."]),
)
model_weights = {}
weights = [
self.permute_qk_weight_for_rotary(name, loaded_weight)
for name, loaded_weight in weights
]
for name, loaded_weight in weights:
if "lm_head" not in name:
name = "model." + name
model_weights[name] = loaded_weight
loader.load_weights(model_weights.items())
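load_weights above folds separate q/k/v (and gate/up) checkpoint tensors into fused parameters via stacked_params_mapping. A minimal sketch of just the name-rewriting step; copying each shard into the fused weight is omitted:

stacked_params_mapping = [
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
]

def remap(name: str):
    # Rewrite a per-projection checkpoint name to its fused parameter name
    # and report which shard ("q", "k" or "v") the tensor should fill.
    for fused, shard, shard_id in stacked_params_mapping:
        if shard in name:
            return name.replace(shard, fused), shard_id
    return name, None

print(remap("layers.0.self_attn.k_proj.weight"))
# ('layers.0.self_attn.qkv_proj.weight', 'k')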

View File

@ -36,6 +36,7 @@ from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, maybe_remap_kv_scale_name)
@ -175,6 +176,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
self.lm_head = PPMissingLayer()
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = get_sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)

View File

@ -30,6 +30,7 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@ -160,6 +161,8 @@ class MiMoMTP(nn.Module):
self.lm_head = ParallelLMHead(self.config.vocab_size,
self.config.hidden_size)
self.sampler = get_sampler()
def forward(
self,
input_ids: torch.Tensor,
@ -184,6 +187,14 @@ class MiMoMTP(nn.Module):
return self.model.compute_logits(hidden_states, self.lm_head,
sampling_metadata, spec_step_idx)
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
stacked_params_mapping = [

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Optional, Union
from typing import Optional
import torch
from torch import nn
@ -13,8 +13,7 @@ from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.pooler import (BasePooler, ClassifierPooler,
PoolingMethod, PoolingType)
from vllm.model_executor.layers.pooler import ClassifierPooler
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
@ -253,13 +252,10 @@ class ModernBertModel(nn.Module):
return norm_outputs
class ModernBertPooler(BasePooler):
class ModernBertPooler(nn.Module):
def __init__(self, config: ModernBertConfig):
super().__init__()
pooling_type = PoolingType[config.classifier_pooling.upper()]
self.pooling = PoolingMethod.from_pooling_type(pooling_type)
self.dense = nn.Linear(config.hidden_size, config.hidden_size,
config.classifier_bias)
self.pooling_type = config.classifier_pooling
@ -268,12 +264,15 @@ class ModernBertPooler(BasePooler):
eps=config.norm_eps,
bias=config.norm_bias)
def forward(
self,
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[torch.Tensor, list[torch.Tensor]]:
pooled_output = self.pooling(hidden_states, pooling_metadata)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
pooled_output = hidden_states
if self.pooling_type == "mean":
pooled_output = pooled_output.mean(dim=0, keepdim=False)
elif self.pooling_type == "cls":
pooled_output = pooled_output[0, :]
else:
raise ValueError("Pooling type should be either `cls` or `mean`, "
f"but got {self.pooling_type}")
pooled_output = self.norm(self.act(self.dense(pooled_output)))
return pooled_output
@ -288,11 +287,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only,
self.model = ModernBertModel(vllm_config=vllm_config,
prefix=maybe_prefix(prefix, "modernbert"))
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self._pooler = ClassifierPooler(
vllm_config.model_config,
pooling=ModernBertPooler(config),
classifier=self.classifier,
)
self._pooler = ClassifierPooler(vllm_config.model_config,
self.classifier,
ModernBertPooler(config))
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):

Some files were not shown because too many files have changed in this diff.