Mirror of https://github.com/vllm-project/vllm.git
Synced 2025-10-21 07:13:52 +08:00

Compare commits (1 commit): codex/chan... benchmark

Author | SHA1 | Date
---|---|---
 | b6381ced9c |
@@ -6,17 +6,19 @@ set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest
FROM 1.22-413-pt2.7.1:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements/hpu.txt
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
@@ -645,7 +645,7 @@ steps:
    optional: true
    commands:
      - pip install --upgrade git+https://github.com/huggingface/transformers
      - pytest -v -s tests/models/test_initialization.py
      - pytest -v -s models/test_initialization.py
      - pytest -v -s tests/models/multimodal/processing/
      - pytest -v -s tests/models/multimodal/test_mapping.py
      - python3 examples/offline_inference/basic/chat.py
@@ -1,6 +0,0 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
  comment_severity_threshold: HIGH # Reduce quantity of comments
  pull_request_opened:
    summary: false # Don't summarize the PR in a separate comment
@@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
  rev: v1.34.0
  rev: v1.32.0
  hooks:
  - id: typos
- repo: https://github.com/PyCQA/isort
@@ -30,11 +30,17 @@ from datasets import load_dataset
from PIL import Image
from transformers import PreTrainedTokenizerBase

from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
try:
    from vllm.lora.request import LoRARequest
    from vllm.lora.utils import get_adapter_absolute_path
    from vllm.multimodal import MultiModalDataDict
    from vllm.multimodal.image import convert_image_mode
    from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
except:
    MultiModalDataDict = None
    AnyTokenizer = None
    LoRARequest = None
    print("Install vLLM to use LoRA or Multimodal benchmarking.")

logger = logging.getLogger(__name__)
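The hunk above wraps the optional vLLM imports in a try/except so the benchmark script can still be imported without vLLM installed. A minimal sketch of that guarded-import pattern (module and symbol names follow the hunk; the bare `except` is narrowed to `ImportError` here purely for illustration):

```python
# Sketch of the optional-dependency import guard used by the benchmark script.
# Falling back to None lets the rest of the module import cleanly; callers
# that need these symbols should check for None before using them.
try:
    from vllm.lora.request import LoRARequest
    from vllm.multimodal import MultiModalDataDict
    from vllm.transformers_utils.tokenizer import AnyTokenizer
except ImportError:  # vLLM is not installed
    LoRARequest = None          # type: ignore[assignment]
    MultiModalDataDict = None   # type: ignore[assignment]
    AnyTokenizer = None         # type: ignore[assignment]
    print("Install vLLM to use LoRA or Multimodal benchmarking.")
```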
@@ -24,7 +24,6 @@

#include "attention_dtypes.h"
#include "attention_utils.cuh"
#include "cuda_compat.h"

#ifdef USE_ROCM
#include <hip/hip_bf16.h>
@@ -34,6 +33,12 @@ typedef __hip_bfloat16 __nv_bfloat16;
#include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@@ -665,6 +670,7 @@ __global__ void paged_attention_v2_reduce_kernel(

}  // namespace vllm

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
@@ -18,7 +18,6 @@ limitations under the License.
 * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
 * by Alcanderian JieXin Liang
 */
#include "core/registration.h"

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
@@ -271,13 +270,4 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
}

#endif

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
}

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
  m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
}

// clang-format on
@@ -18,7 +18,12 @@
 */

#include "attention_kernels.cuh"
#include "cuda_compat.h"

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -182,6 +187,7 @@ void paged_attention_v1(
  CALL_V1_LAUNCHER_BLOCK_SIZE)
}

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
@@ -18,7 +18,12 @@
 */

#include "attention_kernels.cuh"
#include "cuda_compat.h"

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -192,6 +197,7 @@ void paged_attention_v2(
  CALL_V2_LAUNCHER_BLOCK_SIZE)
}

#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
@@ -58,7 +58,7 @@ namespace {

#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \
  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")

#define CHECK_INPUT(x) \
  CHECK_CPU(x); \
@@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
    int64_t topk,
    int64_t num_tokens_post_pad);

// shared expert implementation for int8 w8a8
// shared expert implememntation for int8 w8a8
template <typename scalar_t>
void shared_expert_int8_kernel_impl(
    scalar_t* __restrict__ output,
@@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
  __m512 vd0;
  __m512 vd1[COLS];

  // oops! 4x4 spills but luckily we use 4x2
  // oops! 4x4 spills but luckly we use 4x2
  __m512 vbias[COLS];

  // [NOTE]: s8s8 igemm compensation in avx512-vnni
@@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
#define CVT_FP16_TO_FP32(a) \
  _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

// this doesn't handle NaN.
// this doesn't hanel NaN.
inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
@@ -4,10 +4,10 @@
#include <hip/hip_runtime.h>
#endif

#if defined(USE_ROCM) && defined(__GFX9__)
#define WARP_SIZE 64
#else
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#ifndef USE_ROCM
csrc/ops.h (13 lines changed)

@@ -167,6 +167,19 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                        torch::Tensor const& seq_lens,
                        torch::Tensor const& page_table, double scale);

void sm100_cutlass_mla_decode(
    torch::Tensor const& out, torch::Tensor const& q_nope,
    torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache,
    torch::Tensor const& seq_lens, torch::Tensor const& page_table,
    torch::Tensor const& workspace, double sm_scale,
    int64_t num_kv_splits =
        1 /* Set to 1 to avoid cuda_graph issue by default. */);

int64_t sm100_cutlass_mla_get_workspace_size(
    int64_t max_seq_len, int64_t num_batches, int64_t sm_count = 0,
    int64_t num_kv_splits =
        1 /* Set to 1 to avoid cuda_graph issue by default. */);

torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);

#ifndef USE_ROCM
@@ -521,14 +521,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      " Tensor page_table, Tensor workspace, float "
      "scale,"
      " int num_kv_splits) -> ()");
  // conditionally compiled so impl in source file
  ops.impl("sm100_cutlass_mla_decode", torch::kCUDA, &sm100_cutlass_mla_decode);

  // SM100 CUTLASS MLA workspace
  ops.def(
      "sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
      " int sm_count, int num_kv_splits) "
      "-> int");
  // conditionally compiled so impl in source file
  ops.impl("sm100_cutlass_mla_get_workspace_size",
           &sm100_cutlass_mla_get_workspace_size);

  // Compute NVFP4 block quantized tensor.
  ops.def(
@@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

# Flag enables built-in KV-connector dependency libs into docker images
# Flag enables build-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false

#################### BASE BUILD IMAGE ####################
@@ -207,19 +207,6 @@ ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
        export VLLM_USE_PRECOMPILED=1 && \
        echo "Using precompiled wheels"; \
    else \
        unset VLLM_USE_PRECOMPILED && \
        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
    fi

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
@@ -8,6 +8,7 @@ API documentation for vLLM's configuration classes.

- [vllm.config.ModelConfig][]
- [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
- [vllm.config.LoadConfig][]
- [vllm.config.ParallelConfig][]
- [vllm.config.SchedulerConfig][]
Binary image file changed (not shown). Before: 57 KiB; After: 68 KiB.
@@ -1,7 +1,3 @@
---
toc_depth: 4
---

# vLLM CLI Guide

The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@@ -46,10 +42,6 @@ Start the vLLM OpenAI Compatible API server.
vllm serve --help=page
```

### Options

--8<-- "docs/argparse/serve.md"

## chat

Generate chat completions via the running API server.
@@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.

## CLI Arguments

The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
To see the available CLI arguments, run `vllm serve --help`!

## Configuration file
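Once `vllm serve` is running, the server exposes the OpenAI-compatible API. A minimal client sketch (assumes the default host/port and the `openai` Python package; the model name is a placeholder for whatever was passed to `vllm serve`):

```python
from openai import OpenAI

# Point the standard OpenAI client at the local vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="facebook/opt-125m",  # placeholder: the model given to `vllm serve`
    prompt="Hello, my name is",
    max_tokens=32,
)
print(completion.choices[0].text)
```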
@@ -3,15 +3,6 @@

[](){ #deployment-anyscale }

[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.

Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.

It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

## Production-ready vLLM on Anyscale quickstarts

- [Offline batch inference](https://console.anyscale.com/template-preview/llm_batch_inference?utm_source=vllm_docs)
- [Deploy vLLM services](https://console.anyscale.com/template-preview/llm_serving?utm_source=vllm_docs)
- [Curate a dataset](https://console.anyscale.com/template-preview/audio-dataset-curation-llm-judge?utm_source=vllm_docs)
- [Finetune an LLM](https://console.anyscale.com/template-preview/entity-recognition-with-llms?utm_source=vllm_docs)
@@ -1,42 +1,26 @@
# Open WebUI

[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
and user-friendly self-hosted AI platform designed to operate entirely offline.
It supports various LLM runners like Ollama and OpenAI-compatible APIs,
with built-in RAG capabilities, making it a powerful AI deployment solution.
1. Install the [Docker](https://docs.docker.com/engine/install/)

To get started with Open WebUI using vLLM, follow these steps:
2. Start the vLLM server with the supported chat completion model, e.g.

1. Install the [Docker](https://docs.docker.com/engine/install/).

    ```bash
    vllm serve qwen/Qwen1.5-0.5B-Chat
    ```

2. Start the vLLM server with a supported chat completion model:
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):

    ```console
    vllm serve Qwen/Qwen3-0.6B-Chat
    ```

    ```bash
    docker run -d -p 3000:8080 \
      --name open-webui \
      -v open-webui:/app/backend/data \
      -e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
      --restart always \
      ghcr.io/open-webui/open-webui:main
    ```

    !!! note
        When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
        For example:
1. Open it in the browser: <http://open-webui-host:3000/>

        ```console
        python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
        ```
On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.

3. Start the Open WebUI Docker container:

    ```console
    docker run -d \
      --name open-webui \
      -p 3000:8080 \
      -v open-webui:/app/backend/data \
      -e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
      --restart always \
      ghcr.io/open-webui/open-webui:main
    ```

4. Open it in the browser: <http://open-webui-host:3000/>

    At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.

    ![Web portal of model Qwen/Qwen3-0.6B-Chat](../../assets/deployment/open_webui.png)
    ![](../../assets/deployment/open_webui.png)
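Before pointing Open WebUI at the `OPENAI_API_BASE_URL` above, it can help to confirm the vLLM endpoint is reachable. A small check, assuming the server runs on the default port and the `openai` package is installed:

```python
from openai import OpenAI

# The same base URL that Open WebUI is given via OPENAI_API_BASE_URL.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# List the served models; the name printed here is what shows up in the
# model picker at the top of the Open WebUI page.
for model in client.models.list():
    print(model.id)
```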
@@ -10,7 +10,6 @@ Contents:

- [BitBLAS](bitblas.md)
- [GGUF](gguf.md)
- [GPTQModel](gptqmodel.md)
- [INC](inc.md)
- [INT4 W4A16](int4.md)
- [INT8 W8A8](int8.md)
- [FP8 W8A8](fp8.md)
@@ -1,56 +0,0 @@
---
title: FP8 INC
---
[](){ #inc }

vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
Currently, quantization is validated only in Llama models.

Intel Gaudi supports quantization of various modules and functions, including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).

!!! note
    Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.

!!! note
    `QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
    The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.

## Run Online Inference Using FP8

Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:

```bash
export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_parallel_size 8
```

!!! tip
    If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.

!!! tip
    When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
    `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
    `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in microseconds, e.g., 600000 equals 10 minutes.

## Run Offline Inference Using FP8

To run offline inference (after completing the model calibration process):

* Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
* Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
* Call shutdown method of the model_executor at the end of the run.

```python
from vllm import LLM
llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
...
# Call llm.generate on the required prompts and sampling params.
...
llm.llm_engine.model_executor.shutdown()
```

## Device for the Model's Weights Uploading

The unquantized weights are first loaded onto the CPU, then quantized and transferred to the target device (HPU) for model execution.
This reduces the device memory footprint of model weights, as only quantized weights are stored in the device memory.
@@ -2,19 +2,18 @@

The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:

| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU |
|-----------------------|-------|--------|--------|-----|--------|---------|-----------|-------------|---------|------------|------------|
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ |
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ |
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ |

| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|-----------------------|-------|--------|--------|-----|--------|---------|-----------|---------|------------|------------|
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ |

- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- ✅︎ indicates that the quantization method is supported on the specified hardware.
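Selecting one of the implementations in the table happens when the engine is constructed. A hedged offline-inference sketch (the checkpoint name is a placeholder; the chosen method must be supported on your hardware per the table):

```python
from vllm import LLM, SamplingParams

# Load a pre-quantized checkpoint; `quantization` picks the implementation
# from the compatibility table (e.g. "awq", "gptq", "fp8").
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```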
@@ -103,7 +103,9 @@ When tool_choice='required' is set, the model is guaranteed to generate one or m

vLLM supports the `tool_choice='none'` option in the chat completion API. When this option is set, the model will not generate any tool calls and will respond with regular text content only, even if tools are defined in the request.

However, when `tool_choice='none'` is specified, vLLM includes tool definitions from the prompt.
By default, when `tool_choice='none'` is specified, vLLM excludes tool definitions from the prompt to optimize context usage. To include tool definitions even with `tool_choice='none'`, use the `--expand-tools-even-if-tool-choice-none` option.

Note: This behavior will change in v0.10.0, where tool definitions will be included by default even with `tool_choice='none'`.

## Automatic Function Calling
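For reference, a request exercising the `tool_choice='none'` behavior described in the hunk above might look like the following sketch (OpenAI client against a vLLM server; the model name and tool definition are illustrative placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# With tool_choice="none" the model answers in plain text and never emits
# a tool call, even though tools are present in the request.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="none",
)
print(response.choices[0].message.content)
```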
@@ -28,7 +28,7 @@ To verify that the Intel Gaudi software was correctly installed, run:
hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
pip list | grep neural # verify that neural_compressor_pt is installed
pip list | grep neural # verify that neural_compressor is installed
```

Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
@@ -120,13 +120,12 @@ docker run \
- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
  for accelerating low-batch latency and throughput
- Attention with Linear Biases (ALiBi)
- INC quantization

### Unsupported features

- Beam search
- LoRA adapters
- AWQ quantization
- Quantization
- Prefill chunking (mixed-batch inferencing)

### Supported configurations
@ -16,7 +16,6 @@ sys.modules["blake3"] = MagicMock()
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
|
||||
from vllm.utils import FlexibleArgumentParser # noqa: E402
|
||||
|
||||
logger = logging.getLogger("mkdocs")
|
||||
@ -25,18 +24,15 @@ logger = logging.getLogger("mkdocs")
|
||||
class MarkdownFormatter(HelpFormatter):
|
||||
"""Custom formatter that generates markdown for argument groups."""
|
||||
|
||||
def __init__(self, prog, starting_heading_level=3):
|
||||
def __init__(self, prog):
|
||||
super().__init__(prog,
|
||||
max_help_position=float('inf'),
|
||||
width=float('inf'))
|
||||
self._section_heading_prefix = "#" * starting_heading_level
|
||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||
self._markdown_output = []
|
||||
|
||||
def start_section(self, heading):
|
||||
if heading not in {"positional arguments", "options"}:
|
||||
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
self._markdown_output.append(f"\n### {heading}\n\n")
|
||||
|
||||
def end_section(self):
|
||||
pass
|
||||
@ -50,13 +46,9 @@ class MarkdownFormatter(HelpFormatter):
|
||||
|
||||
def add_arguments(self, actions):
|
||||
for action in actions:
|
||||
if (len(action.option_strings) == 0
|
||||
or "--help" in action.option_strings):
|
||||
continue
|
||||
|
||||
option_strings = f'`{"`, `".join(action.option_strings)}`'
|
||||
heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
self._markdown_output.append(f"#### {option_strings}\n\n")
|
||||
|
||||
if choices := action.choices:
|
||||
choices = f'`{"`, `".join(str(c) for c in choices)}`'
|
||||
@ -89,14 +81,6 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
|
||||
return cls.add_cli_args(parser, **kwargs)
|
||||
|
||||
|
||||
def create_serve_parser() -> FlexibleArgumentParser:
|
||||
"""Create a parser for the serve command with markdown formatting."""
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.formatter_class = lambda prog: MarkdownFormatter(
|
||||
prog, starting_heading_level=4)
|
||||
return make_arg_parser(parser)
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
logger.info("Generating argparse documentation")
|
||||
logger.debug("Root directory: %s", ROOT_DIR.resolve())
|
||||
@ -111,7 +95,6 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
"engine_args": create_parser(EngineArgs),
|
||||
"async_engine_args": create_parser(AsyncEngineArgs,
|
||||
async_args_only=True),
|
||||
"serve": create_serve_parser(),
|
||||
}
|
||||
|
||||
# Generate documentation for each parser
|
||||
|
@@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i

Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require
enforcing eager mode and disabling prefix caching in V1.

Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
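As a concrete illustration of the constraint described above (eager mode enforced and prefix caching disabled for Mamba-2 models in V1), a hedged offline-inference sketch; the checkpoint name is a placeholder:

```python
from vllm import LLM, SamplingParams

# Mamba-2 based models currently need eager execution and no prefix caching.
llm = LLM(
    model="mistralai/Mamba-Codestral-7B-v0.1",  # placeholder Mamba-2 checkpoint
    enforce_eager=True,
    enable_prefix_caching=False,
)

outputs = llm.generate(["def fibonacci(n):"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```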
@ -10,7 +10,7 @@ on HuggingFace model repository.
|
||||
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from typing import Any, NamedTuple, Optional
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
@ -30,9 +30,7 @@ question_per_audio_count = {
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: Optional[str] = None
|
||||
prompt_token_ids: Optional[dict[str, list[int]]] = None
|
||||
multi_modal_data: Optional[dict[str, Any]] = None
|
||||
prompt: str
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
@ -42,60 +40,6 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Voxtral
|
||||
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (
|
||||
AudioChunk,
|
||||
RawAudio,
|
||||
TextChunk,
|
||||
UserMessage,
|
||||
)
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
model_name = "mistralai/Voxtral-Mini-3B-2507"
|
||||
tokenizer = MistralTokenizer.from_hf_hub(model_name)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
tokenizer_mode="mistral",
|
||||
enforce_eager=True,
|
||||
enable_chunked_prefill=False,
|
||||
)
|
||||
|
||||
text_chunk = TextChunk(text=question)
|
||||
audios = [
|
||||
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
|
||||
for i in range(audio_count)
|
||||
]
|
||||
audio_chunks = [
|
||||
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
|
||||
]
|
||||
|
||||
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
|
||||
|
||||
req = ChatCompletionRequest(messages=messages, model=model_name)
|
||||
|
||||
tokens = tokenizer.encode_chat_completion(req)
|
||||
prompt_ids, audios = tokens.tokens, tokens.audios
|
||||
|
||||
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
|
||||
|
||||
multi_modal_data = {"audio": audios_and_sr}
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt_token_ids=prompt_ids,
|
||||
multi_modal_data=multi_modal_data,
|
||||
)
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the setting in this example are somehat different than what is
|
||||
@ -299,7 +243,6 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"voxtral": run_voxtral,
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
@ -368,24 +311,16 @@ def main(args):
|
||||
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
|
||||
)
|
||||
|
||||
mm_data = req_data.multi_modal_data
|
||||
if not mm_data:
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
mm_data = {
|
||||
"audio": [
|
||||
asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
|
||||
]
|
||||
}
|
||||
|
||||
assert args.num_prompts > 0
|
||||
inputs = {"multi_modal_data": mm_data}
|
||||
|
||||
if req_data.prompt:
|
||||
inputs["prompt"] = req_data.prompt
|
||||
else:
|
||||
inputs["prompt_token_ids"] = req_data.prompt_token_ids
|
||||
|
||||
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
|
||||
if args.num_prompts > 1:
|
||||
# Batch inference
|
||||
inputs = [inputs] * args.num_prompts
|
||||
|
@ -84,7 +84,6 @@ def main():
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
max_model_len=16384,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
|
pyproject.toml (183 lines changed)
@ -174,186 +174,3 @@ respect-ignore-files = true
|
||||
|
||||
[tool.ty.environment]
|
||||
python = "./.venv"
|
||||
|
||||
[tool.typos.files]
|
||||
# these files may be written in non english words
|
||||
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
|
||||
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
|
||||
"vllm/third_party/*"]
|
||||
ignore-hidden = true
|
||||
ignore-files = true
|
||||
ignore-dot = true
|
||||
ignore-vcs = true
|
||||
ignore-global = true
|
||||
ignore-parent = true
|
||||
|
||||
[tool.typos.default]
|
||||
binary = false
|
||||
check-filename = false
|
||||
check-file = true
|
||||
unicode = true
|
||||
ignore-hex = true
|
||||
identifier-leading-digits = false
|
||||
locale = "en"
|
||||
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
|
||||
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
|
||||
".*[Tt]h[rR].*"]
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.default.extend-identifiers]
|
||||
bbc5b7ede = "bbc5b7ede"
|
||||
womens_doubles = "womens_doubles"
|
||||
v_2nd = "v_2nd"
|
||||
# splitted_input = "splitted_input"
|
||||
NOOPs = "NOOPs"
|
||||
typ = "typ"
|
||||
nin_shortcut = "nin_shortcut"
|
||||
UperNetDecoder = "UperNetDecoder"
|
||||
subtile = "subtile"
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
|
||||
SFOuput = "SFOuput"
|
||||
# huggingface transformers repo uses these words
|
||||
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
|
||||
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
|
||||
depthwise_seperable_CNN = "depthwise_seperable_CNN"
|
||||
|
||||
[tool.typos.default.extend-words]
|
||||
iy = "iy"
|
||||
tendencias = "tendencias"
|
||||
# intel cpu features
|
||||
tme = "tme"
|
||||
dout = "dout"
|
||||
Pn = "Pn"
|
||||
arange = "arange"
|
||||
|
||||
[tool.typos.type.py]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.py.extend-identifiers]
|
||||
arange = "arange"
|
||||
NDArray = "NDArray"
|
||||
EOFError = "EOFError"
|
||||
fo = "fo"
|
||||
ba = "ba"
|
||||
|
||||
[tool.typos.type.py.extend-words]
|
||||
|
||||
[tool.typos.type.cpp]
|
||||
extend-glob = ["*.cu"]
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.cpp.extend-identifiers]
|
||||
countr_one = "countr_one"
|
||||
k_ot = "k_ot"
|
||||
ot = "ot"
|
||||
|
||||
[tool.typos.type.cpp.extend-words]
|
||||
|
||||
[tool.typos.type.rust]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.rust.extend-identifiers]
|
||||
flate2 = "flate2"
|
||||
|
||||
[tool.typos.type.rust.extend-words]
|
||||
ser = "ser"
|
||||
|
||||
[tool.typos.type.lock]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.lock.extend-identifiers]
|
||||
|
||||
[tool.typos.type.lock.extend-words]
|
||||
|
||||
[tool.typos.type.jl]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.jl.extend-identifiers]
|
||||
|
||||
[tool.typos.type.jl.extend-words]
|
||||
modul = "modul"
|
||||
egals = "egals"
|
||||
usig = "usig"
|
||||
egal = "egal"
|
||||
|
||||
[tool.typos.type.go]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.go.extend-identifiers]
|
||||
flate = "flate"
|
||||
|
||||
[tool.typos.type.go.extend-words]
|
||||
|
||||
[tool.typos.type.css]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.css.extend-identifiers]
|
||||
nd = "nd"
|
||||
|
||||
[tool.typos.type.css.extend-words]
|
||||
|
||||
[tool.typos.type.man]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.man.extend-identifiers]
|
||||
Nd = "Nd"
|
||||
|
||||
[tool.typos.type.man.extend-words]
|
||||
|
||||
[tool.typos.type.cert]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.cert.extend-identifiers]
|
||||
|
||||
[tool.typos.type.cert.extend-words]
|
||||
|
||||
[tool.typos.type.sh]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.sh.extend-identifiers]
|
||||
ot = "ot"
|
||||
|
||||
[tool.typos.type.sh.extend-words]
|
||||
|
||||
[tool.typos.type.vimscript]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[tool.typos.type.vimscript.extend-identifiers]
|
||||
windo = "windo"
|
||||
|
||||
[tool.typos.type.vimscript.extend-words]
|
||||
|
@ -25,7 +25,7 @@ outlines_core == 0.2.10
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
@ -33,7 +33,7 @@ pyzmq >= 25.0.0
|
||||
msgspec
|
||||
gguf >= 0.13.0
|
||||
importlib_metadata; python_version < '3.10'
|
||||
mistral_common[opencv] >= 1.8.0
|
||||
mistral_common[opencv] >= 1.6.2
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
|
@ -17,7 +17,6 @@ cloudpickle
|
||||
fastapi
|
||||
msgspec
|
||||
openai
|
||||
partial-json-parser
|
||||
pillow
|
||||
psutil
|
||||
pybase64
|
||||
|
@ -23,7 +23,7 @@ jiwer # required for audio tests
|
||||
timm # required for internvl test
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
mistral_common[opencv] >= 1.6.2 # required for pixtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
@ -28,7 +28,7 @@ torchvision==0.22.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
mamba_ssm # required for plamo2 test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.8.0 # required for voxtral test
|
||||
mistral_common[opencv] >= 1.7.0 # required for pixtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
|
@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common==1.8.0
|
||||
mistral-common==1.7.0
|
||||
# via -r requirements/test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
@ -518,8 +518,6 @@ pyasn1-modules==0.4.2
|
||||
# via google-auth
|
||||
pybind11==2.13.6
|
||||
# via lm-eval
|
||||
pycountry==24.6.1
|
||||
# via pydantic-extra-types
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pycryptodomex==3.22.0
|
||||
@ -530,12 +528,9 @@ pydantic==2.11.5
|
||||
# datamodel-code-generator
|
||||
# mistral-common
|
||||
# mteb
|
||||
# pydantic-extra-types
|
||||
# ray
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pydantic-extra-types==2.10.5
|
||||
# via mistral-common
|
||||
pygments==2.18.0
|
||||
# via rich
|
||||
pyparsing==3.2.0
|
||||
@ -840,7 +835,6 @@ typing-extensions==4.12.2
|
||||
# pqdm
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# pydantic-extra-types
|
||||
# torch
|
||||
# typer
|
||||
# typing-inspection
|
||||
|
setup.py (3 lines changed)
@@ -692,8 +692,7 @@ setup(
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
        "audio": ["librosa", "soundfile",
                  "mistral_common[audio]"],  # Required for audio processing
        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": []  # Kept for backwards compatibility
    },
    cmdclass=cmdclass,
@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(distributed_executor_backend: str):
|
||||
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
@ -40,6 +40,8 @@ def api_server(distributed_executor_backend: str):
|
||||
"facebook/opt-125m",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
"--tokenizer-pool-size",
|
||||
str(tokenizer_pool_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_executor_backend,
|
||||
]
|
||||
@ -52,8 +54,10 @@ def api_server(distributed_executor_backend: str):
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
||||
def test_api_server(api_server, distributed_executor_backend: str):
|
||||
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||
distributed_executor_backend: str):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
|
@ -804,7 +804,7 @@ class VllmRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[int]],
|
||||
prompts: Union[list[str], list[torch.Tensor]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
@ -826,16 +826,11 @@ class VllmRunner:
|
||||
if audios is not None and (audio := audios[i]) is not None:
|
||||
multi_modal_data["audio"] = audio
|
||||
|
||||
text_prompt_kwargs: dict[str, Any] = {
|
||||
text_prompt_kwargs = {
|
||||
("prompt" if isinstance(prompt, str) else "prompt_embeds"):
|
||||
prompt,
|
||||
"multi_modal_data": multi_modal_data or None
|
||||
}
|
||||
if isinstance(prompt, str):
|
||||
text_prompt_kwargs["prompt"] = prompt
|
||||
elif isinstance(prompt, list):
|
||||
text_prompt_kwargs["prompt_token_ids"] = prompt
|
||||
else:
|
||||
text_prompt_kwargs["prompt_embeds"] = prompt
|
||||
|
||||
inputs.append(TextPrompt(**text_prompt_kwargs))
|
||||
|
||||
return inputs
|
||||
|
@ -14,9 +14,8 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
@ -159,7 +158,7 @@ TEXT_GENERATION_MODELS = {
|
||||
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
|
||||
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
|
||||
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
|
||||
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
|
||||
"tiiuae/falcon-7b": PPTestSettings.fast(),
|
||||
"google/gemma-1.1-2b-it": PPTestSettings.fast(),
|
||||
@ -211,11 +210,9 @@ TEXT_GENERATION_MODELS = {
|
||||
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
|
||||
load_format="dummy", task="embed"
|
||||
),
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
|
||||
}
|
||||
|
||||
MULTIMODAL_MODELS = {
|
||||
@ -251,7 +248,6 @@ TEST_MODELS = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"ArthurZ/Ilama-3.2-1B",
|
||||
"ibm/PowerLM-3b",
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
"intfloat/e5-mistral-7b-instruct",
|
||||
"BAAI/bge-multilingual-gemma2",
|
||||
@ -291,11 +287,6 @@ def _compare_tp(
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
hf_config = get_config(model_id, trust_remote_code)
|
||||
|
||||
dtype = "float16"
|
||||
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
|
||||
dtype = "bfloat16"
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
@ -325,7 +316,7 @@ def _compare_tp(
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
dtype,
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
@ -347,7 +338,6 @@ def _compare_tp(
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
|
||||
testing_ray_compiled_graph = False
|
||||
if distributed_backend == "ray" and (vllm_major_version == "1"
|
||||
or specific_case):
|
||||
# For V1, test Ray Compiled Graph for all the tests
|
||||
@ -361,7 +351,6 @@ def _compare_tp(
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of a Ray Compiled Graph issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
testing_ray_compiled_graph = True
|
||||
elif distributed_backend == "mp":
|
||||
# Both V0/V1 of multiprocessing executor support PP
|
||||
pp_env = {
|
||||
@ -405,6 +394,7 @@ def _compare_tp(
|
||||
tp_env,
|
||||
method=method)
|
||||
except Exception:
|
||||
testing_ray_compiled_graph = pp_env is not None
|
||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
||||
# Ray Compiled Graph tests are flaky for V0,
|
||||
# so we don't want to fail the test
|
||||
|
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from argparse import ArgumentError
|
||||
from argparse import ArgumentError, ArgumentTypeError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional
|
||||
@ -12,8 +12,8 @@ import pytest
|
||||
from vllm.config import CompilationConfig, config
|
||||
from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs,
|
||||
get_type, get_type_hints, is_not_builtin,
|
||||
is_type, literal_to_kwargs, optional_type,
|
||||
parse_type)
|
||||
is_type, literal_to_kwargs, nullable_kvs,
|
||||
optional_type, parse_type)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@ -25,10 +25,18 @@ from vllm.utils import FlexibleArgumentParser
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
(json.loads, "foo=1,bar=2", {
|
||||
"foo": 1,
|
||||
"bar": 2
|
||||
}),
|
||||
])
|
||||
def test_parse_type(type, value, expected):
|
||||
parse_type_func = parse_type(type)
|
||||
assert parse_type_func(value) == expected
|
||||
context = nullcontext()
|
||||
if value == "foo=1,bar=2":
|
||||
context = pytest.warns(DeprecationWarning)
|
||||
with context:
|
||||
assert parse_type_func(value) == expected
|
||||
|
||||
|
||||
def test_optional_type():
|
||||
@ -195,6 +203,34 @@ def test_get_kwargs():
|
||||
assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("arg", "expected"), [
|
||||
(None, dict()),
|
||||
("image=16", {
|
||||
"image": 16
|
||||
}),
|
||||
("image=16,video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
("Image=16, Video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
])
|
||||
def test_limit_mm_per_prompt_parser(arg, expected):
|
||||
"""This functionality is deprecated and will be removed in the future.
|
||||
This argument should be passed as JSON string instead.
|
||||
|
||||
TODO: Remove with nullable_kvs."""
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
args = parser.parse_args([])
|
||||
else:
|
||||
args = parser.parse_args(["--limit-mm-per-prompt", arg])
|
||||
|
||||
assert args.limit_mm_per_prompt == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg", "expected"),
|
||||
[
|
||||
@ -290,6 +326,18 @@ def test_prefix_cache_default():
|
||||
assert not engine_args.enable_prefix_caching
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg"),
|
||||
[
|
||||
"image", # Missing =
|
||||
"image=4,image=5", # Conflicting values
|
||||
"image=video=4" # Too many = in tokenized arg
|
||||
])
|
||||
def test_bad_nullable_kvs(arg):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
nullable_kvs(arg)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
|
@ -176,8 +176,4 @@ async def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert classification_output.keys() == invocation_output.keys()
|
||||
for classification_data, invocation_data in zip(
|
||||
classification_output["data"], invocation_output["data"]):
|
||||
assert classification_data.keys() == invocation_data.keys()
|
||||
assert classification_data["probs"] == pytest.approx(
|
||||
invocation_data["probs"], rel=0.01)
|
||||
assert classification_output["data"] == invocation_output["data"]
|
||||
|
@ -14,7 +14,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
from ...models.utils import check_embeddings_close
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
@ -322,13 +321,7 @@ async def test_invocations(server: RemoteOpenAIServer,
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert completion_output.keys() == invocation_output.keys()
|
||||
for completion_data, invocation_data in zip(completion_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert completion_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=[completion_data["embedding"]],
|
||||
embeddings_1_lst=[invocation_data["embedding"]],
|
||||
name_0="completion",
|
||||
name_1="invocation")
|
||||
assert completion_output["data"] == invocation_output["data"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -362,10 +355,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert chat_output.keys() == invocation_output.keys()
|
||||
for chat_data, invocation_data in zip(chat_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert chat_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=[chat_data["embedding"]],
|
||||
embeddings_1_lst=[invocation_data["embedding"]],
|
||||
name_0="chat",
|
||||
name_1="invocation")
|
||||
assert chat_output["data"] == invocation_output["data"]
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from typing import Final
|
||||
|
||||
import pytest
|
||||
@ -30,7 +29,7 @@ def server():
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
f"image={MAXIMUM_IMAGES}",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
|
@ -281,13 +281,7 @@ async def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert completion_output.keys() == invocation_output.keys()
|
||||
for completion_data, invocation_data in zip(completion_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert completion_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=completion_data["data"],
|
||||
embeddings_1_lst=invocation_data["data"],
|
||||
name_0="completion",
|
||||
name_1="invocation")
|
||||
assert completion_output["data"] == invocation_output["data"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -320,10 +314,4 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert chat_output.keys() == invocation_output.keys()
|
||||
for chat_data, invocation_data in zip(chat_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert chat_data.keys() == invocation_data.keys()
|
||||
check_embeddings_close(embeddings_0_lst=chat_data["data"],
|
||||
embeddings_1_lst=invocation_data["data"],
|
||||
name_0="chat",
|
||||
name_1="invocation")
|
||||
assert chat_output["data"] == invocation_output["data"]
|
||||
|
@ -120,8 +120,4 @@ def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert rerank_output.keys() == invocation_output.keys()
|
||||
for rerank_result, invocations_result in zip(rerank_output["results"],
|
||||
invocation_output["results"]):
|
||||
assert rerank_result.keys() == invocations_result.keys()
|
||||
assert rerank_result["relevance_score"] == pytest.approx(
|
||||
invocations_result["relevance_score"], rel=0.01)
|
||||
assert rerank_output["results"] == invocation_output["results"]
|
||||
|
@ -215,8 +215,4 @@ class TestModel:
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert score_output.keys() == invocation_output.keys()
|
||||
for score_data, invocation_data in zip(score_output["data"],
|
||||
invocation_output["data"]):
|
||||
assert score_data.keys() == invocation_data.keys()
|
||||
assert score_data["score"] == pytest.approx(
|
||||
invocation_data["score"], rel=0.01)
|
||||
assert score_output["data"] == invocation_output["data"]
|
||||
|
@ -32,7 +32,6 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811
|
||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--enable-tokenizer-info-endpoint",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
@ -284,106 +283,3 @@ async def test_detokenize(
|
||||
response.raise_for_status()
|
||||
|
||||
assert response.json() == {"prompt": prompt}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name,tokenizer_name",
|
||||
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
|
||||
indirect=["tokenizer_name"],
|
||||
)
|
||||
async def test_tokenizer_info_basic(
|
||||
server: RemoteOpenAIServer,
|
||||
model_name: str,
|
||||
tokenizer_name: str,
|
||||
):
|
||||
"""Test basic tokenizer info endpoint functionality."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
assert "tokenizer_class" in result
|
||||
assert isinstance(result["tokenizer_class"], str)
|
||||
assert result["tokenizer_class"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
|
||||
"""Test that the response matches expected schema types."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
field_types = {
|
||||
"add_bos_token": bool,
|
||||
"add_prefix_space": bool,
|
||||
"clean_up_tokenization_spaces": bool,
|
||||
"split_special_tokens": bool,
|
||||
"bos_token": str,
|
||||
"eos_token": str,
|
||||
"pad_token": str,
|
||||
"unk_token": str,
|
||||
"chat_template": str,
|
||||
"errors": str,
|
||||
"model_max_length": int,
|
||||
"additional_special_tokens": list,
|
||||
"added_tokens_decoder": dict,
|
||||
}
|
||||
for field, expected_type in field_types.items():
|
||||
if field in result and result[field] is not None:
|
||||
assert isinstance(
|
||||
result[field],
|
||||
expected_type), (f"{field} should be {expected_type.__name__}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_added_tokens_structure(
|
||||
server: RemoteOpenAIServer, ):
|
||||
"""Test added_tokens_decoder structure if present."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
added_tokens = result.get("added_tokens_decoder")
|
||||
if added_tokens:
|
||||
for token_id, token_info in added_tokens.items():
|
||||
assert isinstance(token_id, str), "Token IDs should be strings"
|
||||
assert isinstance(token_info, dict), "Token info should be a dict"
|
||||
assert "content" in token_info, "Token info should have content"
|
||||
assert "special" in token_info, (
|
||||
"Token info should have special flag")
|
||||
assert isinstance(token_info["special"],
|
||||
bool), ("Special flag should be boolean")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_consistency_with_tokenize(
|
||||
server: RemoteOpenAIServer, ):
|
||||
"""Test that tokenizer info is consistent with tokenization endpoint."""
|
||||
info_response = requests.get(server.url_for("tokenizer_info"))
|
||||
info_response.raise_for_status()
|
||||
info = info_response.json()
|
||||
tokenize_response = requests.post(
|
||||
server.url_for("tokenize"),
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"prompt": "Hello world!"
|
||||
},
|
||||
)
|
||||
tokenize_response.raise_for_status()
|
||||
tokenize_result = tokenize_response.json()
|
||||
info_max_len = info.get("model_max_length")
|
||||
tokenize_max_len = tokenize_result.get("max_model_len")
|
||||
if info_max_len and tokenize_max_len:
|
||||
assert info_max_len >= tokenize_max_len, (
|
||||
"Info max length should be >= tokenize max length")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
|
||||
"""Test chat template is properly included."""
|
||||
response = requests.get(server.url_for("tokenizer_info"))
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
chat_template = result.get("chat_template")
|
||||
if chat_template:
|
||||
assert isinstance(chat_template,
|
||||
str), ("Chat template should be a string")
|
||||
assert chat_template.strip(), "Chat template should not be empty"
|
@ -17,11 +17,6 @@ from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
@ -38,15 +33,9 @@ def winning_call():
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
async def test_basic_audio(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
@ -76,13 +65,10 @@ async def test_bad_requests(mary_had_lamb):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
|
||||
async def test_long_audio_request(mary_had_lamb, model_name):
|
||||
async def test_long_audio_request(mary_had_lamb):
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("openai"):
|
||||
return
|
||||
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
@ -101,8 +87,7 @@ async def test_long_audio_request(mary_had_lamb, model_name):
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
counts = out.count("Mary had a little lamb")
|
||||
assert counts == 10, counts
|
||||
assert out.count("Mary had a little lamb") == 10
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@ -416,7 +416,7 @@ class RankTensors:
|
||||
# We dequant and use that as hidden_states so the tests are stable.
|
||||
# quantizing and dequantizing yield slightly different results
|
||||
# depending on the hardware. Here we, quantize and dequantize
|
||||
# first - so further quantize and dequantize will yield the same
|
||||
# first - so further quantize and dequantize will yeild the same
|
||||
# values.
|
||||
if config.is_per_tensor_act_quant:
|
||||
a_q, a_scales = ops.scaled_fp8_quant(
@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
|
||||
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
|
||||
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
|
||||
|
||||
# triton reference
|
||||
# triton referrence
|
||||
out_triton = fused_experts(
|
||||
hidden_states=tokens_bf16,
|
||||
w1=w1,
@ -1,115 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
|
||||
TextChunk, UserMessage)
|
||||
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
|
||||
from ....conftest import AudioTestAssets
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
] + MISTRAL_FORMAT_ARGS
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
def _get_prompt(audio_assets, question):
|
||||
tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
|
||||
|
||||
audios = [
|
||||
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
|
||||
for i in range(len(audio_assets))
|
||||
]
|
||||
audio_chunks = [
|
||||
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
|
||||
]
|
||||
|
||||
text_chunk = TextChunk(text=question)
|
||||
messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
|
||||
|
||||
return tokenizer.apply_chat_template(messages=messages)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer_mode="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
def asset_to_chunk(asset):
|
||||
audio = Audio.from_file(str(asset.get_local_path()), strict=False)
|
||||
audio.format = "wav"
|
||||
audio_dict = AudioChunk.from_audio(audio).to_openai()
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
@ -440,7 +440,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
|
||||
# [Cross-encoder]
|
||||
@ -465,11 +464,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
|
||||
tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
|
||||
"EagleLlama4ForCausalLM": _HfExamplesInfo(
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
|
||||
trust_remote_code=True,
|
||||
speculative_model="morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
|
||||
tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct"), # noqa: E501
|
||||
"EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16",
|
||||
trust_remote_code=True,
|
||||
is_available_online=False,
|
||||
@ -519,4 +513,4 @@ class HfExampleModels:
|
||||
raise ValueError(f"No example model defined for {model_id}")
|
||||
|
||||
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
||||
HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
|
@ -36,11 +36,6 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
"KimiVLForConditionalGeneration"):
|
||||
pytest.skip("Avoid OOM")
|
||||
|
||||
if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
|
||||
from vllm.model_executor.models.llama4 import Llama4ForCausalLM
|
||||
from vllm.model_executor.models.registry import ModelRegistry
|
||||
ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
|
||||
|
||||
# Avoid OOM and reduce initialization time by only using 1 layer
|
||||
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
|
||||
hf_config.update(model_info.hf_overrides)
|
||||
@ -48,7 +43,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
|
||||
text_config = hf_config.get_text_config()
|
||||
|
||||
# Ensure at least 2 expert per group
|
||||
# Since `grouped_topk` assumes top-2
|
||||
# Since `grouped_topk` assums top-2
|
||||
n_group = getattr(text_config, 'n_group', None)
|
||||
num_experts = n_group * 2 if n_group is not None else 2
|
||||
|
||||
|
@ -6,10 +6,8 @@ import random
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -55,6 +53,14 @@ def model_name():
|
||||
return "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
|
||||
def eagle_model_name():
|
||||
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
|
||||
|
||||
|
||||
def eagle3_model_name():
|
||||
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
|
||||
|
||||
def test_ngram_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
@ -71,8 +77,6 @@ def test_ngram_correctness(
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
@ -99,50 +103,34 @@ def test_ngram_correctness(
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_setup", [
|
||||
("eagle", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1),
|
||||
("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1),
|
||||
pytest.param(
|
||||
("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
|
||||
],
|
||||
ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"])
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
def test_eagle_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
sampling_config: SamplingParams,
|
||||
model_setup: tuple[str, str, str, int],
|
||||
model_name: str,
|
||||
use_eagle3: bool,
|
||||
):
|
||||
'''
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using eagle speculative decoding.
|
||||
model_setup: (method, model_name, eagle_model_name, tp_size)
|
||||
'''
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
method, model_name, spec_model_name, tp_size = model_setup
|
||||
|
||||
ref_llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=tp_size)
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
spec_model_name = eagle3_model_name(
|
||||
) if use_eagle3 else eagle_model_name()
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=tp_size,
|
||||
speculative_config={
|
||||
"method": method,
|
||||
"method": "eagle3" if use_eagle3 else "eagle",
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 3,
|
||||
"max_model_len": 2048,
|
||||
@ -164,5 +152,3 @@ def test_eagle_correctness(
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
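A minimal offline sketch of the EAGLE configuration this test exercises; the model names and num_speculative_tokens value are taken from the parametrization above and are not a tuning recommendation:

from vllm import LLM, SamplingParams

spec_llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    max_model_len=2048,
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
        "num_speculative_tokens": 3,
        "max_model_len": 2048,
    },
)
outputs = spec_llm.generate(["The capital of France is"],
                            SamplingParams(temperature=0, max_tokens=16))
print(outputs[0].outputs[0].text)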
@ -1,166 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
|
||||
# Use a small vision model for testing
|
||||
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
TEST_IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_image_server_args():
|
||||
return [
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"6000",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def image_server(default_image_server_args):
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
default_image_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(image_server):
|
||||
async with image_server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_image() -> dict[str, str]:
|
||||
return {
|
||||
image_url: encode_image_base64(fetch_image(image_url))
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
||||
model_name: str, image_url: str):
|
||||
content_text = "What's in this image?"
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": image_url,
|
||||
"detail": "auto",
|
||||
},
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": content_text
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# test image url
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_single_chat_session_image_base64encoded(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_url: str,
|
||||
base64_encoded_image: dict[str, str],
|
||||
):
|
||||
content_text = "What's in this image?"
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url":
|
||||
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
|
||||
"detail": "auto",
|
||||
},
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": content_text
|
||||
},
|
||||
],
|
||||
}]
|
||||
# test image base64
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
|
||||
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
image_urls: list[str]):
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*({
|
||||
"type": "input_image",
|
||||
"image_url": image_url,
|
||||
"detail": "auto",
|
||||
} for image_url in image_urls),
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": "What's in this image?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
if len(image_urls) > MAXIMUM_IMAGES:
|
||||
with pytest.raises(openai.BadRequestError): # test multi-image input
|
||||
await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
# the server should still work afterwards
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Paris today?",
|
||||
}],
|
||||
)
|
||||
assert len(response.output_text) > 0
|
||||
else:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=messages,
|
||||
)
|
||||
assert len(response.output_text) > 0
|
@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
|
||||
# Number of data parallel ranks for external LB testing
|
||||
DP_SIZE = int(os.getenv("DP_SIZE", "2"))
|
||||
# Default tensor parallel size to use
|
||||
# Default tensor parallell size to use
|
||||
TP_SIZE = int(os.getenv("TP_SIZE", "1"))
|
||||
|
||||
|
||||
|
typos.toml (new file, 179 lines)
@ -0,0 +1,179 @@
|
||||
[files]
|
||||
# these files may be written in non english words
|
||||
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
|
||||
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
|
||||
"vllm/third_party/*"]
|
||||
ignore-hidden = true
|
||||
ignore-files = true
|
||||
ignore-dot = true
|
||||
ignore-vcs = true
|
||||
ignore-global = true
|
||||
ignore-parent = true
|
||||
|
||||
[default]
|
||||
binary = false
|
||||
check-filename = false
|
||||
check-file = true
|
||||
unicode = true
|
||||
ignore-hex = true
|
||||
identifier-leading-digits = false
|
||||
locale = "en"
|
||||
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
|
||||
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
|
||||
".*ot.*", ".*[Tt]h[rR].*"]
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[default.extend-identifiers]
|
||||
bbc5b7ede = "bbc5b7ede"
|
||||
womens_doubles = "womens_doubles"
|
||||
v_2nd = "v_2nd"
|
||||
splitted_input = "splitted_input"
|
||||
NOOPs = "NOOPs"
|
||||
typ = "typ"
|
||||
nin_shortcut = "nin_shortcut"
|
||||
UperNetDecoder = "UperNetDecoder"
|
||||
subtile = "subtile"
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
|
||||
SFOuput = "SFOuput"
|
||||
# huggingface transformers repo uses these words
|
||||
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
|
||||
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
|
||||
depthwise_seperable_CNN = "depthwise_seperable_CNN"
|
||||
|
||||
[default.extend-words]
|
||||
iy = "iy"
|
||||
tendencias = "tendencias"
|
||||
# intel cpu features
|
||||
tme = "tme"
|
||||
dout = "dout"
|
||||
Pn = "Pn"
|
||||
arange = "arange"
|
||||
|
||||
[type.py]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.py.extend-identifiers]
|
||||
arange = "arange"
|
||||
NDArray = "NDArray"
|
||||
EOFError = "EOFError"
|
||||
|
||||
[type.py.extend-words]
|
||||
|
||||
[type.cpp]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.cpp.extend-identifiers]
|
||||
countr_one = "countr_one"
|
||||
|
||||
[type.cpp.extend-words]
|
||||
|
||||
[type.rust]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.rust.extend-identifiers]
|
||||
flate2 = "flate2"
|
||||
|
||||
[type.rust.extend-words]
|
||||
ser = "ser"
|
||||
|
||||
[type.lock]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.lock.extend-identifiers]
|
||||
|
||||
[type.lock.extend-words]
|
||||
|
||||
[type.jl]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.jl.extend-identifiers]
|
||||
|
||||
[type.jl.extend-words]
|
||||
modul = "modul"
|
||||
egals = "egals"
|
||||
usig = "usig"
|
||||
egal = "egal"
|
||||
|
||||
[type.go]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.go.extend-identifiers]
|
||||
flate = "flate"
|
||||
|
||||
[type.go.extend-words]
|
||||
|
||||
[type.css]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.css.extend-identifiers]
|
||||
nd = "nd"
|
||||
|
||||
[type.css.extend-words]
|
||||
|
||||
[type.man]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.man.extend-identifiers]
|
||||
Nd = "Nd"
|
||||
|
||||
[type.man.extend-words]
|
||||
|
||||
[type.cert]
|
||||
extend-glob = []
|
||||
check-file = false
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.cert.extend-identifiers]
|
||||
|
||||
[type.cert.extend-words]
|
||||
|
||||
[type.sh]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.sh.extend-identifiers]
|
||||
stap = "stap"
|
||||
ot = "ot"
|
||||
|
||||
[type.sh.extend-words]
|
||||
|
||||
[type.vimscript]
|
||||
extend-glob = []
|
||||
extend-ignore-identifiers-re = []
|
||||
extend-ignore-words-re = []
|
||||
extend-ignore-re = []
|
||||
|
||||
[type.vimscript.extend-identifiers]
|
||||
windo = "windo"
|
||||
|
||||
[type.vimscript.extend-words]
|
@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
|
||||
"... H (two D) -> ... (H two) D",
|
||||
two=2)
|
||||
|
||||
else: # reuse the kv cache, full attention
|
||||
else: # re-use the kv cache, full attention
|
||||
q = q.view(-1, self.num_heads, self.head_size)
|
||||
q1, q2 = self.split_heads(q)
|
||||
# kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501
|
||||
|
@ -96,30 +96,25 @@ DEFAULT_PIP_PATTERNS = {
|
||||
def run(command):
|
||||
"""Return (return-code, stdout, stderr)."""
|
||||
shell = True if type(command) is str else False
|
||||
try:
|
||||
p = subprocess.Popen(command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
shell=shell)
|
||||
raw_output, raw_err = p.communicate()
|
||||
rc = p.returncode
|
||||
if get_platform() == 'win32':
|
||||
enc = 'oem'
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
if command == 'nvidia-smi topo -m':
|
||||
# don't remove the leading whitespace of `nvidia-smi topo -m`
|
||||
# because they are meaningful
|
||||
output = output.rstrip()
|
||||
else:
|
||||
output = output.strip()
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output, err.strip()
|
||||
|
||||
except FileNotFoundError:
|
||||
cmd_str = command if isinstance(command, str) else command[0]
|
||||
return 127, '', f"Command not found: {cmd_str}"
|
||||
p = subprocess.Popen(command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
shell=shell)
|
||||
raw_output, raw_err = p.communicate()
|
||||
rc = p.returncode
|
||||
if get_platform() == 'win32':
|
||||
enc = 'oem'
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
if command == 'nvidia-smi topo -m':
|
||||
# don't remove the leading whitespace of `nvidia-smi topo -m`
|
||||
# because they are meaningful
|
||||
output = output.rstrip()
|
||||
else:
|
||||
output = output.strip()
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output, err.strip()
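A self-contained sketch of what the simplified run() above does for a string command; the Windows 'oem' encoding branch is dropped here for brevity:

import locale
import subprocess

def run_cmd(command: str) -> tuple[int, str, str]:
    # String commands go through the shell, mirroring run()'s shell=True path.
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, shell=True)
    raw_out, raw_err = p.communicate()
    enc = locale.getpreferredencoding()
    return p.returncode, raw_out.decode(enc).strip(), raw_err.decode(enc).strip()

rc, out, err = run_cmd("python3 --version")
print(rc, out or err)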
|
||||
|
||||
|
||||
def run_and_read_all(run_lambda, command):
|
||||
@ -153,7 +148,7 @@ def get_conda_packages(run_lambda, patterns=None):
|
||||
if patterns is None:
|
||||
patterns = DEFAULT_CONDA_PATTERNS
|
||||
conda = os.environ.get('CONDA_EXE', 'conda')
|
||||
out = run_and_read_all(run_lambda, [conda, 'list'])
|
||||
out = run_and_read_all(run_lambda, "{} list".format(conda))
|
||||
if out is None:
|
||||
return out
|
||||
|
||||
|
@ -26,7 +26,7 @@ from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator,
|
||||
from pydantic.dataclasses import dataclass
|
||||
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
|
||||
from torch.distributed import ProcessGroup, ReduceOp
|
||||
from typing_extensions import Self, runtime_checkable
|
||||
from typing_extensions import Self, deprecated, runtime_checkable
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import version
|
||||
@ -963,7 +963,7 @@ class ModelConfig:
|
||||
optimized_quantization_methods = [
|
||||
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
|
||||
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
|
||||
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
|
||||
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
|
||||
]
|
||||
if self.quantization is not None:
|
||||
self.quantization = cast(me_quant.QuantizationMethods,
|
||||
@ -1563,7 +1563,7 @@ class ModelConfig:
|
||||
|
||||
|
||||
BlockSize = Literal[1, 8, 16, 32, 64, 128]
|
||||
CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
|
||||
CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"]
|
||||
PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"]
|
||||
|
||||
|
||||
@ -1593,7 +1593,7 @@ class CacheConfig:
|
||||
cache_dtype: CacheDType = "auto"
|
||||
"""Data type for kv cache storage. If "auto", will use model data type.
|
||||
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
|
||||
fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc)."""
|
||||
fp8 (=fp8_e4m3)."""
|
||||
is_attention_free: bool = False
|
||||
"""Whether the model is attention-free. This is primarily set in
|
||||
`ModelConfig` and that value should be manually duplicated here."""
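A minimal sketch of requesting fp8 KV-cache storage through the offline entrypoint, assuming the kv_cache_dtype argument maps onto CacheConfig.cache_dtype; the model name is illustrative:

from vllm import LLM

# "fp8" is shorthand for fp8_e4m3 on CUDA/ROCm per the docstring above;
# fp8_e5m2 (and fp8_inc on Gaudi, in the variant that keeps that literal)
# are the other non-"auto" choices in CacheDType.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    kv_cache_dtype="fp8",
)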
|
||||
@ -1691,7 +1691,7 @@ class CacheConfig:
|
||||
"Using fp8 data type to store kv cache. It reduces the GPU "
|
||||
"memory footprint and boosts the performance. "
|
||||
"Meanwhile, it may cause accuracy drop without a proper "
|
||||
"scaling factor.")
|
||||
"scaling factor")
|
||||
else:
|
||||
raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
|
||||
|
||||
@ -1730,6 +1730,35 @@ class CacheConfig:
|
||||
logger.warning("Possibly too large swap space. %s", msg)
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
class TokenizerPoolConfig:
|
||||
"""This config is deprecated and will be removed in a future release.
|
||||
|
||||
Passing these parameters will have no effect. Please remove them from your
|
||||
configurations.
|
||||
"""
|
||||
|
||||
pool_size: int = 0
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
pool_type: str = "ray"
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
extra_config: dict = field(default_factory=dict)
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Passing this parameter will have no effect. Please remove it from your
|
||||
configurations."""
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
logger.warning_once(
|
||||
"TokenizerPoolConfig is deprecated and will be removed in a "
|
||||
"future release. Passing this parameter will have no effect. "
|
||||
"Please remove it from your configurations.")
|
||||
|
||||
|
||||
class LoadFormat(str, enum.Enum):
|
||||
AUTO = "auto"
|
||||
PT = "pt"
|
||||
@ -1781,9 +1810,6 @@ class LoadConfig:
|
||||
default_factory=dict)
|
||||
"""Extra config for model loader. This will be passed to the model loader
|
||||
corresponding to the chosen load_format."""
|
||||
device: Optional[str] = None
|
||||
"""Device to which model weights will be loaded, default to
|
||||
device_config.device"""
|
||||
ignore_patterns: Optional[Union[list[str], str]] = None
|
||||
"""The list of patterns to ignore when loading the model. Default to
|
||||
"original/**/*" to avoid repeated loading of llama's checkpoints."""
|
||||
@ -1896,6 +1922,10 @@ class ParallelConfig:
|
||||
disable_custom_all_reduce: bool = False
|
||||
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
||||
|
||||
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Please remove it from your configs"""
|
||||
|
||||
ray_workers_use_nsight: bool = False
|
||||
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
|
||||
|
||||
@ -1910,7 +1940,7 @@ class ParallelConfig:
|
||||
or equal to the number of GPUs available, "mp" will be used to
|
||||
keep processing on a single host. Otherwise, this will default
|
||||
to "ray" if Ray is installed and fail otherwise. Note that tpu
|
||||
only support Ray for distributed inference."""
|
||||
and hpu only support Ray for distributed inference."""
|
||||
|
||||
worker_cls: str = "auto"
|
||||
"""The full name of the worker class to use. If "auto", the worker class
|
||||
@ -3662,6 +3692,18 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0,
|
||||
class DecodingConfig:
|
||||
"""Dataclass which contains the decoding strategy of the engine."""
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"`guided_decoding_backend` is deprecated and has been renamed to "
|
||||
"`backend`. This will be removed in v0.10.0. Please use the "
|
||||
"`backend` argument instead.")
|
||||
def guided_decoding_backend(self) -> GuidedDecodingBackend:
|
||||
return self.backend
|
||||
|
||||
@guided_decoding_backend.setter
|
||||
def guided_decoding_backend(self, value: GuidedDecodingBackend):
|
||||
self.backend = value
|
||||
|
||||
backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar"
|
||||
"""Which engine will be used for guided decoding (JSON schema / regex etc)
|
||||
by default. With "auto", we will make opinionated choices based on request
|
||||
@ -3704,6 +3746,9 @@ class DecodingConfig:
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self):
|
||||
if ":" in self.backend:
|
||||
self._extract_backend_options()
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
valid_guided_backends = get_args(GuidedDecodingBackendV1)
|
||||
else:
|
||||
@ -3719,6 +3764,24 @@ class DecodingConfig:
|
||||
raise ValueError("disable_additional_properties is only supported "
|
||||
"for the guidance backend.")
|
||||
|
||||
@deprecated(
|
||||
"Passing guided decoding backend options inside backend in the format "
|
||||
"'backend:...' is deprecated. This will be removed in v0.10.0. Please "
|
||||
"use the dedicated arguments '--disable-fallback', "
|
||||
"'--disable-any-whitespace' and '--disable-additional-properties' "
|
||||
"instead.")
|
||||
def _extract_backend_options(self):
|
||||
"""Extract backend options from the backend string."""
|
||||
backend, options = self.backend.split(":")
|
||||
self.backend = cast(GuidedDecodingBackend, backend)
|
||||
options_set = set(options.strip().split(","))
|
||||
if "no-fallback" in options_set:
|
||||
self.disable_fallback = True
|
||||
if "disable-any-whitespace" in options_set:
|
||||
self.disable_any_whitespace = True
|
||||
if "no-additional-properties" in options_set:
|
||||
self.disable_additional_properties = True
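A standalone illustration of the deprecated "backend:opt1,opt2" string that _extract_backend_options unpacks; the backend name here is illustrative:

backend_str = "xgrammar:no-fallback,disable-any-whitespace"

backend, options = backend_str.split(":")
options_set = {opt.strip() for opt in options.split(",")}

# Same flags the method above derives before the dedicated disable-* arguments
# named in its deprecation message replace this format.
disable_fallback = "no-fallback" in options_set
disable_any_whitespace = "disable-any-whitespace" in options_set
disable_additional_properties = "no-additional-properties" in options_set

print(backend, disable_fallback, disable_any_whitespace)  # xgrammar True True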
|
||||
|
||||
|
||||
DetailedTraceModules = Literal["model", "worker", "all"]
@ -9,6 +9,7 @@ import functools
|
||||
import json
|
||||
import sys
|
||||
import threading
|
||||
import warnings
|
||||
from dataclasses import MISSING, dataclass, fields, is_dataclass
|
||||
from itertools import permutations
|
||||
from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
|
||||
@ -18,7 +19,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
|
||||
import regex as re
|
||||
import torch
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
from typing_extensions import TypeIs
|
||||
from typing_extensions import TypeIs, deprecated
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
@ -31,8 +32,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
ObservabilityConfig, ParallelConfig, PoolerConfig,
|
||||
PrefixCachingHashAlgo, PromptAdapterConfig,
|
||||
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
|
||||
TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
|
||||
get_field)
|
||||
TaskOption, TokenizerMode, TokenizerPoolConfig,
|
||||
VllmConfig, get_attr_docs, get_field)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.plugins import load_general_plugins
|
||||
@ -65,6 +66,9 @@ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
|
||||
|
||||
def _parse_type(val: str) -> T:
|
||||
try:
|
||||
if return_type is json.loads and not re.match(
|
||||
r"(?s)^\s*{.*}\s*$", val):
|
||||
return cast(T, nullable_kvs(val))
|
||||
return return_type(val)
|
||||
except ValueError as e:
|
||||
raise argparse.ArgumentTypeError(
|
||||
@ -90,6 +94,42 @@ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
|
||||
return optional_type(json.loads)(val)
|
||||
|
||||
|
||||
@deprecated(
|
||||
"Passing a JSON argument as a string containing comma separated key=value "
|
||||
"pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
|
||||
"string instead.")
|
||||
def nullable_kvs(val: str) -> dict[str, int]:
|
||||
"""Parses a string containing comma separate key [str] to value [int]
|
||||
pairs into a dictionary.
|
||||
|
||||
Args:
|
||||
val: String value to be parsed.
|
||||
|
||||
Returns:
|
||||
Dictionary with parsed values.
|
||||
"""
|
||||
out_dict: dict[str, int] = {}
|
||||
for item in val.split(","):
|
||||
kv_parts = [part.lower().strip() for part in item.split("=")]
|
||||
if len(kv_parts) != 2:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Each item should be in the form KEY=VALUE")
|
||||
key, value = kv_parts
|
||||
|
||||
try:
|
||||
parsed_value = int(value)
|
||||
except ValueError as exc:
|
||||
msg = f"Failed to parse value of item {key}={value}"
|
||||
raise argparse.ArgumentTypeError(msg) from exc
|
||||
|
||||
if key in out_dict and out_dict[key] != parsed_value:
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"Conflicting values specified for key: {key}")
|
||||
out_dict[key] = parsed_value
|
||||
|
||||
return out_dict
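An illustration of the two spellings involved, using the limit-mm-per-prompt value from the test changes earlier in this diff; the parsing below is a plain restatement of what nullable_kvs does:

import json

# Deprecated comma-separated form that nullable_kvs accepts:
legacy = "image=2,video=1"
parsed = {k.strip(): int(v) for k, v in
          (item.split("=") for item in legacy.split(","))}

# Preferred replacement: pass the value as a JSON string instead,
# e.g. --limit-mm-per-prompt '{"image": 2, "video": 1}'.
preferred = json.loads('{"image": 2, "video": 1}')

assert parsed == preferred == {"image": 2, "video": 1}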
|
||||
|
||||
|
||||
def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
|
||||
"""Check if the type hint is a specific type."""
|
||||
return type_hint is type or get_origin(type_hint) is type
|
||||
@ -139,10 +179,6 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
|
||||
return type_hints
|
||||
|
||||
|
||||
def is_online_quantization(quantization: Any) -> bool:
|
||||
return quantization in ["inc"]
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=30)
|
||||
def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
|
||||
cls_docs = get_attr_docs(cls)
|
||||
@ -337,6 +373,13 @@ class EngineArgs:
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
# The following three fields are deprecated and will be removed in a future
|
||||
# release. Setting them will have no effect. Please remove them from your
|
||||
# configurations.
|
||||
tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
|
||||
tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
|
||||
tokenizer_pool_extra_config: dict = \
|
||||
get_field(TokenizerPoolConfig, "extra_config")
|
||||
limit_mm_per_prompt: dict[str, int] = \
|
||||
get_field(MultiModalConfig, "limit_per_prompt")
|
||||
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
||||
@ -398,6 +441,7 @@ class EngineArgs:
|
||||
|
||||
speculative_config: Optional[Dict[str, Any]] = None
|
||||
|
||||
qlora_adapter_name_or_path: Optional[str] = None
|
||||
show_hidden_metrics_for_version: Optional[str] = \
|
||||
ObservabilityConfig.show_hidden_metrics_for_version
|
||||
otlp_traces_endpoint: Optional[str] = \
|
||||
@ -431,6 +475,7 @@ class EngineArgs:
|
||||
|
||||
additional_config: dict[str, Any] = \
|
||||
get_field(VllmConfig, "additional_config")
|
||||
enable_reasoning: Optional[bool] = None # DEPRECATED
|
||||
reasoning_parser: str = DecodingConfig.reasoning_backend
|
||||
|
||||
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
|
||||
@ -448,6 +493,13 @@ class EngineArgs:
|
||||
if isinstance(self.compilation_config, (int, dict)):
|
||||
self.compilation_config = CompilationConfig.from_cli(
|
||||
str(self.compilation_config))
|
||||
if self.qlora_adapter_name_or_path is not None:
|
||||
warnings.warn(
|
||||
"The `qlora_adapter_name_or_path` is deprecated "
|
||||
"and will be removed in v0.10.0. ",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
# Setup plugins
|
||||
from vllm.plugins import load_general_plugins
|
||||
load_general_plugins()
|
||||
@ -560,6 +612,14 @@ class EngineArgs:
|
||||
**load_kwargs["ignore_patterns"])
|
||||
load_group.add_argument("--use-tqdm-on-load",
|
||||
**load_kwargs["use_tqdm_on_load"])
|
||||
load_group.add_argument(
|
||||
"--qlora-adapter-name-or-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The `--qlora-adapter-name-or-path` has no effect, do not set"
|
||||
" it, and it will be removed in v0.10.0.",
|
||||
deprecated=True,
|
||||
)
|
||||
load_group.add_argument('--pt-load-map-location',
|
||||
**load_kwargs["pt_load_map_location"])
|
||||
|
||||
@ -580,6 +640,15 @@ class EngineArgs:
|
||||
guided_decoding_group.add_argument(
|
||||
"--guided-decoding-disable-additional-properties",
|
||||
**guided_decoding_kwargs["disable_additional_properties"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--enable-reasoning",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
deprecated=True,
|
||||
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
|
||||
"of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
|
||||
"parser backend instead. This flag (`--enable-reasoning`) will be "
|
||||
"removed in v0.10.0. When `--reasoning-parser` is specified, "
|
||||
"reasoning mode is automatically enabled.")
|
||||
guided_decoding_group.add_argument(
|
||||
"--reasoning-parser",
|
||||
# This choices is a special case because it's not static
|
||||
@ -682,6 +751,19 @@ class EngineArgs:
|
||||
cache_group.add_argument("--calculate-kv-scales",
|
||||
**cache_kwargs["calculate_kv_scales"])
|
||||
|
||||
# Tokenizer arguments
|
||||
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
|
||||
tokenizer_group = parser.add_argument_group(
|
||||
title="TokenizerPoolConfig",
|
||||
description=TokenizerPoolConfig.__doc__,
|
||||
)
|
||||
tokenizer_group.add_argument("--tokenizer-pool-size",
|
||||
**tokenizer_kwargs["pool_size"])
|
||||
tokenizer_group.add_argument("--tokenizer-pool-type",
|
||||
**tokenizer_kwargs["pool_type"])
|
||||
tokenizer_group.add_argument("--tokenizer-pool-extra-config",
|
||||
**tokenizer_kwargs["extra_config"])
|
||||
|
||||
# Multimodal related configs
|
||||
multimodal_kwargs = get_kwargs(MultiModalConfig)
|
||||
multimodal_group = parser.add_argument_group(
|
||||
@ -964,8 +1046,6 @@ class EngineArgs:
|
||||
return LoadConfig(
|
||||
load_format=self.load_format,
|
||||
download_dir=self.download_dir,
|
||||
device="cpu"
|
||||
if is_online_quantization(self.quantization) else None,
|
||||
model_loader_extra_config=self.model_loader_extra_config,
|
||||
ignore_patterns=self.ignore_patterns,
|
||||
use_tqdm_on_load=self.use_tqdm_on_load,
|
||||
@ -1365,9 +1445,7 @@ class EngineArgs:
|
||||
supported = False
|
||||
if current_platform.is_rocm() or (
|
||||
current_platform.is_cuda()
|
||||
and current_platform.is_device_capability(100)) or (
|
||||
current_platform.device_name
|
||||
== "hpu"): # handle hpu also for OOT platform
|
||||
and current_platform.is_device_capability(100)):
|
||||
supported = True
|
||||
elif fp8_attention and will_use_fa:
|
||||
from vllm.attention.utils.fa_utils import (
|
||||
@ -1701,12 +1779,7 @@ class EngineArgs:
|
||||
@dataclass
|
||||
class AsyncEngineArgs(EngineArgs):
|
||||
"""Arguments for asynchronous vLLM engine."""
|
||||
# Request logging is disabled by default. ``--disable-log-requests`` is
|
||||
# kept for backwards compatibility but has no effect. ``--enable-legacy-
|
||||
# log-requests`` can be used to restore the previous behaviour of logging
|
||||
# each request.
|
||||
disable_log_requests: bool = True
|
||||
enable_legacy_log_requests: bool = False
|
||||
disable_log_requests: bool = False
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser,
|
||||
@ -1719,34 +1792,10 @@ class AsyncEngineArgs(EngineArgs):
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
parser.add_argument('--disable-log-requests',
|
||||
action='store_true',
|
||||
default=None,
|
||||
help='[DEPRECATED] Request logging is disabled by '
|
||||
'default.')
|
||||
parser.add_argument('--enable-legacy-log-requests',
|
||||
action='store_true',
|
||||
help='Enable legacy request logging behavior.')
|
||||
help='Disable logging requests.')
|
||||
current_platform.pre_register_and_update(parser)
|
||||
return parser
|
||||
|
||||
@classmethod
|
||||
def from_cli_args(cls, args: argparse.Namespace):
|
||||
engine_args = super().from_cli_args(args)
|
||||
|
||||
if args.enable_legacy_log_requests:
|
||||
engine_args.disable_log_requests = False
|
||||
else:
|
||||
if args.disable_log_requests is None:
|
||||
logger.warning(
|
||||
"Request logging is disabled by default. Use "
|
||||
"--enable-legacy-log-requests to restore the previous "
|
||||
"behaviour.")
|
||||
engine_args.disable_log_requests = True
|
||||
else:
|
||||
engine_args.disable_log_requests = True
|
||||
|
||||
engine_args.enable_legacy_log_requests = args.enable_legacy_log_requests
|
||||
return engine_args
|
||||
|
||||
|
||||
def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
|
||||
if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||
|
@ -28,7 +28,6 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
|
||||
ChatCompletionToolMessageParam)
|
||||
from openai.types.chat.chat_completion_content_part_input_audio_param import (
|
||||
InputAudio)
|
||||
from openai.types.responses import ResponseInputImageParam
|
||||
from PIL import Image
|
||||
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
||||
# yapf: enable
|
||||
@ -943,8 +942,6 @@ _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
|
||||
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
|
||||
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
|
||||
|
||||
_ResponsesInputImageParser = TypeAdapter(
|
||||
ResponseInputImageParam).validate_python
|
||||
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
|
||||
|
||||
# Define a mapping from part types to their corresponding parsing functions.
|
||||
@ -956,8 +953,6 @@ MM_PARSER_MAP: dict[
|
||||
lambda part: _TextParser(part).get("text", None),
|
||||
"input_text":
|
||||
lambda part: _TextParser(part).get("text", None),
|
||||
"input_image":
|
||||
lambda part: _ResponsesInputImageParser(part).get("image_url", None),
|
||||
"image_url":
|
||||
lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
|
||||
"image_embeds":
|
||||
@ -1090,8 +1085,10 @@ def _parse_chat_message_content_part(
|
||||
"""
|
||||
if isinstance(part, str): # Handle plain text parts
|
||||
return part
|
||||
|
||||
# Handle structured dictionary parts
|
||||
part_type, content = _parse_chat_message_content_mm_part(part)
|
||||
|
||||
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
|
||||
# content is None, log a warning and skip
|
||||
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
|
||||
@ -1112,7 +1109,7 @@ def _parse_chat_message_content_part(
|
||||
image_content = cast(Image.Image, content)
|
||||
mm_parser.parse_image_pil(image_content)
|
||||
modality = "image"
|
||||
elif part_type in ("image_url", "input_image"):
|
||||
elif part_type == "image_url":
|
||||
str_content = cast(str, content)
|
||||
mm_parser.parse_image(str_content)
|
||||
modality = "image"
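For context, the Responses-style message shape that the new "input_image" branch accepts, matching the payloads used in the Responses API tests earlier in this diff; the URL is a placeholder:

image_url = "https://example.com/example.jpg"  # placeholder

messages = [{
    "role": "user",
    "content": [
        # Routed through _ResponsesInputImageParser / parse_image above.
        {"type": "input_image", "image_url": image_url, "detail": "auto"},
        {"type": "input_text", "text": "What's in this image?"},
    ],
}]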
|
||||
|
@ -67,6 +67,37 @@ class ServeSubcommand(CLISubcommand):
|
||||
help="Start the vLLM OpenAI Compatible API server.",
|
||||
description="Start the vLLM OpenAI Compatible API server.",
|
||||
usage="vllm serve [model_tag] [options]")
|
||||
serve_parser.add_argument("model_tag",
|
||||
type=str,
|
||||
nargs='?',
|
||||
help="The model tag to serve "
|
||||
"(optional if specified in config)")
|
||||
serve_parser.add_argument(
|
||||
"--headless",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Run in headless mode. See multi-node data parallel "
|
||||
"documentation for more details.")
|
||||
serve_parser.add_argument(
|
||||
'--data-parallel-start-rank',
|
||||
'-dpr',
|
||||
type=int,
|
||||
default=0,
|
||||
help="Starting data parallel rank for secondary nodes. "
|
||||
"Requires --headless.")
|
||||
serve_parser.add_argument('--api-server-count',
|
||||
'-asc',
|
||||
type=int,
|
||||
default=1,
|
||||
help='How many API server processes to run.')
|
||||
serve_parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
default='',
|
||||
required=False,
|
||||
help="Read CLI options from a config file. "
|
||||
"Must be a YAML with the following options: "
|
||||
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
|
||||
|
||||
serve_parser = make_arg_parser(serve_parser)
|
||||
show_filtered_argument_or_group_from_help(serve_parser, ["serve"])
|
|
||||
assert_never(generator)
|
||||
|
||||
|
||||
def maybe_register_tokenizer_info_endpoint(args):
|
||||
"""Conditionally register the tokenizer info endpoint if enabled."""
|
||||
if getattr(args, 'enable_tokenizer_info_endpoint', False):
|
||||
|
||||
@router.get("/tokenizer_info")
|
||||
async def get_tokenizer_info(raw_request: Request):
|
||||
"""Get comprehensive tokenizer information."""
|
||||
result = await tokenization(raw_request).get_tokenizer_info()
|
||||
return JSONResponse(content=result.model_dump(),
|
||||
status_code=result.code if isinstance(
|
||||
result, ErrorResponse) else 200)
|
||||
|
||||
|
||||
@router.get("/v1/models")
|
||||
async def show_available_models(raw_request: Request):
|
||||
handler = models(raw_request)
|
||||
@ -1527,6 +1514,8 @@ async def init_app_state(
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
expand_tools_even_if_tool_choice_none=args.
|
||||
expand_tools_even_if_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
@ -1542,6 +1531,8 @@ async def init_app_state(
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
expand_tools_even_if_tool_choice_none=args.
|
||||
expand_tools_even_if_tool_choice_none,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
@ -1553,7 +1544,6 @@ async def init_app_state(
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
) if "generate" in model_config.supported_tasks else None
|
||||
state.openai_serving_pooling = OpenAIServingPooling(
|
||||
@ -1705,7 +1695,6 @@ async def run_server_worker(listen_address,
|
||||
uvicorn_kwargs['log_config'] = log_config
|
||||
|
||||
async with build_async_engine_client(args, client_config) as engine_client:
|
||||
maybe_register_tokenizer_info_endpoint(args)
|
||||
app = build_app(args)
|
||||
|
||||
vllm_config = await engine_client.get_vllm_config()
|
||||
|
@ -182,9 +182,13 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the /get_tokenizer_info endpoint. May expose chat
templates and other tokenizer configuration."""
expand_tools_even_if_tool_choice_none: bool = False
"""Include tool definitions in prompts even when `tool_choice='none'`.

This is a transitional option that will be removed in v0.10.0. In
v0.10.0, tool definitions will always be included regardless of
`tool_choice` setting. Use this flag to test the upcoming behavior
before the breaking change."""

@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@ -195,6 +199,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
# Special case: allowed_origins, allowed_methods, allowed_headers all
# need json.loads type
# Should also remove nargs
print(frontend_kwargs["allowed_origins"])
frontend_kwargs["allowed_origins"]["type"] = json.loads
frontend_kwargs["allowed_methods"]["type"] = json.loads
frontend_kwargs["allowed_headers"]["type"] = json.loads
@ -220,6 +225,11 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
valid_tool_parsers = list(ToolParserManager.tool_parsers.keys())
frontend_kwargs["tool_call_parser"]["choices"] = valid_tool_parsers

# Special case for expand-tools-even-if-tool-choice-none because of
# the deprecation field
frontend_kwargs["expand_tools_even_if_tool_choice_none"]\
["deprecated"] = True

frontend_group = parser.add_argument_group(
title="Frontend",
description=FrontendArgs.__doc__,
@ -238,34 +248,6 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
register all arguments instead of manually enumerating them here. This
avoids code duplication and keeps the argument definitions in one place.
"""
parser.add_argument("model_tag",
type=str,
nargs="?",
help="The model tag to serve "
"(optional if specified in config)")
parser.add_argument(
"--headless",
action="store_true",
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
parser.add_argument(
"--data-parallel-start-rank",
"-dpr",
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
parser.add_argument("--api-server-count",
"-asc",
type=int,
default=1,
help="How many API server processes to run.")
parser.add_argument(
"--config",
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
parser = FrontendArgs.add_cli_args(parser)
parser = AsyncEngineArgs.add_cli_args(parser)

@ -290,15 +290,6 @@ class ResponsesRequest(OpenAIBaseModel):
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."),
)
cache_salt: Optional[str] = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."))
# --8<-- [end:responses-extra-params]

_DEFAULT_SAMPLING_PARAMS = {
@ -360,19 +351,6 @@ class ResponsesRequest(OpenAIBaseModel):
raise ValueError("prompt template is not supported")
return data

@model_validator(mode="before")
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None:
if not envs.VLLM_USE_V1:
raise ValueError(
"Parameter 'cache_salt' is not supported with "
"this instance of vLLM, which uses engine V0.")
if not isinstance(data["cache_salt"],
str) or not data["cache_salt"]:
raise ValueError("Parameter 'cache_salt' must be a "
"non-empty string if provided.")
return data


class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
@ -1026,16 +1004,6 @@ class CompletionRequest(OpenAIBaseModel):
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."))

cache_salt: Optional[str] = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit). Not supported by vLLM engine V0."))

kv_transfer_params: Optional[dict[str, Any]] = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.")
@ -1212,20 +1180,6 @@ class CompletionRequest(OpenAIBaseModel):
"At least one of `prompt` or `prompt_embeds` must be set.")
return data

@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None:
if not envs.VLLM_USE_V1:
raise ValueError(
"Parameter 'cache_salt' is not supported with "
"this instance of vLLM, which uses engine V0.")
if not isinstance(data["cache_salt"],
str) or not data["cache_salt"]:
raise ValueError("Parameter 'cache_salt' must be a "
"non-empty string if provided.")
return data


class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
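The `check_cache_salt_support` validators repeated in the hunks above follow the standard Pydantic v2 "before"-validator pattern. Below is a minimal standalone sketch of that pattern; the `CacheSaltedRequest` model is hypothetical (not a vLLM class) and the engine-version check is omitted.

```python
from typing import Optional

from pydantic import BaseModel, Field, model_validator


class CacheSaltedRequest(BaseModel):
    # Toy stand-in for the request classes shown in the diff above.
    prompt: str
    cache_salt: Optional[str] = Field(default=None)

    @model_validator(mode="before")
    @classmethod
    def check_cache_salt(cls, data):
        # Same rule as in the diff: if a salt is supplied it must be a
        # non-empty string (the V0-engine check is left out here).
        if isinstance(data, dict) and data.get("cache_salt") is not None:
            salt = data["cache_salt"]
            if not isinstance(salt, str) or not salt:
                raise ValueError("Parameter 'cache_salt' must be a "
                                 "non-empty string if provided.")
        return data


# A well-formed salt passes validation; an empty string raises ValueError.
CacheSaltedRequest(prompt="hi", cache_salt="c2FsdC1leGFtcGxlLXNhbHQtZXhhbXBsZQ")
```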
@ -1953,16 +1907,6 @@ class DetokenizeResponse(OpenAIBaseModel):
prompt: str


class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""

model_config = ConfigDict(extra="allow")
tokenizer_class: str


class LoadLoRAAdapterRequest(BaseModel):
lora_name: str
lora_path: str
@ -2027,7 +1971,7 @@ class TranscriptionRequest(OpenAIBaseModel):
"""

stream: Optional[bool] = False
"""When set, it will enable output to be streamed in a similar fashion
"""When set, it will enable output to be streamed in a similar fashion
as the Chat Completion endpoint.
"""
# --8<-- [start:transcription-extra-params]
@ -2289,9 +2233,9 @@ class TranslationRequest(OpenAIBaseModel):
"""

stream: Optional[bool] = False
"""Custom field not present in the original OpenAI definition. When set,
"""Custom field not present in the original OpenAI definition. When set,
it will enable output to be streamed in a similar fashion as the Chat
Completion endpoint.
Completion endpoint.
"""
# Flattened stream option to simplify form data.
stream_include_usage: Optional[bool] = False

@ -63,6 +63,7 @@ class OpenAIServingChat(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
@ -111,6 +112,8 @@ class OpenAIServingChat(OpenAIServing):
raise TypeError("Error: --enable-auto-tool-choice requires "
f"tool_parser:'{tool_parser}' which has not "
"been registered") from e
self.expand_tools_even_if_tool_choice_none = (
expand_tools_even_if_tool_choice_none)

self.enable_prompt_tokens_details = enable_prompt_tokens_details
self.enable_force_include_usage = enable_force_include_usage
@ -179,6 +182,20 @@ class OpenAIServingChat(OpenAIServing):

if request.tools is None:
tool_dicts = None
elif (request.tool_choice == "none"
and not self.expand_tools_even_if_tool_choice_none):
if len(request.tools) > 0:
logger.warning_once(
"Tools are specified but tool_choice is set to 'none' "
"and --expand-tools-even-if-tool-choice-none is not "
"enabled. Tool definitions will be excluded from the "
"prompt. This behavior will change in vLLM v0.10 where "
"tool definitions will be included by default even "
"with tool_choice='none'. To adopt the new behavior "
"now, use --expand-tools-even-if-tool-choice-none. "
"To suppress this warning, either remove tools from "
"the request or set tool_choice to a different value.")
tool_dicts = None
else:
tool_dicts = [tool.model_dump() for tool in request.tools]

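The branch added above decides whether tool definitions reach the prompt. Restated as a free function for readability (`select_tool_dicts` is a hypothetical helper, assuming the tools are Pydantic models exposing `model_dump()`):

```python
import logging
from typing import Any, Optional

logger = logging.getLogger(__name__)


def select_tool_dicts(
        tools: Optional[list[Any]],
        tool_choice: Optional[str],
        expand_tools_even_if_tool_choice_none: bool) -> Optional[list[dict]]:
    """Mirror of the tool_dicts branch in the hunk above (illustrative only)."""
    if tools is None:
        return None
    if tool_choice == "none" and not expand_tools_even_if_tool_choice_none:
        if tools:
            # The real code uses logger.warning_once with a longer message.
            logger.warning(
                "Tools are specified but tool_choice='none'; definitions are "
                "excluded from the prompt until the v0.10 default changes.")
        return None
    return [tool.model_dump() for tool in tools]
```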
@ -23,7 +23,6 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
ErrorResponse,
|
||||
PromptTokenUsageInfo,
|
||||
RequestResponseMetadata,
|
||||
UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import (
|
||||
@ -57,7 +56,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
*,
|
||||
request_logger: Optional[RequestLogger],
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
enable_prompt_tokens_details: bool = False,
|
||||
enable_force_include_usage: bool = False,
|
||||
):
|
||||
super().__init__(engine_client=engine_client,
|
||||
@ -66,7 +64,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids,
|
||||
enable_force_include_usage=enable_force_include_usage)
|
||||
self.enable_prompt_tokens_details = enable_prompt_tokens_details
|
||||
self.default_sampling_params = (
|
||||
self.model_config.get_diff_sampling_param())
|
||||
if self.default_sampling_params:
|
||||
@ -316,8 +313,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
previous_num_tokens = [0] * num_choices * num_prompts
|
||||
has_echoed = [False] * num_choices * num_prompts
|
||||
num_prompt_tokens = [0] * num_prompts
|
||||
num_cached_tokens = None
|
||||
first_iteration = True
|
||||
|
||||
stream_options = request.stream_options
|
||||
if stream_options:
|
||||
@ -333,10 +328,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
prompt_token_ids = res.prompt_token_ids
|
||||
prompt_logprobs = res.prompt_logprobs
|
||||
|
||||
if first_iteration:
|
||||
num_cached_tokens = res.num_cached_tokens
|
||||
first_iteration = False
|
||||
|
||||
if res.prompt is not None:
|
||||
prompt_text = res.prompt
|
||||
else:
|
||||
@ -440,10 +431,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
completion_tokens=total_completion_tokens,
|
||||
total_tokens=total_prompt_tokens + total_completion_tokens)
|
||||
|
||||
if self.enable_prompt_tokens_details and num_cached_tokens:
|
||||
final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
|
||||
cached_tokens=num_cached_tokens)
|
||||
|
||||
if include_usage:
|
||||
final_usage_chunk = CompletionStreamResponse(
|
||||
id=request_id,
|
||||
@ -548,10 +535,6 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
)
|
||||
|
||||
if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
|
||||
usage.prompt_tokens_details = PromptTokenUsageInfo(
|
||||
cached_tokens=final_res.num_cached_tokens)
|
||||
|
||||
request_metadata.final_usage_info = usage
|
||||
|
||||
return CompletionResponse(
|
||||
|
@ -226,7 +226,7 @@ class OpenAIServing:
|
||||
|
||||
def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer:
|
||||
"""
|
||||
Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
|
||||
Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
|
||||
given tokenizer.
|
||||
"""
|
||||
async_tokenizer = self._async_tokenizer_pool.get(tokenizer)
|
||||
@ -811,12 +811,6 @@ class OpenAIServing:
|
||||
prompt_token_ids=request_prompt_text["prompt_token_ids"])
|
||||
for request_prompt_text in request_prompts_text
|
||||
]
|
||||
cache_salt = request.cache_salt if (
|
||||
hasattr(request, "cache_salt")
|
||||
and request.cache_salt is not None) else None
|
||||
if cache_salt:
|
||||
for prompt_text in engine_prompts_text:
|
||||
prompt_text["cache_salt"] = cache_salt
|
||||
|
||||
# This check is equivalent to simply checking if
|
||||
# `request_prompts_embeds` is empty, but it's difficult to propagate
|
||||
@ -834,9 +828,6 @@ class OpenAIServing:
|
||||
prompt_embeds=request_prompt_embeds["prompt_embeds"])
|
||||
for request_prompt_embeds in request_prompts_embeds
|
||||
]
|
||||
if cache_salt:
|
||||
for prompt_embed in engine_prompts_embeds:
|
||||
prompt_embed["cache_salt"] = cache_salt
|
||||
|
||||
request_prompts = request_prompts_embeds + request_prompts_text
|
||||
engine_prompts = engine_prompts_embeds + engine_prompts_text
|
||||
|
@ -51,6 +51,7 @@ class OpenAIServingResponses(OpenAIServing):
return_tokens_as_token_ids: bool = False,
reasoning_parser: str = "",
enable_auto_tools: bool = False,
expand_tools_even_if_tool_choice_none: bool = False,
tool_parser: Optional[str] = None,
enable_prompt_tokens_details: bool = False,
enable_force_include_usage: bool = False,
@ -372,7 +373,7 @@ class OpenAIServingResponses(OpenAIServing):
})

# Append the new input.
# Responses API supports simple text inputs without chat format.
# Reponses API supports simple text inputs without chat format.
if isinstance(request.input, str):
messages.append({"role": "user", "content": request.input})
else:

@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Final, Optional, Union
|
||||
|
||||
from typing import Final, Optional, Union
|
||||
|
||||
import jinja2
|
||||
from fastapi import Request
|
||||
@ -17,13 +17,11 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
|
||||
ErrorResponse,
|
||||
TokenizeChatRequest,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse,
|
||||
TokenizerInfoResponse)
|
||||
TokenizeResponse)
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -157,49 +155,3 @@ class OpenAIServingTokenization(OpenAIServing):
|
||||
input_text = prompt_input["prompt"]
|
||||
|
||||
return DetokenizeResponse(prompt=input_text)
|
||||
|
||||
async def get_tokenizer_info(
|
||||
self, ) -> Union[TokenizerInfoResponse, ErrorResponse]:
|
||||
"""Get comprehensive tokenizer information."""
|
||||
try:
|
||||
tokenizer = await self.engine_client.get_tokenizer()
|
||||
info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
|
||||
return TokenizerInfoResponse(**info)
|
||||
except Exception as e:
|
||||
return self.create_error_response(
|
||||
f"Failed to get tokenizer info: {str(e)}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TokenizerInfo:
|
||||
tokenizer: AnyTokenizer
|
||||
chat_template: Optional[str]
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
"""Return the tokenizer configuration."""
|
||||
return self._get_tokenizer_config()
|
||||
|
||||
def _get_tokenizer_config(self) -> dict[str, Any]:
|
||||
"""Get tokenizer configuration directly from the tokenizer object."""
|
||||
config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})
|
||||
|
||||
# Remove file path fields
|
||||
config.pop("vocab_file", None)
|
||||
config.pop("merges_file", None)
|
||||
|
||||
config = self._make_json_serializable(config)
|
||||
config["tokenizer_class"] = type(self.tokenizer).__name__
|
||||
if self.chat_template:
|
||||
config["chat_template"] = self.chat_template
|
||||
return config
|
||||
|
||||
def _make_json_serializable(self, obj):
|
||||
"""Convert any non-JSON-serializable objects to serializable format."""
|
||||
if hasattr(obj, "content"):
|
||||
return obj.content
|
||||
elif isinstance(obj, dict):
|
||||
return {k: self._make_json_serializable(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, list):
|
||||
return [self._make_json_serializable(item) for item in obj]
|
||||
else:
|
||||
return obj
|
||||
|
@ -112,7 +112,6 @@ class OpenAISpeechToText(OpenAIServing):
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
model_config=self.model_config,
language=lang,
task_type=self.task_type,
request_prompt=request.prompt)

10
vllm/envs.py
Executable file → Normal file
@ -139,8 +139,6 @@ if TYPE_CHECKING:
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
VLLM_USE_CUDNN_PREFILL: bool = False
VLLM_LOOPBACK_IP: str = ""


def get_default_cache_root():
@ -963,17 +961,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),

# Controls whether or not to use cudnn prefill
"VLLM_USE_CUDNN_PREFILL":
lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))),

# If set to 1, use the TRTLLM Decode Attention backend in flashinfer.
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),

# Used to force set up loopback IP
"VLLM_LOOPBACK_IP":
lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
}

# --8<-- [end:env-vars-definition]

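The entries above belong to a lazily evaluated registry of environment-variable getters. A local illustration of the same pattern (the `_env_registry` dict below is a sketch, not the real `vllm.envs` module; the variable names and parse expressions are taken from the hunk):

```python
import os
from typing import Any, Callable

# Each entry is a zero-argument callable, so the environment is consulted
# only at the moment the variable is actually read.
_env_registry: dict[str, Callable[[], Any]] = {
    "VLLM_USE_CUDNN_PREFILL":
    lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))),
    "VLLM_LOOPBACK_IP":
    lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
}

os.environ["VLLM_USE_CUDNN_PREFILL"] = "1"
assert _env_registry["VLLM_USE_CUDNN_PREFILL"]() is True
```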
@ -1172,7 +1172,7 @@ def fused_experts(
allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor:
# For now, disable DeepGemm for small N (<= 512) until better
# permute/unpermute ops are available.
# However, on B200, we use DeepGemm for all cases because they only support
# However, on B200, we use DeepGemm for all cases becuase they only support
# E8M0 scale, which means we requantize the weight and input to the specific
# scale. Fallen back to cutlass or triton for some cases would cause
# accuracy issue.

@ -573,8 +573,8 @@ class MambaMixer2(MambaBase, CustomOp):
x = hidden_states_B_C_p.transpose(
0, 1)  # this is the form that causal-conv see
if mamba2_metadata.cu_seqlen is None:
mamba2_metadata = update_metadata(x, query_start_loc_p,
mamba2_metadata)
mamba2_metadata = update_metadata(
x, attn_metadata.query_start_loc, mamba2_metadata)
hidden_states_B_C_p = causal_conv1d_fn(
x,
conv_weights,
@ -583,7 +583,6 @@ class MambaMixer2(MambaBase, CustomOp):
conv_states=conv_state,
has_initial_state=has_initial_states_p,
cache_indices=state_indices_tensor_p,
metadata=mamba2_metadata,
query_start_loc=query_start_loc_p).transpose(
0, 1)[:num_prefill_tokens]

@ -594,14 +593,9 @@ class MambaMixer2(MambaBase, CustomOp):
initial_states = None
if (has_initial_states_p is not None and prep_initial_states):
# making a copy of the states
if envs.VLLM_USE_V1:
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)
else:
initial_states = torch.where(
has_initial_states_p[:num_prefills, None, None, None],
ssm_state[state_indices_tensor_p], 0)
initial_states = torch.where(
has_initial_states_p[:, None, None, None],
ssm_state[state_indices_tensor_p], 0)

scan_output, varlen_state = mamba_chunk_scan_combined(
hidden_states_p.view(1, num_prefill_tokens,

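The `torch.where` call kept above selects, per prefill request, either its cached SSM state or zeros. A toy-shaped sketch of that broadcast (all tensor shapes here are invented for illustration):

```python
import torch

ssm_state = torch.randn(4, 2, 3, 3)        # (cache slots, heads, head_dim, state); toy sizes
state_indices = torch.tensor([2, 0, 3])    # cache slot chosen for each prefill request
has_initial_state = torch.tensor([True, False, True])

# Broadcasting the flag over the trailing state dims zeroes the state of
# requests that start from scratch and copies the cached state otherwise.
initial_states = torch.where(has_initial_state[:, None, None, None],
                             ssm_state[state_indices], 0)

assert torch.equal(initial_states[1], torch.zeros(2, 3, 3))
```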
@ -55,6 +55,7 @@ def _causal_conv1d_fwd_kernel( # continuous batching
IS_CONTINUOUS_BATCHING: tl.constexpr,
USE_PAD_SLOT: tl.constexpr,
NP2_STATELEN: tl.constexpr,
DECODE_SEQLEN: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
):
@ -415,7 +416,7 @@ def causal_conv1d_fn(
activation = "silu"

args = None
out = torch.empty_like(x)
out = torch.zeros_like(x)
if metadata is not None:
cu_seqlen = metadata.cu_seqlen
nums_dict = metadata.nums_dict
@ -606,6 +607,7 @@ def causal_conv1d_fn(
IS_CONTINUOUS_BATCHING=cache_indices is not None,
USE_PAD_SLOT=pad_slot_id is not None,
NP2_STATELEN=np2_statelen,
DECODE_SEQLEN=1,
#launch_cooperative_grid=True
BLOCK_M=8,
BLOCK_N=256,
@ -663,8 +665,7 @@ def _causal_conv1d_update_kernel(

if IS_CONTINUOUS_BATCHING:
# mask = idx_seq < batch
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(
tl.int64)
conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq)
else:
conv_state_batch_coord = idx_seq
if USE_PAD_SLOT:  # noqa
@ -1,21 +1,22 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
from enum import IntEnum
|
||||
from typing import Callable, Optional, TypeVar, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import PretrainedConfig
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig, PoolerConfig
|
||||
from vllm.model_executor.pooling_metadata import ( # noqa: E501
|
||||
PoolingMetadata as V0PoolingMetadata)
|
||||
from vllm.model_executor.pooling_metadata import PoolingTensors
|
||||
from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
|
||||
from vllm.utils import resolve_obj_by_qualname
|
||||
from vllm.transformers_utils.config import (
|
||||
get_classification_activation_function,
|
||||
get_cross_encoder_activation_function)
|
||||
from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata
|
||||
|
||||
PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata]
|
||||
@ -30,202 +31,140 @@ class PoolingType(IntEnum):
|
||||
MEAN = 4
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ResolvedPoolingConfig:
|
||||
pooling_type: PoolingType
|
||||
class SimplePooler(nn.Module):
|
||||
"""A layer that pools specific information from hidden states.
|
||||
|
||||
normalize: bool
|
||||
softmax: bool
|
||||
step_tag_id: Optional[int]
|
||||
returned_token_ids: Optional[list[int]]
|
||||
This layer does the following:
|
||||
1. Extracts specific tokens or aggregates data based on pooling method.
|
||||
2. Normalizes output if specified.
|
||||
3. Returns structured results as `PoolerOutput`.
|
||||
|
||||
@classmethod
|
||||
def from_config_with_defaults(
|
||||
cls,
|
||||
pooler_config: PoolerConfig,
|
||||
Attributes:
|
||||
pooling_type: The type of pooling to use.
|
||||
normalize: Whether to normalize the pooled data.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def from_pooling_type(
|
||||
pooling_type: PoolingType,
|
||||
*,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
step_tag_id: Optional[int] = None,
|
||||
returned_token_ids: Optional[list[int]] = None,
|
||||
) -> "ResolvedPoolingConfig":
|
||||
return cls(
|
||||
pooling_type=PoolingType[pooler_config.pooling_type]
|
||||
if pooler_config.pooling_type is not None else pooling_type,
|
||||
normalize=pooler_config.normalize
|
||||
if pooler_config.normalize is not None else normalize,
|
||||
softmax=pooler_config.softmax
|
||||
if pooler_config.softmax is not None else softmax,
|
||||
step_tag_id=pooler_config.step_tag_id
|
||||
if pooler_config.step_tag_id is not None else step_tag_id,
|
||||
returned_token_ids=pooler_config.returned_token_ids
|
||||
if pooler_config.returned_token_ids is not None else
|
||||
returned_token_ids,
|
||||
)
|
||||
) -> "SimplePooler":
|
||||
if pooling_type == PoolingType.LAST:
|
||||
assert step_tag_id is None and returned_token_ids is None
|
||||
return LastPool(normalize=normalize, softmax=softmax)
|
||||
if pooling_type == PoolingType.ALL:
|
||||
assert step_tag_id is None and returned_token_ids is None
|
||||
return AllPool(normalize=normalize, softmax=softmax)
|
||||
if pooling_type == PoolingType.CLS:
|
||||
assert step_tag_id is None and returned_token_ids is None
|
||||
return CLSPool(normalize=normalize, softmax=softmax)
|
||||
if pooling_type == PoolingType.MEAN:
|
||||
assert step_tag_id is None and returned_token_ids is None
|
||||
return MeanPool(normalize=normalize, softmax=softmax)
|
||||
if pooling_type == PoolingType.STEP:
|
||||
return StepPool(normalize=normalize,
|
||||
softmax=softmax,
|
||||
step_tag_id=step_tag_id,
|
||||
returned_token_ids=returned_token_ids)
|
||||
|
||||
assert_never(pooling_type)
|
||||
|
||||
def get_prompt_lens(
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> torch.Tensor:
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return pooling_metadata.prompt_lens
|
||||
def __init__(self, *, normalize: bool, softmax: bool) -> None:
|
||||
super().__init__()
|
||||
|
||||
assert isinstance(hidden_states, torch.Tensor)
|
||||
return PoolingTensors.from_pooling_metadata(
|
||||
pooling_metadata, hidden_states.device).prompt_lens
|
||||
self.head = PoolerHead(normalize=normalize, softmax=softmax)
|
||||
|
||||
def get_prompt_lens(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> torch.Tensor:
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return pooling_metadata.prompt_lens
|
||||
assert isinstance(hidden_states, torch.Tensor)
|
||||
return PoolingTensors.from_pooling_metadata(
|
||||
pooling_metadata, hidden_states.device).prompt_lens
|
||||
|
||||
def get_classification_activation_function(config: PretrainedConfig):
|
||||
return PoolerClassify()
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
raise NotImplementedError
|
||||
|
||||
def build_output(self, data: torch.Tensor) -> PoolingSequenceGroupOutput:
|
||||
return PoolingSequenceGroupOutput(data)
|
||||
|
||||
def get_cross_encoder_activation_function(config: PretrainedConfig):
|
||||
function_name: Optional[str] = None
|
||||
if (hasattr(config, "sentence_transformers")
|
||||
and "activation_fn" in config.sentence_transformers):
|
||||
function_name = config.sentence_transformers["activation_fn"]
|
||||
elif (hasattr(config, "sbert_ce_default_activation_function")
|
||||
and config.sbert_ce_default_activation_function is not None):
|
||||
function_name = config.sbert_ce_default_activation_function
|
||||
|
||||
if function_name is not None:
|
||||
assert function_name.startswith("torch.nn.modules."), (
|
||||
"Loading of activation functions is restricted to "
|
||||
"torch.nn.modules for security reasons")
|
||||
fn = resolve_obj_by_qualname(function_name)()
|
||||
return PoolerActivation.wraps(fn)
|
||||
|
||||
return PoolerScore()
|
||||
|
||||
|
||||
def build_output(all_data: torch.Tensor) -> PoolerOutput:
|
||||
all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
|
||||
return PoolerOutput(outputs=all_outputs)
|
||||
|
||||
|
||||
class BasePooler(nn.Module):
|
||||
|
||||
@abstractmethod
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
raise NotImplementedError
|
||||
pooled_data = self.extract_states(hidden_states, pooling_metadata)
|
||||
pooled_data = self.head(pooled_data, pooling_metadata)
|
||||
pooled_outputs = [self.build_output(data) for data in pooled_data]
|
||||
return PoolerOutput(outputs=pooled_outputs)
|
||||
|
||||
|
||||
class PoolingMethod(nn.Module, ABC):
|
||||
class CLSPool(SimplePooler):
|
||||
|
||||
@staticmethod
|
||||
def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod":
|
||||
if pooling_type == PoolingType.LAST:
|
||||
return LastPool()
|
||||
if pooling_type == PoolingType.ALL:
|
||||
return AllPool()
|
||||
if pooling_type == PoolingType.CLS:
|
||||
return CLSPool()
|
||||
if pooling_type == PoolingType.MEAN:
|
||||
return MeanPool()
|
||||
|
||||
raise NotImplementedError(f"Unsupported method: {pooling_type}")
|
||||
|
||||
@abstractmethod
|
||||
def forward_one(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_len: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Note:
|
||||
`prompt_len=None` means `prompt_len=len(hidden_states)`.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def forward_all(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
raise NotImplementedError
|
||||
|
||||
def forward(
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
|
||||
if isinstance(hidden_states, list):
|
||||
return [
|
||||
self.forward_one(h, prompt_len)
|
||||
for h, prompt_len in zip(hidden_states, prompt_lens)
|
||||
]
|
||||
result = []
|
||||
for req_state, prompt_len in zip(hidden_states, prompt_lens):
|
||||
assert prompt_len == req_state.shape[0], \
|
||||
"partial prefill not supported with CLS pooling"
|
||||
result.append(req_state[0])
|
||||
return result
|
||||
|
||||
return self.forward_all(hidden_states, prompt_lens)
|
||||
|
||||
|
||||
class CLSPool(PoolingMethod):
|
||||
|
||||
def forward_one(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_len: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
|
||||
"partial prefill not supported with CLS pooling"
|
||||
|
||||
return hidden_states[0]
|
||||
|
||||
def forward_all(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
first_token_flat_indices = torch.zeros_like(prompt_lens)
|
||||
first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1]
|
||||
return hidden_states[first_token_flat_indices]
|
||||
|
||||
|
||||
class LastPool(PoolingMethod):
|
||||
class LastPool(SimplePooler):
|
||||
|
||||
def forward_one(
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_len: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
return hidden_states[-1]
|
||||
|
||||
def forward_all(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
if isinstance(hidden_states, list):
|
||||
return [h[-1] for h in hidden_states]
|
||||
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
|
||||
last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1
|
||||
return hidden_states[last_token_flat_indices]
|
||||
|
||||
|
||||
class AllPool(PoolingMethod):
|
||||
class AllPool(SimplePooler):
|
||||
|
||||
def forward_one(
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_len: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
|
||||
"partial prefill not supported with ALL pooling"
|
||||
|
||||
return hidden_states
|
||||
|
||||
def forward_all(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
|
||||
if isinstance(hidden_states, list):
|
||||
for req_state, prompt_len in zip(hidden_states, prompt_lens):
|
||||
assert prompt_len == req_state.shape[0], \
|
||||
"partial prefill not supported with ALL pooling"
|
||||
return hidden_states
|
||||
|
||||
offset = 0
|
||||
pooled_data = list[torch.Tensor]()
|
||||
|
||||
for prompt_len in prompt_lens:
|
||||
pooled_data.append(hidden_states[offset:offset + prompt_len])
|
||||
offset += prompt_len
|
||||
@ -233,23 +172,24 @@ class AllPool(PoolingMethod):
|
||||
return pooled_data
|
||||
|
||||
|
||||
class MeanPool(PoolingMethod):
|
||||
class MeanPool(SimplePooler):
|
||||
|
||||
def forward_one(
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_len: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
assert prompt_len is None or prompt_len == hidden_states.shape[0], \
|
||||
"partial prefill not supported with MEAN pooling"
|
||||
|
||||
return hidden_states.mean(dim=0, dtype=torch.float32)
|
||||
|
||||
def forward_all(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
prompt_lens: torch.Tensor,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
|
||||
if isinstance(hidden_states, list):
|
||||
result = []
|
||||
for req_state, prompt_len in zip(hidden_states, prompt_lens):
|
||||
assert prompt_len == req_state.shape[0], \
|
||||
"partial prefill not supported with mean pooling"
|
||||
result.append(torch.mean(req_state, dim=0,
|
||||
dtype=torch.float32))
|
||||
return result
|
||||
|
||||
# Use float32 for torch.cumsum in MeanPool,
|
||||
# otherwise precision will be lost significantly.
|
||||
cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32)
|
||||
@ -263,127 +203,78 @@ class MeanPool(PoolingMethod):
|
||||
hidden_states[start_indices]) / prompt_lens.unsqueeze(1)
|
||||
|
||||
|
||||
_T = TypeVar("_T", torch.Tensor, list[torch.Tensor])
|
||||
class StepPool(SimplePooler):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
step_tag_id: Optional[int] = None,
|
||||
returned_token_ids: Optional[list[int]] = None,
|
||||
):
|
||||
super().__init__(normalize=normalize, softmax=softmax)
|
||||
|
||||
class BasePoolerActivation(nn.Module, ABC):
|
||||
self.step_tag_id = step_tag_id
|
||||
self.returned_token_ids = returned_token_ids
|
||||
|
||||
@abstractmethod
|
||||
def forward(self, pooled_data: _T) -> _T:
|
||||
# shape:
|
||||
# classify (& score) -> (batch_size, num_classes)
|
||||
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
|
||||
# (batch_size, dimensions) or list(dimensions) if using MRL
|
||||
raise NotImplementedError
|
||||
def get_prompt_token_ids(
|
||||
self,
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> list[torch.Tensor]:
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return [
|
||||
pooling_metadata.prompt_token_ids[i, :num]
|
||||
for i, num in enumerate(pooling_metadata.prompt_lens)
|
||||
]
|
||||
return [
|
||||
torch.tensor(seq_data_i.prompt_token_ids)
|
||||
for seq_data_i in pooling_metadata.seq_data.values()
|
||||
]
|
||||
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
prompt_token_ids = self.get_prompt_token_ids(pooling_metadata)
|
||||
|
||||
class PoolerActivation(BasePoolerActivation):
|
||||
pooled_data_lst = list[torch.Tensor]()
|
||||
if isinstance(hidden_states, list):
|
||||
for req_state, prompt_len in zip(hidden_states, prompt_lens):
|
||||
assert prompt_len == req_state.shape[0], \
|
||||
"partial prefill not supported with step pooling"
|
||||
pooled_data_lst = hidden_states
|
||||
else:
|
||||
offset = 0
|
||||
for prompt_len in prompt_lens:
|
||||
pooled_data_i = hidden_states[offset:offset + prompt_len]
|
||||
offset += prompt_len
|
||||
pooled_data_lst.append(pooled_data_i)
|
||||
|
||||
@staticmethod
|
||||
def wraps(module: nn.Module):
|
||||
if isinstance(module, nn.Identity):
|
||||
return PoolerIdentity()
|
||||
if isinstance(module, (nn.Sigmoid, nn.Softmax)):
|
||||
return PoolerClassify()
|
||||
pooled_data = list[torch.Tensor]()
|
||||
returned_token_ids = self.returned_token_ids
|
||||
step_tag_id = self.step_tag_id
|
||||
|
||||
return LambdaPoolerActivation(module)
|
||||
for data, token_id in zip(pooled_data_lst, prompt_token_ids):
|
||||
if returned_token_ids is not None and len(returned_token_ids) > 0:
|
||||
data = data[:, returned_token_ids]
|
||||
|
||||
@abstractmethod
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
def forward(self, pooled_data: _T) -> _T:
|
||||
if isinstance(pooled_data, list):
|
||||
return [self.forward_chunk(data) for data in pooled_data]
|
||||
|
||||
return self.forward_chunk(pooled_data)
|
||||
|
||||
|
||||
class PoolerIdentity(PoolerActivation):
|
||||
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
return pooled_data
|
||||
|
||||
|
||||
class PoolerNormalize(PoolerActivation):
|
||||
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
x = F.normalize(pooled_data.float(), p=2, dim=-1)
|
||||
return x.to(pooled_data.dtype)
|
||||
|
||||
|
||||
class PoolerClassify(PoolerActivation):
|
||||
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
num_labels = pooled_data.shape[-1]
|
||||
if num_labels < 2:
|
||||
return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
|
||||
|
||||
return F.softmax(pooled_data.float(), dim=-1).to(pooled_data.dtype)
|
||||
|
||||
|
||||
class PoolerScore(PoolerActivation):
|
||||
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
num_labels = pooled_data.shape[-1]
|
||||
if num_labels < 2:
|
||||
return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
|
||||
if step_tag_id is not None:
|
||||
data = data[token_id == step_tag_id]
|
||||
pooled_data.append(data)
|
||||
|
||||
return pooled_data
|
||||
|
||||
|
||||
class LambdaPoolerActivation(PoolerActivation):
|
||||
|
||||
def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
|
||||
super().__init__()
|
||||
|
||||
self.fn = fn
|
||||
|
||||
def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
|
||||
return self.fn(pooled_data)
|
||||
|
||||
|
||||
class PoolerHead(nn.Module):
|
||||
|
||||
@classmethod
|
||||
def from_config_with_defaults(
|
||||
cls,
|
||||
pooler_config: PoolerConfig,
|
||||
pooling_type: PoolingType,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
) -> "PoolerHead":
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=pooling_type,
|
||||
normalize=normalize,
|
||||
softmax=softmax,
|
||||
step_tag_id=None,
|
||||
returned_token_ids=None,
|
||||
)
|
||||
|
||||
return cls.from_config(resolved_config)
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "PoolerHead":
|
||||
if pooler_config.normalize and pooler_config.softmax:
|
||||
raise ValueError("`normalize=True` and `softmax=True` should not "
|
||||
"be set together")
|
||||
|
||||
activation: PoolerActivation
|
||||
if pooler_config.normalize:
|
||||
activation = PoolerNormalize()
|
||||
elif pooler_config.softmax:
|
||||
activation = PoolerClassify()
|
||||
else:
|
||||
activation = PoolerIdentity()
|
||||
|
||||
return cls(activation)
|
||||
|
||||
def __init__(self, activation: PoolerActivation) -> None:
|
||||
def __init__(self, *, normalize: bool, softmax: bool) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.activation = activation
|
||||
self.normalize = normalize
|
||||
self.softmax = softmax
|
||||
|
||||
def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
|
||||
pooling_metadata: PoolingMetadata):
|
||||
@ -421,169 +312,60 @@ class PoolerHead(nn.Module):
|
||||
for vecs, d in zip(pooled_data, dimensions_list)
|
||||
]
|
||||
|
||||
return self.activation(pooled_data)
|
||||
if self.normalize:
|
||||
if isinstance(pooled_data, list):
|
||||
pooled_data = [
|
||||
F.normalize(data, p=2, dim=-1) for data in pooled_data
|
||||
]
|
||||
else:
|
||||
pooled_data = F.normalize(pooled_data, p=2, dim=-1)
|
||||
|
||||
if self.softmax:
|
||||
if isinstance(pooled_data, list):
|
||||
pooled_data = [
|
||||
F.softmax(data, dim=-1)
|
||||
if data.shape[-1] >= 2 else F.sigmoid(data)
|
||||
for data in pooled_data
|
||||
]
|
||||
else:
|
||||
if pooled_data.shape[-1] >= 2:
|
||||
pooled_data = F.softmax(pooled_data, dim=-1)
|
||||
else:
|
||||
pooled_data = F.sigmoid(pooled_data)
|
||||
|
||||
class SimplePooler(BasePooler):
|
||||
"""A layer that pools specific information from hidden states.
|
||||
|
||||
This layer does the following:
|
||||
1. Extracts specific tokens or aggregates data based on pooling method.
|
||||
2. Normalizes output if specified.
|
||||
3. Returns structured results as `PoolerOutput`.
|
||||
|
||||
Attributes:
|
||||
pooling_type: The type of pooling to use.
|
||||
normalize: Whether to normalize the pooled data.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def from_config_with_defaults(
|
||||
cls,
|
||||
pooler_config: PoolerConfig,
|
||||
pooling_type: PoolingType,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
) -> "SimplePooler":
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=pooling_type,
|
||||
normalize=normalize,
|
||||
softmax=softmax,
|
||||
)
|
||||
assert resolved_config.pooling_type != PoolingType.STEP
|
||||
|
||||
return cls.from_config(resolved_config)
|
||||
|
||||
@classmethod
|
||||
def from_config(
|
||||
cls,
|
||||
pooler_config: ResolvedPoolingConfig,
|
||||
) -> "SimplePooler":
|
||||
pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type)
|
||||
head = PoolerHead.from_config(pooler_config)
|
||||
|
||||
return cls(pooling, head)
|
||||
|
||||
def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.pooling = pooling
|
||||
self.head = head
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
pooled_data = self.pooling(hidden_states, pooling_metadata)
|
||||
pooled_data = self.head(pooled_data, pooling_metadata)
|
||||
return build_output(pooled_data)
|
||||
|
||||
|
||||
class StepPooler(BasePooler):
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "StepPooler":
|
||||
assert pooler_config.pooling_type == PoolingType.STEP
|
||||
|
||||
return cls(
|
||||
PoolerHead.from_config(pooler_config),
|
||||
step_tag_id=pooler_config.step_tag_id,
|
||||
returned_token_ids=pooler_config.returned_token_ids,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head: PoolerHead,
|
||||
*,
|
||||
step_tag_id: Optional[int] = None,
|
||||
returned_token_ids: Optional[list[int]] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.pooling = AllPool()
|
||||
self.head = head
|
||||
self.step_tag_id = step_tag_id
|
||||
self.returned_token_ids = returned_token_ids
|
||||
|
||||
def get_prompt_token_ids(
|
||||
self,
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> list[torch.Tensor]:
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return [
|
||||
pooling_metadata.prompt_token_ids[i, :num]
|
||||
for i, num in enumerate(pooling_metadata.prompt_lens)
|
||||
]
|
||||
return [
|
||||
torch.tensor(seq_data_i.prompt_token_ids)
|
||||
for seq_data_i in pooling_metadata.seq_data.values()
|
||||
]
|
||||
|
||||
def extract_states(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> Union[list[torch.Tensor], torch.Tensor]:
|
||||
pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
|
||||
prompt_token_ids = self.get_prompt_token_ids(pooling_metadata)
|
||||
|
||||
pooled_data = list[torch.Tensor]()
|
||||
returned_token_ids = self.returned_token_ids
|
||||
step_tag_id = self.step_tag_id
|
||||
|
||||
for data, token_id in zip(pooled_data_lst, prompt_token_ids):
|
||||
if returned_token_ids is not None and len(returned_token_ids) > 0:
|
||||
data = data[:, returned_token_ids]
|
||||
|
||||
if step_tag_id is not None:
|
||||
data = data[token_id == step_tag_id]
|
||||
pooled_data.append(data)
|
||||
|
||||
# shape:
|
||||
# classify (& score) -> (batch_size, num_classes)
|
||||
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
|
||||
# (batch_size, dimensions) or list(dimensions) if using MRL
|
||||
return pooled_data
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
pooled_data = self.extract_states(hidden_states, pooling_metadata)
|
||||
pooled_data = self.head(pooled_data, pooling_metadata)
|
||||
return build_output(pooled_data)
|
||||
|
||||
|
||||
class Pooler(nn.Module):
|
||||
|
||||
@staticmethod
|
||||
@classmethod
|
||||
def from_config_with_defaults(
|
||||
cls,
|
||||
pooler_config: PoolerConfig,
|
||||
pooling_type: PoolingType,
|
||||
normalize: bool,
|
||||
softmax: bool,
|
||||
step_tag_id: Optional[int] = None,
|
||||
returned_token_ids: Optional[list[int]] = None,
|
||||
) -> BasePooler:
|
||||
resolved_config = ResolvedPoolingConfig.from_config_with_defaults(
|
||||
pooler_config=pooler_config,
|
||||
pooling_type=pooling_type,
|
||||
normalize=normalize,
|
||||
softmax=softmax,
|
||||
step_tag_id=step_tag_id,
|
||||
returned_token_ids=returned_token_ids,
|
||||
) -> SimplePooler:
|
||||
return SimplePooler.from_pooling_type(
|
||||
pooling_type=PoolingType[pooler_config.pooling_type]
|
||||
if pooler_config.pooling_type is not None else pooling_type,
|
||||
normalize=pooler_config.normalize
|
||||
if pooler_config.normalize is not None else normalize,
|
||||
softmax=pooler_config.softmax
|
||||
if pooler_config.softmax is not None else softmax,
|
||||
step_tag_id=pooler_config.step_tag_id
|
||||
if pooler_config.step_tag_id is not None else step_tag_id,
|
||||
returned_token_ids=pooler_config.returned_token_ids
|
||||
if pooler_config.returned_token_ids is not None else
|
||||
returned_token_ids,
|
||||
)
|
||||
|
||||
if pooling_type == PoolingType.STEP:
|
||||
return StepPooler.from_config(resolved_config)
|
||||
|
||||
return SimplePooler.from_config(resolved_config)
|
||||
|
||||
|
||||
PoolingFn = Callable[
|
||||
[Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
|
||||
Union[torch.Tensor, list[torch.Tensor]]]
|
||||
ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
|
||||
|
||||
|
||||
class ClassifierPooler(nn.Module):
|
||||
"""A pooling layer for classification tasks.
|
||||
@ -600,39 +382,69 @@ class ClassifierPooler(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
config: ModelConfig,
|
||||
pooling: PoolingFn,
|
||||
classifier: ClassifierFn,
|
||||
act_fn: Optional[PoolerActivation] = None,
|
||||
) -> None:
|
||||
classifier: nn.Module,
|
||||
pooler: Optional[nn.Module] = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.pooling = pooling
|
||||
self.classifier = classifier
|
||||
self.pooler = pooler
|
||||
|
||||
self.classification_act_fn = get_classification_activation_function(
|
||||
config.hf_config) if act_fn is None else act_fn
|
||||
config.hf_config)
|
||||
self.cross_encoder_act_fn = get_cross_encoder_activation_function(
|
||||
config.hf_config) if act_fn is None else act_fn
|
||||
config.hf_config)
|
||||
|
||||
def _get_act_fn(self, use_cross_encoder: bool):
|
||||
return (self.cross_encoder_act_fn
|
||||
if use_cross_encoder else self.classification_act_fn)
|
||||
|
||||
def get_prompt_lens(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> torch.Tensor:
|
||||
if isinstance(pooling_metadata, V1PoolingMetadata):
|
||||
return pooling_metadata.prompt_lens
|
||||
assert isinstance(hidden_states, torch.Tensor)
|
||||
return PoolingTensors.from_pooling_metadata(
|
||||
pooling_metadata, hidden_states.device).prompt_lens
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: Union[torch.Tensor, list[torch.Tensor]],
|
||||
pooling_metadata: PoolingMetadata,
|
||||
) -> PoolerOutput:
|
||||
"""Pools sentence pair scores from the hidden_states."""
|
||||
pooled_data = self.pooling(hidden_states, pooling_metadata)
|
||||
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata)
|
||||
|
||||
# apply classifier once on the full batch if possible
|
||||
if isinstance(pooled_data, torch.Tensor):
|
||||
pooled_output = self.classifier(pooled_data)
|
||||
elif len({data.shape for data in pooled_data}) <= 1:
|
||||
pooled_output = self.classifier(torch.stack(pooled_data))
|
||||
pooled_data = list[torch.Tensor]()
|
||||
if isinstance(hidden_states, list):
|
||||
for req_state, prompt_len in zip(hidden_states, prompt_lens):
|
||||
assert prompt_len == req_state.shape[0], \
|
||||
"partial prefill not supported with classifier"
|
||||
pooled_data = hidden_states
|
||||
else:
|
||||
pooled_output = [self.classifier(data) for data in pooled_data]
|
||||
offset = 0
|
||||
for prompt_len in prompt_lens:
|
||||
pooled_data_i = hidden_states[offset:offset + prompt_len]
|
||||
offset += prompt_len
|
||||
pooled_data.append(pooled_data_i)
|
||||
|
||||
pooled_data_lst = []
|
||||
for pooled_data_i in pooled_data:
|
||||
|
||||
if self.pooler is not None:
|
||||
final_shape_tensor = self.pooler(pooled_data_i)
|
||||
else:
|
||||
final_shape_tensor = self.classifier(pooled_data_i)
|
||||
|
||||
pooled_data_lst.append(final_shape_tensor)
|
||||
|
||||
pooled_output = torch.stack(pooled_data_lst)
|
||||
|
||||
if self.pooler is not None:
|
||||
# apply classifier once on the full batch if possible
|
||||
pooled_output = self.classifier(pooled_output)
|
||||
|
||||
if isinstance(pooling_metadata, V0PoolingMetadata):
|
||||
use_cross_encoder_list = [
|
||||
@ -657,4 +469,5 @@ class ClassifierPooler(nn.Module):
|
||||
pooled_output)
|
||||
])
|
||||
|
||||
return build_output(scores)
|
||||
pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores]
|
||||
return PoolerOutput(outputs=pooled_outputs)
|
||||
|
@ -36,7 +36,6 @@ QuantizationMethods = Literal[
|
||||
"torchao",
|
||||
"auto-round",
|
||||
"rtn",
|
||||
"inc",
|
||||
]
|
||||
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
|
||||
|
||||
@ -105,7 +104,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
|
||||
from .gptq_marlin import GPTQMarlinConfig
|
||||
from .gptq_marlin_24 import GPTQMarlin24Config
|
||||
from .hqq_marlin import HQQMarlinConfig
|
||||
from .inc import INCConfig
|
||||
from .ipex_quant import IPEXConfig
|
||||
from .marlin import MarlinConfig
|
||||
from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
|
||||
@ -146,8 +144,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
|
||||
"moe_wna16": MoeWNA16Config,
|
||||
"torchao": TorchAOConfig,
|
||||
"auto-round": AutoRoundConfig,
|
||||
"rtn": RTNConfig,
|
||||
"inc": INCConfig,
|
||||
"rtn": RTNConfig
|
||||
}
|
||||
# Update the `method_to_config` with customized quantization methods.
|
||||
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
|
||||
@ -160,4 +157,4 @@ __all__ = [
|
||||
"QuantizationMethods",
|
||||
"get_quantization_config",
|
||||
"QUANTIZATION_METHODS",
|
||||
]
|
||||
]
|
@ -929,8 +929,10 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
|
||||
scoring_func=scoring_func,
|
||||
e_score_correction_bias=e_score_correction_bias)
|
||||
|
||||
per_act_token = (
|
||||
self.input_quant.strategy == QuantizationStrategy.TOKEN)
|
||||
a1_scale = layer.w13_input_scale
|
||||
a2_scale = layer.w2_input_scale
|
||||
per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
|
||||
a2_scale.numel() != 1 if a2_scale is not None else False)
|
||||
|
||||
if self.fused_experts is None:
|
||||
# If no modular kernel is provided, use cutlass_moe_fp8
|
||||
@ -948,8 +950,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
|
||||
expert_map=None if self.disable_expert_map else expert_map,
|
||||
w1_scale=layer.w13_weight_scale,
|
||||
w2_scale=layer.w2_weight_scale,
|
||||
a1_scale=layer.w13_input_scale,
|
||||
a2_scale=layer.w2_input_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
else:
|
||||
return self.fused_experts(
|
||||
|
@ -1,61 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Intel Gaudi supports quantization of various modules and functions,
|
||||
# including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`.
|
||||
# During model loading,
|
||||
# INC will patch layers with quantization/dequantization operators.
|
||||
# Meanwhile, INC will convert original weight to target datatype
|
||||
# and loading to target device.
|
||||
# static scaling should be provided through Quant_CONFIG:
|
||||
# `QUANT_CONFIG` is an environment variable,
|
||||
# that points to the measurement or quantization JSON config file.
|
||||
# The measurement configuration file is used during the calibration procedure,
|
||||
# to collect measurements for a given model.
|
||||
# The quantization configuration is used during inference.
|
||||
# For more information, please refer to:
|
||||
# https://docs.habana.ai/en/v1.21.1/PyTorch/vLLM_Inference/vLLM_FP8_Inference.html
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.layer import (
|
||||
FusedMoE, UnquantizedFusedMoEMethod)
|
||||
from vllm.model_executor.layers.linear import (LinearBase,
|
||||
UnquantizedLinearMethod)
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig, QuantizeMethodBase)
|
||||
|
||||
|
||||
class INCConfig(QuantizationConfig):
|
||||
"""Config class for FP8 using Intel Neural Compressor."""
|
||||
|
||||
@classmethod
|
||||
def get_name(cls) -> QuantizationMethods:
|
||||
return "inc"
|
||||
|
||||
@classmethod
|
||||
def get_supported_act_dtypes(cls) -> list[torch.dtype]:
|
||||
return [torch.bfloat16]
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, config: dict[str, Any]) -> "INCConfig":
|
||||
raise AssertionError
|
||||
|
||||
def get_quant_method(self, layer: torch.nn.Module,
|
||||
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||
if isinstance(layer, LinearBase):
|
||||
return UnquantizedLinearMethod()
|
||||
elif isinstance(layer, FusedMoE):
|
||||
return UnquantizedFusedMoEMethod(layer.moe_config)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_min_capability(cls) -> int:
|
||||
raise AssertionError
|
||||
|
||||
@staticmethod
|
||||
def get_config_filenames() -> list[str]:
|
||||
return []
|
@ -378,6 +378,8 @@ def per_token_group_quant_fp8(
|
||||
is supported for now.
|
||||
column_major_scales: Outputs scales in column major.
|
||||
out_q: Optional output tensor. If not provided, function will create.
|
||||
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
|
||||
scaling factor for quantization.
|
||||
Returns:
|
||||
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
|
||||
scaling factor.
|
||||
|
@ -6,12 +6,9 @@ import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import LoadConfig, ModelConfig, VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader.utils import (
|
||||
initialize_model, process_weights_after_loading, set_default_torch_dtype)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class BaseModelLoader(ABC):
|
||||
"""Base class for model loaders."""
|
||||
@ -35,16 +32,11 @@ class BaseModelLoader(ABC):
|
||||
model_config: ModelConfig) -> nn.Module:
|
||||
"""Load a model with the given configurations."""
|
||||
device_config = vllm_config.device_config
|
||||
load_config = vllm_config.load_config
|
||||
load_device = device_config.device if load_config.device is None else \
|
||||
load_config.device
|
||||
target_device = torch.device(load_device)
|
||||
target_device = torch.device(device_config.device)
|
||||
with set_default_torch_dtype(model_config.dtype):
|
||||
with target_device:
|
||||
model = initialize_model(vllm_config=vllm_config,
|
||||
model_config=model_config)
|
||||
|
||||
logger.debug("Loading weights on %s ...", load_device)
|
||||
# Quantization does not happen in `load_weights` but after it
|
||||
self.load_weights(model, model_config)
|
||||
process_weights_after_loading(model, model_config, target_device)
|
||||
|
@@ -152,8 +152,8 @@ def get_quant_config(model_config: ModelConfig,
    quant_cls = get_quantization_config(model_config.quantization)

    # GGUF doesn't have config file
    if model_config.quantization in ("gguf", "inc"):
        return quant_cls()
    if model_config.quantization == "gguf":
        return quant_cls.from_config({})

    # Read the quantization config from the HF model config, if available.
    hf_quant_config = getattr(model_config.hf_config, "quantization_config",

@@ -58,27 +58,22 @@ def _create_pooling_model_cls(
        ) -> None:
            super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)

            self.vllm_config = vllm_config

            # These are not used in pooling models
            for attr in ("lm_head", "logits_processor"):
                if hasattr(self, attr):
                    delattr(self, attr)

            # If the model already defines a pooler instance, don't overwrite it
            if not getattr(self, "_pooler", None):
                self._init_pooler(vllm_config, prefix=prefix)

        def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
            pooler_config = vllm_config.model_config.pooler_config
            assert pooler_config is not None

            self._pooler = Pooler.from_config_with_defaults(
                pooler_config,
                pooling_type=default_pooling_type,
                normalize=default_normalize,
                softmax=default_softmax,
            )
            # If the model already defines a pooler instance, don't overwrite it
            if not getattr(self, "_pooler", None):
                self._pooler = Pooler.from_config_with_defaults(
                    pooler_config,
                    pooling_type=default_pooling_type,
                    normalize=default_normalize,
                    softmax=default_softmax,
                )

        def pooler(
            self,
@@ -170,9 +165,7 @@ def as_seq_cls_model(cls: _T) -> _T:

    # Lazy import
    from vllm.model_executor.layers.linear import RowParallelLinear
    from vllm.model_executor.layers.pooler import (ClassifierPooler,
                                                   PoolerOutput, PoolingType,
                                                   SimplePooler)
    from vllm.model_executor.layers.pooler import PoolerOutput, PoolingType
    from vllm.model_executor.models.interfaces import SupportsCrossEncoding
    from vllm.model_executor.pooling_metadata import PoolingMetadata
    from vllm.sequence import IntermediateTensors

@@ -189,40 +182,30 @@ def as_seq_cls_model(cls: _T) -> _T:
    class ModelForSequenceClassification(ModelForPooling,
                                         SupportsCrossEncoding):

        def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
        def __init__(
            self,
            *,
            vllm_config: "VllmConfig",
            prefix: str = "",
            **kwargs: Any,
        ) -> None:
            super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)

            config = vllm_config.model_config.hf_config
            quant_config = vllm_config.quant_config

            self.score = RowParallelLinear(
                config.hidden_size,
                config.num_labels,
                input_is_parallel=False,
                bias=False,
                params_dtype=torch.float32,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "score"),
            )
            self.vllm_config = vllm_config
            self.task = vllm_config.model_config.task
            self.pooling_type = (
                vllm_config.model_config.pooler_config.pooling_type)

            pooler_config = vllm_config.model_config.pooler_config
            assert pooler_config is not None

            pooler = SimplePooler.from_config_with_defaults(
                pooler_config,
                pooling_type=PoolingType.LAST,
                normalize=False,
                softmax=True,
            )

            self._pooler = ClassifierPooler(
                vllm_config.model_config,
                pooling=pooler.pooling,
                classifier=self._classifier,
                act_fn=pooler.head.activation,
            )

        def _classifier(self, x: torch.Tensor):
            x, _ = self.score(x.float())
            return x
            self.score = RowParallelLinear(config.hidden_size,
                                           config.num_labels,
                                           quant_config=quant_config,
                                           input_is_parallel=False,
                                           bias=False,
                                           prefix=maybe_prefix(
                                               prefix, "score"))

        def forward(
            self,
@@ -239,7 +222,27 @@ def as_seq_cls_model(cls: _T) -> _T:
            hidden_states: Union[torch.Tensor, list[torch.Tensor]],
            pooling_metadata: PoolingMetadata,
        ) -> PoolerOutput:
            return self._pooler(hidden_states, pooling_metadata)

            def get_logits(hidden_states):
                if isinstance(hidden_states, list):
                    logits = [self.score(state)[0] for state in hidden_states]
                else:
                    logits, _ = self.score(hidden_states)
                return logits

            if self.pooling_type == PoolingType.ALL:
                logits = get_logits(hidden_states)
                return self._pooler(logits, pooling_metadata)
            else:
                hidden_states = self._pooler.extract_states(
                    hidden_states, pooling_metadata)
                logits = get_logits(hidden_states)
                pooled_data = self._pooler.head(logits, pooling_metadata)

                pooled_outputs = [
                    self._pooler.build_output(data) for data in pooled_data
                ]
                return PoolerOutput(outputs=pooled_outputs)

        def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
            tokens = getattr(self.config, "classifier_from_token", None)
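The sequence-classification wrapper above composes a pooling step with a float32 score head and a softmax activation. As a rough, self-contained illustration of that composition (plain PyTorch stand-ins, not vLLM's ClassifierPooler or RowParallelLinear):

import torch
import torch.nn as nn

class TinySeqClsHead(nn.Module):
    """Last-token pooling -> fp32 linear score head -> softmax."""

    def __init__(self, hidden_size: int, num_labels: int):
        super().__init__()
        self.score = nn.Linear(hidden_size, num_labels, bias=False,
                               dtype=torch.float32)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: [num_tokens, hidden_size] for a single sequence.
        pooled = hidden_states[-1]            # LAST pooling
        logits = self.score(pooled.float())   # classify in float32
        return torch.softmax(logits, dim=-1)  # cross-encoder style probabilities

head = TinySeqClsHead(hidden_size=32, num_labels=2)
probs = head(torch.randn(5, 32, dtype=torch.bfloat16))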
@@ -47,6 +47,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -484,6 +485,7 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
        else:
            self.lm_head = PPMissingLayer()

        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

@@ -510,6 +512,14 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP):
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Iterable
from typing import Optional, Union
from typing import Optional

import torch
from torch import nn
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler,
                                               PoolingMethod, PoolingType)
                                               PoolingType)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
@@ -84,18 +84,14 @@ class BertPooler(nn.Module):

    def __init__(self, config: BertConfig):
        super().__init__()

        self.pooling = PoolingMethod.from_pooling_type(PoolingType.CLS)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(
        self,
        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
        pooling_metadata: PoolingMetadata,
    ) -> Union[torch.Tensor, list[torch.Tensor]]:
        pooled_output = self.pooling(hidden_states, pooling_metadata)
        pooled_output = self.dense(pooled_output)
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[0, :]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

@@ -476,11 +472,8 @@ class BertForSequenceClassification(nn.Module, SupportsV0Only,
                                   embedding_class=BertEmbedding,
                                   add_pooling_layer=True)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self._pooler = ClassifierPooler(
            vllm_config.model_config,
            pooling=self.bert.pooler,
            classifier=self.classifier,
        )
        self._pooler = ClassifierPooler(vllm_config.model_config,
                                        self.classifier, self.bert.pooler)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        loader = AutoWeightsLoader(self)
@@ -205,19 +205,6 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
    }


class GraniteMoeHybridModelConfig(VerifyAndUpdateConfig):

    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        config = vllm_config.model_config
        config.max_seq_len_to_capture = config.max_model_len
        logger.info(
            "Setting max_seq_len_to_capture to %d "
            "to ensure that CUDA graph capture "
            "covers sequences of length up to max_model_len.",
            config.max_model_len)


class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):

    @classmethod
@@ -310,5 +297,4 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
    "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
    "XLMRobertaModel": JinaRobertaModelConfig,
    "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
    "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig,
}

@@ -36,6 +36,7 @@ from vllm.config import CacheConfig, VllmConfig
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -548,6 +549,7 @@ class GraniteSpeechForConditionalGeneration(
        self.config = config
        self.quant_config = quant_config
        self.cache_config = cache_config
        self.sampler = get_sampler()

        # The language model is typically a Granite LLM
        self.language_model = init_vllm_registered_model(

@@ -9,7 +9,7 @@ import torch.nn as nn

from vllm.config import ModelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.pooler import PoolerHead, PoolerNormalize
from vllm.model_executor.layers.pooler import PoolerHead
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.pooling_metadata import (PoolingMetadata,
                                                  PoolingTensors)
@@ -49,7 +49,7 @@ class GritLMPooler(nn.Module):
        self.embed_pattern_ids = tokens_to_ids(
            ["▁<", "|", "embed", "|", ">", "<0x0A>"])

        self.head = PoolerHead(PoolerNormalize())
        self.head = PoolerHead(normalize=True, softmax=False)

    def _find_array(self, arr: array, target: array, start_idx: int) -> int:
        """

@@ -49,6 +49,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
@@ -660,6 +661,7 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                    config.vocab_size,
                                                    logit_scale)
            self.sampler = get_sampler()
        else:
            self.lm_head = PPMissingLayer()

@@ -683,6 +685,14 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def make_empty_intermediate_tensors(
            self, batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:
@@ -22,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union

import torch
from torch import nn
from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
                          Idefics3Processor)
from transformers import (AddedToken, BatchFeature, Idefics3Config,
                          Idefics3ImageProcessor, Idefics3Processor)

from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import ReplicatedLinear
@@ -199,14 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):

        return grid_w * grid_h + 1

    # TODO: Remove after requiring transformers>=4.52
    def _get_content(self, token: Union[AddedToken, str]) -> str:
        if isinstance(token, str):
            return token

        return token.content

    def _get_image_token(
        self,
        processor: Optional[Idefics3Processor]) -> tuple[str, str, str]:
        if processor is None:
            processor = self.get_hf_processor()

        image_token = processor.image_token
        fake_image_token = processor.fake_image_token
        image_token = self._get_content(processor.image_token)
        fake_image_token = self._get_content(processor.fake_image_token)
        global_image_token = processor.global_image_tag
        return image_token, fake_image_token, global_image_token
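The _get_content helper exists because, depending on the transformers version, the processor's special tokens may be AddedToken objects rather than plain strings. A small hedged sketch of the same normalization, assuming the tokenizers package (whose AddedToken exposes the literal string as .content):

from typing import Union

from tokenizers import AddedToken  # assumption: the tokenizers package is installed

def token_content(token: Union[AddedToken, str]) -> str:
    # Return the literal token string regardless of how it is wrapped.
    return token if isinstance(token, str) else token.content

assert token_content("<image>") == "<image>"
assert token_content(AddedToken("<image>")) == "<image>"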
|
@ -659,7 +659,7 @@ def supports_cross_encoding(
|
||||
def has_step_pooler(model: Union[type[object], object]) -> bool:
|
||||
"""Check if the model uses step pooler."""
|
||||
return is_pooling_model(model) and any(
|
||||
type(module).__name__ == "StepPooler" for module in model.modules())
|
||||
type(module).__name__ == "StepPool" for module in model.modules())
|
||||
|
||||
|
||||
class SupportsQuant:
|
||||
@ -722,8 +722,7 @@ class SupportsTranscription(Protocol):
|
||||
|
||||
@classmethod
|
||||
def get_generation_prompt(cls, audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
model_config: ModelConfig, language: str,
|
||||
stt_config: SpeechToTextConfig, language: str,
|
||||
task_type: str,
|
||||
request_prompt: str) -> PromptType:
|
||||
"""Get the prompt for the ASR model.
|
||||
|
@@ -19,8 +19,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType,
                                               SimplePooler)
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
@@ -565,41 +564,29 @@ class JambaForSequenceClassification(JambaForCausalLM):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        config = vllm_config.model_config.hf_config
        num_labels: int = config.num_labels
        score_bias: bool = getattr(config, 'score_bias', False)

        # TODO: The original reward weights have float32 accuracy data, we
        # would like to load them in fp32 to get that extra precision.
        # Currently weight_loader passes the weight which is already in bf16
        self.score = nn.Linear(
            config.hidden_size,
            num_labels,
            bias=score_bias,
            dtype=torch.float32,
        )
        self.score = nn.Linear(config.hidden_size, num_labels, bias=score_bias)

        pooler_config = vllm_config.model_config.pooler_config
        assert pooler_config is not None

        pooler = SimplePooler.from_config_with_defaults(
        self._pooler = Pooler.from_config_with_defaults(
            pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=False,
        )

        self._pooler = ClassifierPooler(
            vllm_config.model_config,
            pooling=pooler.pooling,
            classifier=self.score,
            act_fn=pooler.head.activation,
        )
            softmax=False)

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        return self._pooler(hidden_states, pooling_metadata)
        hidden_states = hidden_states.float()
        logits = self.score(hidden_states)
        return self._pooler(logits, pooling_metadata)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        # TODO: The reward weights themselves have float32 accuracy data, we
        # would like to load them in fp32 to get that extra precision.
        super().load_weights(weights)
        self.score = self.score.float()
@@ -1,214 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team.
# All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Iterable
from typing import Optional

import torch
import torch.nn as nn

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed.parallel_state import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.quantization.torchao import TorchAOConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama4 import (Llama4DecoderLayer,
                                               Llama4ForCausalLM)
from vllm.model_executor.models.utils import extract_layer_index

from .utils import AutoWeightsLoader, maybe_prefix

logger = init_logger(__name__)


@support_torch_compile
class LlamaModel(nn.Module):

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        start_layer_id: int = 0,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.config = (
            vllm_config.speculative_config.draft_model_config.hf_config)
        self.validate_and_update_config(start_layer_id, quant_config)
        self.vocab_size = self.config.vocab_size
        self.embed_tokens = VocabParallelEmbedding(
            self.config.vocab_size,
            self.config.hidden_size,
            prefix=maybe_prefix(prefix, "embed_tokens"),
        )

        self.layers = nn.ModuleList([
            Llama4DecoderLayer(
                self.config,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
            ) for i in range(self.config.num_hidden_layers)
        ])
        self.fc = torch.nn.Linear(self.config.hidden_size * 2,
                                  self.config.hidden_size,
                                  bias=False)
        self.norm = RMSNorm(self.config.hidden_size,
                            eps=self.config.rms_norm_eps)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.fc(
            torch.cat((input_embeds, hidden_states), dim=-1))
        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states, hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            name = name.removeprefix("model.")
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # if PP disabled then draft will share embed with target
                if get_pp_group().world_size == 1 and \
                    "embed_tokens." in name:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        for name in params_dict:
            # if PP disabled then draft will share embed with target
            if get_pp_group().world_size == 1 and \
                "embed_tokens." in name:
                continue
            assert name in loaded_params, f"{name} is not loaded!"
        return loaded_params

    def validate_and_update_config(
            self,
            start_layer_id: int,
            quant_config: Optional[QuantizationConfig] = None) -> None:
        # yoco and moe is not supported by draft model yet
        assert self.config.yoco_global_kv_layer is None
        assert self.config.yoco_local_kv_layer is None
        assert len(self.config.moe_layers) == 0
        # draft model layer index is increased by start_layer_id,
        # so we need to pad relevant configs accordingly
        self.config.no_rope_layers = [
            0
        ] * start_layer_id + self.config.no_rope_layers
        # currently only TorchAO quantization is supported
        if isinstance(quant_config, TorchAOConfig):

            def pad_layer_name(layer: str) -> str:
                layer_index = extract_layer_index(layer)
                return layer.replace(str(layer_index),
                                     str(layer_index + start_layer_id))

            quant_config.torchao_config.module_fqn_to_config = {
                pad_layer_name(layer): quantization
                for layer, quantization in
                quant_config.torchao_config.module_fqn_to_config.items()
            }


class EagleLlama4ForCausalLM(Llama4ForCausalLM):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        self.config = (
            vllm_config.speculative_config.draft_model_config.hf_config)
        target_layer_num = vllm_config.model_config.get_num_layers(
            vllm_config.parallel_config)
        # draft model quantization config may differ from target model
        quant_config = VllmConfig.get_quantization_config(
            vllm_config.speculative_config.draft_model_config,
            vllm_config.load_config)
        self.model = LlamaModel(vllm_config=vllm_config,
                                prefix="model",
                                start_layer_id=target_layer_num,
                                quant_config=quant_config)
        logit_scale = getattr(self.config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.config.vocab_size,
                                                scale=logit_scale)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return self.model(input_ids, positions, hidden_states)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> None:
        loader = AutoWeightsLoader(
            self,
            # lm_head is tied with target model (Llama4ForCausalLM)
            skip_prefixes=(["lm_head."]),
        )

        model_weights = {}
        weights = [
            self.permute_qk_weight_for_rotary(name, loaded_weight)
            for name, loaded_weight in weights
        ]
        for name, loaded_weight in weights:
            if "lm_head" not in name:
                name = "model." + name
            model_weights[name] = loaded_weight

        loader.load_weights(model_weights.items())
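The draft model removed above follows the EAGLE pattern: the draft token embedding and the target model's hidden state are concatenated and projected back to hidden size by a bias-free linear layer before running the draft decoder layers. A minimal sketch of just that fusion step, with toy dimensions and plain PyTorch:

import torch
import torch.nn as nn

hidden_size, num_tokens = 64, 4
fc = nn.Linear(hidden_size * 2, hidden_size, bias=False)

input_embeds = torch.randn(num_tokens, hidden_size)   # draft token embeddings
target_hidden = torch.randn(num_tokens, hidden_size)  # hidden states from the target model
fused = fc(torch.cat((input_embeds, target_hidden), dim=-1))
assert fused.shape == (num_tokens, hidden_size)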
@@ -36,6 +36,7 @@ from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
@@ -175,6 +176,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = get_sampler()

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

@@ -30,6 +30,7 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -160,6 +161,8 @@ class MiMoMTP(nn.Module):
        self.lm_head = ParallelLMHead(self.config.vocab_size,
                                      self.config.hidden_size)

        self.sampler = get_sampler()

    def forward(
        self,
        input_ids: torch.Tensor,
@@ -184,6 +187,14 @@ class MiMoMTP(nn.Module):
        return self.model.compute_logits(hidden_states, self.lm_head,
                                         sampling_metadata, spec_step_idx)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Optional, Union
from typing import Optional

import torch
from torch import nn
@@ -13,8 +13,7 @@ from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.pooler import (BasePooler, ClassifierPooler,
                                               PoolingMethod, PoolingType)
from vllm.model_executor.layers.pooler import ClassifierPooler
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
@@ -253,13 +252,10 @@ class ModernBertModel(nn.Module):
        return norm_outputs


class ModernBertPooler(BasePooler):
class ModernBertPooler(nn.Module):

    def __init__(self, config: ModernBertConfig):
        super().__init__()

        pooling_type = PoolingType[config.classifier_pooling.upper()]
        self.pooling = PoolingMethod.from_pooling_type(pooling_type)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size,
                               config.classifier_bias)
        self.pooling_type = config.classifier_pooling
@@ -268,12 +264,15 @@ class ModernBertPooler(BasePooler):
                             eps=config.norm_eps,
                             bias=config.norm_bias)

    def forward(
        self,
        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
        pooling_metadata: PoolingMetadata,
    ) -> Union[torch.Tensor, list[torch.Tensor]]:
        pooled_output = self.pooling(hidden_states, pooling_metadata)
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        pooled_output = hidden_states
        if self.pooling_type == "mean":
            pooled_output = pooled_output.mean(dim=0, keepdim=False)
        elif self.pooling_type == "cls":
            pooled_output = pooled_output[0, :]
        else:
            raise ValueError("Pooling type should be either `cls` or `mean`, "
                             f"but got {self.pooling_type}")
        pooled_output = self.norm(self.act(self.dense(pooled_output)))
        return pooled_output

@@ -288,11 +287,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsV0Only,
        self.model = ModernBertModel(vllm_config=vllm_config,
                                     prefix=maybe_prefix(prefix, "modernbert"))
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self._pooler = ClassifierPooler(
            vllm_config.model_config,
            pooling=ModernBertPooler(config),
            classifier=self.classifier,
        )
        self._pooler = ClassifierPooler(vllm_config.model_config,
                                        self.classifier,
                                        ModernBertPooler(config))

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
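ModernBert's pooler above selects between mean pooling and CLS (first-token) pooling before its dense/activation/norm stack. A tiny self-contained sketch of just that selection logic:

import torch

def pool(hidden_states: torch.Tensor, pooling_type: str) -> torch.Tensor:
    # hidden_states: [num_tokens, hidden_size] for one sequence.
    if pooling_type == "mean":
        return hidden_states.mean(dim=0)
    if pooling_type == "cls":
        return hidden_states[0, :]
    raise ValueError(f"Pooling type should be `cls` or `mean`, got {pooling_type}")

states = torch.randn(7, 16)
assert pool(states, "cls").shape == pool(states, "mean").shape == (16,)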
Some files were not shown because too many files have changed in this diff.