Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)

Compare commits: woosuk/mod ... v0.11.1rc0 (86 commits)
SHA1:
b761df963c, 33f6aaf972, 56aafa8c0b, 8d52f2b3a7, 984d18498a, d4d9899860, db1e42f627, bc9d7b5595,
fe6b19c314, 2827b3f4a3, 2b6b1d7809, 633f943e30, b03b1b97f6, dfb9af2014, 19f76ee68e, dd70437a4f,
99b3a504c5, 6e30010d2f, 52621c8f5c, d48f4d6daf, e84e0735c7, 3edf87d25f, 392edee34a, 983056e456,
13dd93c667, 53a30845be, 8b77328ffe, 9fe4c2bdb9, 081b5594a2, 57329a8c01, 8c435c9bce, e71b8e210d,
89fa54e6f7, 3d54bdcb73, 6b0fcbbf43, 0fa673af4c, 3468f17ebe, 71b25b0d48, 0ea80c87d9, b8d9e4a326,
13cc7f5370, 916bd9204d, e04a1b6b21, 2e5df88c92, 0754ac4c49, 03858e6d1c, 532a6cfccb, eb32335e35,
69a8c8e99a, 6c340da4df, 2f17117606, 1e9a77e037, d2af67441d, 0bcc3a160d, 70fbdb26e9, 7f570f1caa,
eaeca3cd7f, 12c1287d64, 17b4c6685c, 3c2b2ccece, 7be9ffcd9f, 393de22d2e, 1260180c67, af4ee63e0e,
bc092ea873, 755ed7b05b, a676e668ee, c85be1f6dd, 845adb3ec6, 90b139cfff, 4492e3a554, 05c19485a5,
52d0cb8458, 5c1e496a75, e7f27ea648, 1f29141258, 6160ba4151, fea8006062, e6750d0b18, 8c853050e7,
f84a472a03, 54e42b72db, 2dda3e35d0, d83f3f7cb3, 302eb941f3, 487745ff49
@@ -58,11 +58,8 @@ function cpu_tests() {
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

    # Note: disable Bart until supports V1
    pytest -x -v -s tests/models/language/generation -m cpu_model \
      --ignore=tests/models/language/generation/test_bart.py
    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
      --ignore=tests/models/language/generation/test_bart.py
    pytest -x -v -s tests/models/language/generation -m cpu_model
    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
@@ -35,7 +35,7 @@ docker run \
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
  VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  cd tests
  pytest -v -s v1/core
  pytest -v -s v1/engine
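The last two commands above reflect the renaming of the Triton attention backend from `TRITON_ATTN_VLLM_V1` to `TRITON_ATTN`. For reference, a minimal sketch of selecting the same backend from Python rather than the shell, reusing the `facebook/opt-125m` model from the CI commands (the environment variable must be set before vLLM initializes):

```python
import os

# New backend name from the hunk above; the old name was TRITON_ATTN_VLLM_V1.
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enforce_eager=True)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```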
@@ -300,10 +300,12 @@ steps:
  - pytest -v -s v1/spec_decode
  - pytest -v -s v1/kv_connector/unit
  - pytest -v -s v1/metrics
  - pytest -v -s v1/test_kv_sharing.py
  - pytest -v -s v1/test_metrics_reader.py
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s v1/test_utils.py
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_metrics_reader.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -770,8 +772,9 @@ steps:
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/audio_language.py --model-type whisper
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
  timeout_in_minutes: 60
@@ -869,25 +872,27 @@ steps:
  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 110min
  timeout_in_minutes: 150
- label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
  - vllm/worker/worker_base.py
  - entrypoints/llm/test_collective_rpc.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -897,20 +902,29 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Model Tests (2 GPUs) # 37min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s models/multimodal/generation/test_maverick.py
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
@@ -13,6 +13,7 @@ build:

mkdocs:
  configuration: mkdocs.yaml
  fail_on_warning: true

# Optionally declare the Python requirements required to build your docs
python:
@@ -1,17 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from unittest import mock

import numpy as np
from tabulate import tabulate

from benchmark_utils import TimeCollector
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.config import (
    CacheConfig,
    DeviceConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
    SchedulerConfig,
    SpeculativeConfig,
    VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner


def main(args):
def benchmark_propose(args):
    rows = []
    for max_ngram in args.max_ngram:
        collector = TimeCollector(TimeCollector.US)
@@ -69,10 +83,88 @@ def main(args):
    )


def benchmark_batched_propose(args):
    NUM_SPECULATIVE_TOKENS_NGRAM = 10
    PROMPT_LOOKUP_MIN = 5
    PROMPT_LOOKUP_MAX = 15
    MAX_MODEL_LEN = int(1e7)
    DEVICE = current_platform.device_type

    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")

    speculative_config = SpeculativeConfig(
        target_model_config=model_config,
        target_parallel_config=ParallelConfig(),
        method="ngram",
        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
        prompt_lookup_max=PROMPT_LOOKUP_MAX,
        prompt_lookup_min=PROMPT_LOOKUP_MIN,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(),
        speculative_config=speculative_config,
        device_config=DeviceConfig(device=current_platform.device_type),
        parallel_config=ParallelConfig(),
        load_config=LoadConfig(),
        scheduler_config=SchedulerConfig(),
    )

    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
    mock_pp_group = mock.MagicMock()
    mock_pp_group.world_size = 1
    with mock.patch(
        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
    ):
        runner = GPUModelRunner(vllm_config, DEVICE)

    # hack max model len
    runner.max_model_len = MAX_MODEL_LEN
    runner.drafter.max_model_len = MAX_MODEL_LEN

    dummy_input_batch = InputBatch(
        max_num_reqs=args.num_req,
        max_model_len=MAX_MODEL_LEN,
        max_num_batched_tokens=args.num_req * args.num_token,
        device=DEVICE,
        pin_memory=False,
        vocab_size=256000,
        block_sizes=[16],
    )
    dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
    dummy_input_batch.spec_decode_unsupported_reqs = ()
    dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
    dummy_input_batch.token_ids_cpu = np.random.randint(
        0, 20, (args.num_req, args.num_token)
    )

    runner.input_batch = dummy_input_batch

    sampled_token_ids = [[0]] * args.num_req

    print("Starting benchmark")
    # first run is warmup so ignore it
    for _ in range(args.num_iteration):
        start = time.time()
        runner.drafter.propose(
            sampled_token_ids,
            dummy_input_batch.req_ids,
            dummy_input_batch.num_tokens_no_spec,
            dummy_input_batch.token_ids_cpu,
            dummy_input_batch.spec_decode_unsupported_reqs,
        )
        end = time.time()
        print(f"Iteration time (s): {end - start}")


def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of N-gram speculative decode drafting"
    )
    parser.add_argument(
        "--batched", action="store_true", help="consider time to prepare batch"
    )  # noqa: E501
    parser.add_argument(
        "--num-iteration",
        type=int,
@@ -105,8 +197,17 @@ def invoke_main() -> None:
        help="Number of speculative tokens to generate",
    )
    args = parser.parse_args()
    main(args)

    if not args.batched:
        benchmark_propose(args)
    else:
        benchmark_batched_propose(args)


"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
"""  # noqa: E501
if __name__ == "__main__":
    invoke_main()  # pragma: no cover
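The `--batched` path above times `runner.drafter.propose(...)` over a whole `InputBatch`. As a conceptual illustration only (not the vLLM API), n-gram/prompt-lookup drafting amounts to matching the most recent n tokens against the earlier context and proposing the tokens that followed the match; a minimal sketch with a hypothetical helper name:

```python
import numpy as np

def ngram_propose(token_ids: np.ndarray, max_ngram: int = 3, k: int = 10) -> list[int]:
    """Toy prompt-lookup drafter: find the last earlier occurrence of the
    trailing n-gram and propose the k tokens that followed it."""
    n_tokens = len(token_ids)
    for n in range(max_ngram, 0, -1):          # prefer longer matches
        if n_tokens <= n:
            continue
        suffix = token_ids[n_tokens - n:]
        # scan earlier positions right-to-left for a matching n-gram
        for start in range(n_tokens - n - 1, -1, -1):
            if np.array_equal(token_ids[start:start + n], suffix):
                follow = token_ids[start + n:start + n + k]
                if len(follow) > 0:
                    return follow.tolist()
    return []  # no match -> no draft tokens

history = np.array([1, 2, 3, 4, 5, 1, 2, 3])
print(ngram_propose(history, max_ngram=3, k=2))  # -> [4, 5]
```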
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_triton_block_scaled_mm,
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv

@@ -158,7 +158,7 @@ def bench_fp8(
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@ -3,6 +3,7 @@
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
import os
|
||||
|
||||
import torch
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
@ -23,21 +24,45 @@ PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
|
||||
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
|
||||
}
|
||||
|
||||
_needs_fbgemm = any(
|
||||
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
|
||||
)
|
||||
if _needs_fbgemm:
|
||||
try:
|
||||
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
|
||||
triton_scale_nvfp4_quant,
|
||||
)
|
||||
except ImportError:
|
||||
print(
|
||||
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
|
||||
"These providers will be skipped. Please install fbgemm_gpu with: "
|
||||
"'pip install fbgemm-gpu-genai' to run them."
|
||||
)
|
||||
# Disable FBGEMM providers so the benchmark can run.
|
||||
for cfg in PROVIDER_CFGS.values():
|
||||
if cfg.get("fbgemm"):
|
||||
cfg["enabled"] = False
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
|
||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
|
||||
# Compute global scale for weight
|
||||
b_amax = torch.abs(b).max().to(torch.float32)
|
||||
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
|
||||
else:
|
||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||
return b_fp4, scale_b_fp4, b_global_scale
|
||||
|
||||
|
||||
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
|
||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
|
||||
|
||||
# Compute global scale for activation
|
||||
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
||||
@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||
|
||||
# Alpha for the GEMM operation
|
||||
alpha = 1.0 / (a_global_scale * b_global_scale)
|
||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
||||
if cfg["no_a_quant"]:
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
|
||||
def run():
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
else:
|
||||
|
||||
def run():
|
||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
||||
return torch.ops.fbgemm.f4f4bf16(
|
||||
a_fp4,
|
||||
b_fp4,
|
||||
scale_a_fp4,
|
||||
scale_b_fp4,
|
||||
global_scale=alpha,
|
||||
use_mx=False,
|
||||
)
|
||||
|
||||
return run
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
# Pre-quantize activation
|
||||
@ -130,10 +184,13 @@ if __name__ == "__main__":
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
||||
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
||||
save_path=save_dir,
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
@ -51,7 +51,7 @@ def calculate_diff(
|
||||
):
|
||||
"""Calculate the difference between Inductor and CUDA implementations."""
|
||||
device = torch.device("cuda")
|
||||
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
|
||||
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
|
||||
|
||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
||||
|
||||
@ -59,23 +59,25 @@ def calculate_diff(
|
||||
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
||||
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
||||
|
||||
out_allclose = lambda o1, o2: torch.allclose(
|
||||
o1.to(torch.float32),
|
||||
o2.to(torch.float32),
|
||||
rtol=1e-3,
|
||||
atol=1e-5,
|
||||
)
|
||||
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
|
||||
|
||||
if (
|
||||
out_allclose(cuda_out, torch_out)
|
||||
and scale_allclose(cuda_scale, torch_scale)
|
||||
and out_allclose(cuda_out, torch_eager_out)
|
||||
and scale_allclose(cuda_scale, torch_eager_scale)
|
||||
):
|
||||
try:
|
||||
torch.testing.assert_close(
|
||||
cuda_out.to(torch.float32),
|
||||
torch_out.to(torch.float32),
|
||||
rtol=1e-3,
|
||||
atol=1e-5,
|
||||
)
|
||||
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
|
||||
torch.testing.assert_close(
|
||||
cuda_out.to(torch.float32),
|
||||
torch_eager_out.to(torch.float32),
|
||||
rtol=1e-3,
|
||||
atol=1e-5,
|
||||
)
|
||||
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
|
||||
print("✅ All implementations match")
|
||||
else:
|
||||
except AssertionError as e:
|
||||
print("❌ Implementations differ")
|
||||
print(e)
|
||||
|
||||
|
||||
configs = []
|
||||
@ -91,7 +93,7 @@ def benchmark_quantization(
|
||||
):
|
||||
device = torch.device("cuda")
|
||||
|
||||
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
|
||||
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
||||
@ -157,21 +159,21 @@ if __name__ == "__main__":
|
||||
)
|
||||
parser.add_argument("-c", "--check", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
|
||||
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hidden-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
|
||||
default=[896, 1024, 2048, 4096, 7168],
|
||||
help="Hidden sizes to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-sizes",
|
||||
type=int,
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
|
||||
default=[1, 16, 128, 512, 1024],
|
||||
help="Batch sizes to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--group-sizes",
|
||||
@ -192,8 +194,8 @@ if __name__ == "__main__":
|
||||
|
||||
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
||||
|
||||
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
|
||||
hidden_sizes = args.hidden_sizes
|
||||
batch_sizes = args.batch_sizes
|
||||
|
||||
if args.group_sizes is not None:
|
||||
group_shapes = []
|
||||
|
@@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(


def make_rand_tensors(
    a_shape: tuple[int],
    b_shape: tuple[int],
    c_shape: tuple[int],
    a_shape: tuple[int, ...],
    b_shape: tuple[int, ...],
    c_shape: tuple[int, ...],
    a_dtype: torch.dtype,
    b_dtype: torch.dtype,
    c_dtype: torch.dtype,
@@ -243,7 +243,7 @@ class OpType(Enum):
        lora_rank: int,
        num_loras: int,
        num_slices: int,
    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Given num_slices, return the shapes of the A, B, and C matrices
        in A x B = C, for the op_type
@@ -8,12 +8,16 @@ import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
    w8a8_triton_block_scaled_mm,
    w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
from vllm.utils.deep_gemm import (
    calc_diff,
    fp8_gemm_nt,
    get_col_major_tma_aligned_tensor,
    per_block_cast_to_fp8,
)


def benchmark_shape(m: int,
@@ -59,7 +63,7 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
        return w8a8_triton_block_scaled_mm(A_vllm,
        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
                                     B_scale_vllm,
@@ -101,6 +101,7 @@ else()
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()

if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -177,8 +178,14 @@ elseif (S390_FOUND)
        "-mzvector"
        "-march=native"
        "-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
    if(RVV_FOUND)
        message(FAIL_ERROR "Can't support rvv now.")
    else()
        list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
    endif()
else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()

#
@@ -14,7 +14,8 @@
  // arm implementation
  #include "cpu_types_arm.hpp"
#else
  #warning "unsupported vLLM cpu implementation"
  #warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
  #include "cpu_types_scalar.hpp"
#endif

#ifdef _OPENMP
csrc/cpu/cpu_types_scalar.hpp (new file, 513 lines)
@@ -0,0 +1,513 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <torch/all.h>
|
||||
#include "float_convert.hpp"
|
||||
|
||||
namespace vec_op {
|
||||
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#ifndef CPU_OP_GUARD
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#else
|
||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||
std::cout << #NAME << " invoked." << std::endl;
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME) \
|
||||
std::cout << #NAME << " exit." << std::endl;
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
#define __max(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define __min(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define __abs(a) ((a) < (0) ? (0 - a) : (a))
|
||||
|
||||
typedef struct f16x8_t {
|
||||
uint16_t val[8];
|
||||
} f16x8_t;
|
||||
|
||||
typedef struct f16x16_t {
|
||||
uint16_t val[16];
|
||||
} f16x16_t;
|
||||
|
||||
typedef struct f16x32_t {
|
||||
uint16_t val[32];
|
||||
} f16x32_t;
|
||||
|
||||
typedef struct f32x4_t {
|
||||
float val[4];
|
||||
} f32x4_t;
|
||||
|
||||
typedef struct f32x8_t {
|
||||
float val[8];
|
||||
} f32x8_t;
|
||||
|
||||
typedef struct f32x16_t {
|
||||
float val[16];
|
||||
} f32x16_t;
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
};
|
||||
}; // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T> > >
|
||||
constexpr void unroll_loop(F&& f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit FP16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
f16x8_t reg;
|
||||
|
||||
explicit BF16Vec8(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f16x16_t reg;
|
||||
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
|
||||
|
||||
void save(void* ptr, const int elem_num) const {
|
||||
int num = __min(elem_num, VEC_ELEM_NUM);
|
||||
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
f16x32_t reg;
|
||||
|
||||
explicit BF16Vec32(const void* ptr)
|
||||
: reg(*reinterpret_cast<const f16x32_t*>(ptr)) {};
|
||||
|
||||
explicit BF16Vec32(f16x32_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec32(BF16Vec8& vec8_data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
|
||||
f32x4_t reg;
|
||||
|
||||
explicit FP32Vec4(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec4(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x4_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec4(f32x4_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
f32x8_t reg;
|
||||
|
||||
explicit FP32Vec8(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x8_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec8(f32x8_t data) : reg(data) {};
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
|
||||
|
||||
explicit FP32Vec8(const FP16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = fp16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec8(const BF16Vec8& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = bf16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
float result = 0;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result += reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = expf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = tanhf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = erf(reg.val[i]);
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] * b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator+(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] + b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator-(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] - b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
FP32Vec8 operator/(const FP32Vec8& b) const {
|
||||
f32x8_t ret;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
ret.val[i] = reg.val[i] / b.reg.val[i];
|
||||
}
|
||||
return FP32Vec8(ret);
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f32x8_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
f32x16_t reg;
|
||||
|
||||
explicit FP32Vec16(float v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = v;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16() {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const float* ptr)
|
||||
: reg(*reinterpret_cast<const f32x16_t*>(ptr)) {};
|
||||
|
||||
explicit FP32Vec16(f32x16_t data) : reg(data) {};
|
||||
|
||||
FP32Vec16(const FP32Vec4& data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec16(const FP32Vec8& data) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
|
||||
}
|
||||
}
|
||||
|
||||
FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
|
||||
|
||||
explicit FP32Vec16(const FP16Vec16& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = fp16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec16& v) {
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = bf16_to_float(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
|
||||
|
||||
FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
|
||||
|
||||
FP32Vec16 operator*(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] * b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator+(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] + b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator-(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] - b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = reg.val[i] / b.reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 max(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 min(const FP32Vec16& b) const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 abs() const {
|
||||
FP32Vec16 result(0.0f);
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result.reg.val[i] = __abs(reg.val[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
float result = 0.0f;
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result += reg.val[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_max() const {
|
||||
float result = reg.val[0];
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result = __max(reg.val[i], result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
float reduce_min() const {
|
||||
float result = reg.val[0];
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
result = __min(reg.val[i], result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <int group_size>
|
||||
float reduce_sub_sum(int idx) {
|
||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||
float sum = 0.0;
|
||||
int start = idx * group_size;
|
||||
int end = (idx + 1) * group_size;
|
||||
|
||||
for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
|
||||
sum += reg.val[start];
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VecType {
|
||||
using vec_type = void;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using vec_t = typename VecType<T>::vec_type;
|
||||
|
||||
template <>
|
||||
struct VecType<float> {
|
||||
using vec_type = FP32Vec8;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecType<c10::Half> {
|
||||
using vec_type = FP16Vec8;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecType<c10::BFloat16> {
|
||||
using vec_type = BF16Vec8;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void storeFP32(float v, T* ptr) {
|
||||
*ptr = v;
|
||||
}
|
||||
|
||||
/*
|
||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
||||
c10::Half __attribute__((__may_alias__)) *v_ptr =
|
||||
reinterpret_cast<c10::Half *>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
|
||||
*/
|
||||
|
||||
template <>
|
||||
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
|
||||
uint16_t fp16 = float_to_fp16(v);
|
||||
*reinterpret_cast<uint16_t*>(ptr) = fp16;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||
c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
|
||||
reinterpret_cast<c10::BFloat16*>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
|
||||
|
||||
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_fp16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
|
||||
acc = acc + a * b;
|
||||
}
|
||||
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
||||
int i = 0;
|
||||
for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
|
||||
reg.val[i] = float_to_bf16(v.reg.val[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
|
||||
|
||||
}; // namespace vec_op
|
csrc/cpu/float_convert.hpp (new file, 106 lines)
@@ -0,0 +1,106 @@
|
||||
|
||||
static float bf16_to_float(uint16_t bf16) {
|
||||
uint32_t bits = static_cast<uint32_t>(bf16) << 16;
|
||||
float fp32;
|
||||
std::memcpy(&fp32, &bits, sizeof(fp32));
|
||||
return fp32;
|
||||
}
|
||||
|
||||
static uint16_t float_to_bf16(float fp32) {
|
||||
uint32_t bits;
|
||||
std::memcpy(&bits, &fp32, sizeof(fp32));
|
||||
return static_cast<uint16_t>(bits >> 16);
|
||||
}
|
||||
|
||||
/************************************************
|
||||
* Copyright (c) 2015 Princeton Vision Group
|
||||
* Licensed under the MIT license.
|
||||
* Codes below copied from
|
||||
* https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
|
||||
*************************************************/
|
||||
static uint16_t float_to_fp16(float fp32) {
|
||||
uint16_t fp16;
|
||||
|
||||
unsigned x;
|
||||
unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
|
||||
unsigned sign, exponent, mantissa;
|
||||
|
||||
std::memcpy(&x, &fp32, sizeof(fp32));
|
||||
u = (x & 0x7fffffff);
|
||||
|
||||
// Get rid of +NaN/-NaN case first.
|
||||
if (u > 0x7f800000) {
|
||||
fp16 = 0x7fffU;
|
||||
return fp16;
|
||||
}
|
||||
|
||||
sign = ((x >> 16) & 0x8000);
|
||||
|
||||
// Get rid of +Inf/-Inf, +0/-0.
|
||||
if (u > 0x477fefff) {
|
||||
fp16 = sign | 0x7c00U;
|
||||
return fp16;
|
||||
}
|
||||
if (u < 0x33000001) {
|
||||
fp16 = (sign | 0x0000);
|
||||
return fp16;
|
||||
}
|
||||
|
||||
exponent = ((u >> 23) & 0xff);
|
||||
mantissa = (u & 0x7fffff);
|
||||
|
||||
if (exponent > 0x70) {
|
||||
shift = 13;
|
||||
exponent -= 0x70;
|
||||
} else {
|
||||
shift = 0x7e - exponent;
|
||||
exponent = 0;
|
||||
mantissa |= 0x800000;
|
||||
}
|
||||
lsb = (1 << shift);
|
||||
lsb_s1 = (lsb >> 1);
|
||||
lsb_m1 = (lsb - 1);
|
||||
|
||||
// Round to nearest even.
|
||||
remainder = (mantissa & lsb_m1);
|
||||
mantissa >>= shift;
|
||||
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
|
||||
++mantissa;
|
||||
if (!(mantissa & 0x3ff)) {
|
||||
++exponent;
|
||||
mantissa = 0;
|
||||
}
|
||||
}
|
||||
|
||||
fp16 = (sign | (exponent << 10) | mantissa);
|
||||
|
||||
return fp16;
|
||||
}
|
||||
|
||||
static float fp16_to_float(uint16_t fp16) {
|
||||
unsigned sign = ((fp16 >> 15) & 1);
|
||||
unsigned exponent = ((fp16 >> 10) & 0x1f);
|
||||
unsigned mantissa = ((fp16 & 0x3ff) << 13);
|
||||
int temp;
|
||||
float fp32;
|
||||
if (exponent == 0x1f) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
|
||||
exponent = 0xff;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71;
|
||||
do {
|
||||
msb = (mantissa & 0x400000);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
|
||||
}
|
||||
} else {
|
||||
exponent += 0x70;
|
||||
}
|
||||
temp = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
std::memcpy(&fp32, &temp, sizeof(temp));
|
||||
return fp32;
|
||||
}
|
@@ -44,6 +44,9 @@ __global__ void moe_align_block_size_kernel(

  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
    if (expert_id >= num_experts) {
      continue;
    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -95,12 +98,15 @@ template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
    size_t numel) {
    size_t numel, int32_t num_experts) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
    if (expert_id >= num_experts) {
      continue;
    }
    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
    sorted_token_ids[rank_post_pad] = i;
  }
@@ -269,7 +275,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(),
            sorted_token_ids.data_ptr<int32_t>(),
            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
      }
    });
}
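The change above makes both kernels skip entries whose `expert_id` falls outside `[0, num_experts)` instead of counting or sorting them. A rough NumPy illustration of the intended bucketing behaviour (conceptual only, not the CUDA code; the helper name is made up):

```python
import numpy as np

def sort_tokens_by_expert(topk_ids: np.ndarray, num_experts: int):
    """Group flattened token indices by expert, ignoring out-of-range expert ids."""
    counts = np.zeros(num_experts, dtype=np.int64)
    buckets: list[list[int]] = [[] for _ in range(num_experts)]
    for token_idx, expert_id in enumerate(topk_ids):
        if expert_id < 0 or expert_id >= num_experts:
            continue  # mirrors the new `expert_id >= num_experts` guard
        counts[expert_id] += 1
        buckets[expert_id].append(token_idx)
    return counts, buckets

counts, buckets = sort_tokens_by_expert(np.array([0, 2, 1, 7, 2]), num_experts=3)
print(counts)   # [1 1 2] -- the id-7 entry is dropped
print(buckets)  # [[0], [2], [1, 4]]
```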
@@ -6,11 +6,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
                    const int64_t rows_per_block);

torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
                       const c10::optional<at::Tensor>& in_bias,
                       const std::optional<at::Tensor>& in_bias,
                       const int64_t CuCount);

void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
               const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
               const int64_t CuCount);

@@ -1271,7 +1271,7 @@ int mindiv(int N, int div1, int div2) {
}

torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
                       const c10::optional<at::Tensor>& in_bias,
                       const std::optional<at::Tensor>& in_bias,
                       const int64_t CuCount) {
  auto M_in = in_a.size(0);
  auto K_in = in_a.size(1);
@@ -1729,7 +1729,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support

void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
               const c10::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
               const int64_t CuCount) {
  static c10::ScalarType kFp8Type = is_fp8_ocp()
@@ -114,9 +114,6 @@ WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
    cp requirements/test.in requirements/cpu-test.in && \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

RUN --mount=type=cache,target=/root/.cache/uv \
@ -65,8 +65,6 @@ ARG PYTORCH_BRANCH
|
||||
ARG PYTORCH_VISION_BRANCH
|
||||
ARG PYTORCH_REPO
|
||||
ARG PYTORCH_VISION_REPO
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
RUN git clone ${PYTORCH_REPO} pytorch
|
||||
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
||||
pip install -r requirements.txt && git submodule update --init --recursive \
|
||||
@ -77,14 +75,20 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
|
||||
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& pip install dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install
|
||||
|
||||
FROM base AS build_fa
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone ${FA_REPO}
|
||||
RUN cd flash-attention \
|
||||
&& git checkout ${FA_BRANCH} \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install \
|
||||
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
||||
|
||||
FROM base AS build_aiter
|
||||
ARG AITER_BRANCH
|
||||
@ -103,6 +107,8 @@ FROM base AS debs
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
@ -111,13 +117,7 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG BASE_IMAGE
|
||||
|
@@ -139,9 +139,9 @@ there is relatively little gain from TP. On the other hand, TP incurs significant
overhead because of all-reduce being performed after every layer.

Given this, it may be advantageous to instead shard the batched input data using TP, essentially
performing batch-level DP. This has been shown to improve the throughput by around 10% for
performing batch-level DP. This has been shown to improve the throughput and TTFT by around 10% for
`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations,
batch-level DP can provide another 40% increase to throughput compared to regular TP.
batch-level DP can provide another 40% improvement compared to regular TP.

Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank,
there will be a minor increase in memory consumption and may cause OOM if you can barely fit the model already.
@@ -172,14 +172,15 @@ Batch-level DP needs to be implemented on a per-model basis,
and enabled by setting `supports_encoder_tp_data = True` in the model class.
Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature.

Known supported models:
Known supported models (with corresponding benchmarks):

- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
- dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2.5-VL (<gh-pr:22742>)
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<gh-pr:22697>)
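A minimal offline-inference sketch of enabling batch-level DP through the engine arguments mentioned above (the model name is only an illustrative pick from the supported list; `tensor_parallel_size` should match your GPU count):

```python
from vllm import LLM

# Assumes a multi-modal model whose encoder supports batch-level DP
# (see the "Known supported models" list above).
llm = LLM(
    model="Qwen/Qwen2.5-VL-7B-Instruct",
    tensor_parallel_size=8,
    mm_encoder_tp_mode="data",  # shard the encoder's batch instead of its weights
)
```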

## Input Processing
@@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disaggregated
Install the NIXL library: `uv pip install nixl`, as a quick start.

- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
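For context, once NIXL is installed the connector is selected through vLLM's KV-transfer configuration. A rough sketch, assuming the `KVTransferConfig` engine argument and the connector name used on this page:

```python
from vllm import LLM
from vllm.config import KVTransferConfig

# Sketch only: kv_role is typically "kv_producer" (prefill), "kv_consumer"
# (decode), or "kv_both", depending on the role of this instance.
llm = LLM(
    model="facebook/opt-125m",
    kv_transfer_config=KVTransferConfig(
        kv_connector="NixlConnector",
        kv_role="kv_both",
    ),
)
```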

### Transport Configuration

@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \

Refer to these example scripts in the vLLM repository:

- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)
@@ -6,6 +6,17 @@ vLLM supports the generation of structured outputs using
This document shows you some examples of the different options that are
available to generate structured outputs.

!!! warning
    If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:

    - `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
    - `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
    - `guided_choice` -> `{"structured_outputs": {"choice": ...}}` or `StructuredOutputsParams(choice=...)`
    - `guided_grammar` -> `{"structured_outputs": {"grammar": ...}}` or `StructuredOutputsParams(grammar=...)`
    - `guided_whitespace_pattern` -> `{"structured_outputs": {"whitespace_pattern": ...}}` or `StructuredOutputsParams(whitespace_pattern=...)`
    - `structural_tag` -> `{"structured_outputs": {"structural_tag": ...}}` or `StructuredOutputsParams(structural_tag=...)`
    - `guided_decoding_backend` -> Remove this field from your request

## Online Serving (OpenAI API)

You can generate structured outputs using OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) APIs.
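As a minimal illustration of the non-deprecated field names above, a choice-constrained chat request through the OpenAI client might look like this (server URL and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}],
    # Replaces the deprecated `guided_choice` field.
    extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
)
print(completion.choices[0].message.content)
```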
@@ -310,6 +310,15 @@ Flags:
* For non-reasoning: `--tool-call-parser hunyuan_a13b`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`

### LongCat-Flash-Chat Models (`longcat`)

Supported models:

* `meituan-longcat/LongCat-Flash-Chat`
* `meituan-longcat/LongCat-Flash-Chat-FP8`

Flags: `--tool-call-parser longcat`

### GLM-4.5 Models (`glm45`)

Supported models:
@@ -20,7 +20,80 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

--8<-- "docs/getting_started/installation/cpu/build.inc.md"
Install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:

```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```

Clone the vLLM project:

```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```

Install the required dependencies:

```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```

??? console "pip"
    ```bash
    pip install --upgrade pip
    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
    ```

Build and install vLLM:

```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```

If you want to develop vLLM, install it in editable mode instead.

```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```

Optionally, build a portable wheel which you can then install elsewhere:

```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel
```

```bash
uv pip install dist/*.whl
```

??? console "pip"
    ```bash
    VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
    ```

    ```bash
    pip install dist/*.whl
    ```
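After installing the wheel, a quick way to confirm the CPU build works is a tiny offline generation run (a sketch; `facebook/opt-125m` is just the small model used elsewhere in these docs):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # runs on the CPU backend just built
out = llm.generate(["The capital of France is"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```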
|
||||
|
||||
!!! example "Troubleshooting"
|
||||
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
|
||||
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
|
||||
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
|
||||
```toml title="pyproject.toml"
|
||||
[build-system]
|
||||
requires = [
|
||||
"cmake>=3.26.1",
|
||||
...
|
||||
"torch==X.Y.Z+cpu" # <-------
|
||||
]
|
||||
```
|
||||
- If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
|
||||
|
||||
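        For example (a sketch; the model below is arbitrary and the library path assumes the Ubuntu x86_64 layout mentioned above):

        ```bash
        # Preload tcmalloc before starting the server on an x86 machine.
        export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"
        vllm serve facebook/opt-125m
        ```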
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
@ -57,4 +130,4 @@ docker run --rm \

# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]
# --8<-- [end:extra-information]
# --8<-- [end:extra-information]
@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
    for _ in range(max_mocks):
        try:
            # First treat attr as an attr, then as a submodule
            return getattr(importlib.import_module(module), attr,
                           importlib.import_module(f"{module}.{attr}"))
            with patch("importlib.metadata.version", return_value="0.0.0"):
                return getattr(importlib.import_module(module), attr,
                               importlib.import_module(f"{module}.{attr}"))
        except importlib.metadata.PackageNotFoundError as e:
            raise e
        except ModuleNotFoundError as e:
@ -167,5 +168,5 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
        # Specify encoding for building on Windows
        with open(doc_path, "w", encoding="utf-8") as f:
            f.write(parser.format_help())
            f.write(super(type(parser), parser).format_help())
        logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL

In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
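A minimal offline sketch of that flow, using an arbitrary small model (any generative checkpoint supported by vLLM works):

```python
from vllm import LLM, SamplingParams

# Example model; substitute any generative checkpoint supported by vLLM.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["The capital of France is"], params)
for out in outputs:
    # Each output carries the text sampled from the token log probabilities.
    print(out.outputs[0].text)
```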
## Configuration
@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs. Support for video i

If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models), then, when used with the Transformers backend, it will be compatible with the following features of vLLM:

- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
    - Pipeline parallel
    - Tensor parallel
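As an illustration of combining the Transformers backend with one of the parallelisation schemes listed above (a sketch; the `model_impl="transformers"` argument, model name, and GPU count are assumptions about your environment):

```python
from vllm import LLM

# Illustrative: force the Transformers backend and shard the model across
# 2 GPUs with tensor parallelism (requires 2 visible GPUs).
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    model_impl="transformers",
    tensor_parallel_size=2,
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```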
@ -428,6 +428,7 @@ th {
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | ✅︎ |

Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
@ -1,6 +1,6 @@
# Using vLLM

First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.

Then, vLLM supports the following usage patterns:
@ -87,6 +87,7 @@ def main(args: dict):
        use_tqdm=False,
        chat_template=chat_template,
    )
    print_outputs(outputs)


if __name__ == "__main__":
@ -11,9 +11,9 @@ vLLM performance and metrics.

## Dashboard Descriptions

- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
- **performance_statistics.json**: Tracks performance metrics including latency and
  throughput for your vLLM service.
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
- **query_statistics.json**: Tracks query performance, request volume, and key
  performance indicators for your vLLM service.

## Deployment Options

@ -21,9 +21,9 @@ deployment methods:

## Dashboard Descriptions

- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
- **performance_statistics.yaml**: Performance metrics with aggregated latency
  statistics
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
- **query_statistics.yaml**: Query performance and deployment metrics

## Deployment Options
@ -1,12 +1,11 @@
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
# see https://github.com/pytorch/pytorch/pull/151218
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
wheel
jinja2>=3.1.6
regex
@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@ -23,7 +23,7 @@ datasets # for benchmark scripts

# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64"
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.

# Use this to gather CPU info and optimize based on ARM Neoverse cores
@ -103,7 +103,7 @@ backend_configs = {
|
||||
# Triton Attention
|
||||
"TritonAttn":
|
||||
BackendConfig(name="TritonAttn",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
}),
|
||||
|
@ -338,7 +338,7 @@ else:
|
||||
@pytest.mark.parametrize("model_name, model_class", MODELS)
|
||||
@pytest.mark.parametrize("backend",
|
||||
[_Backend.FLASHINFER] if current_platform.is_cuda()
|
||||
else [_Backend.TRITON_ATTN_VLLM_V1])
|
||||
else [_Backend.TRITON_ATTN])
|
||||
@pytest.mark.parametrize(
|
||||
"split_attention",
|
||||
[False, True] if current_platform.is_rocm() else [False])
|
||||
|
@ -50,8 +50,11 @@ def test_is_type(type_hint, type, expected):
|
||||
|
||||
@pytest.mark.parametrize(("type_hints", "type", "expected"), [
|
||||
({float, int}, int, True),
|
||||
({int, tuple}, int, True),
|
||||
({int, tuple[int]}, int, True),
|
||||
({int, tuple[int, ...]}, int, True),
|
||||
({int, tuple[int]}, float, False),
|
||||
({int, tuple[int, ...]}, float, False),
|
||||
({str, Literal["x", "y"]}, Literal, True),
|
||||
])
|
||||
def test_contains_type(type_hints, type, expected):
|
||||
|
@ -15,7 +15,7 @@ from transformers import AutoConfig
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
|
||||
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
||||
|
||||
@ -27,7 +27,7 @@ def default_server_args() -> list[str]:
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
@ -36,6 +36,27 @@ def default_server_args() -> list[str]:
|
||||
]
|
||||
|
||||
|
||||
EXAMPLE_PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"What is an LLM?",
|
||||
]
|
||||
|
||||
|
||||
def _encode_embeds(embeds: torch.Tensor):
|
||||
buffer = io.BytesIO()
|
||||
torch.save(embeds, buffer)
|
||||
return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def example_prompt_embeds(hf_runner):
|
||||
"""Create example embeddings and return them as base64 encoded string."""
|
||||
with hf_runner(MODEL_NAME) as hf_model:
|
||||
example_embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)
|
||||
|
||||
return [_encode_embeds(item) for item in example_embeddings]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module",
|
||||
params=["", "--disable-frontend-multiprocessing"])
|
||||
def server_with_prompt_embeds(default_server_args, request):
|
||||
@ -52,21 +73,16 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
|
||||
yield async_client
|
||||
|
||||
|
||||
def create_dummy_embeds(num_tokens: int = 5) -> str:
|
||||
"""Create dummy embeddings and return them as base64 encoded string."""
|
||||
dummy_embeds = torch.randn(num_tokens, CONFIG.hidden_size)
|
||||
buffer = io.BytesIO()
|
||||
torch.save(dummy_embeds, buffer)
|
||||
return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@pytest.mark.skip("This test is skipped because it is flaky.")
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_completions_with_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
|
||||
example_prompt_embeds,
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
):
|
||||
encoded_embeds, encoded_embeds2 = example_prompt_embeds
|
||||
|
||||
# Test case: Single prompt embeds input
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -77,7 +93,6 @@ async def test_completions_with_prompt_embeds(
|
||||
assert completion.choices[0].prompt_logprobs is None
|
||||
|
||||
# Test case: batch completion with prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -89,7 +104,6 @@ async def test_completions_with_prompt_embeds(
|
||||
assert len(completion.choices[1].text) >= 1
|
||||
|
||||
# Test case: streaming with prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
single_completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -117,7 +131,6 @@ async def test_completions_with_prompt_embeds(
|
||||
assert "".join(chunks) == single_output
|
||||
|
||||
# Test case: batch streaming with prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
stream = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -139,7 +152,6 @@ async def test_completions_with_prompt_embeds(
|
||||
assert len(chunks_stream_embeds[1]) > 0
|
||||
|
||||
# Test case: mixed text and prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion_mixed = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="This is a prompt",
|
||||
@ -184,10 +196,14 @@ async def test_completions_errors_with_prompt_embeds(
|
||||
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int,
|
||||
model_name: str):
|
||||
example_prompt_embeds,
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
logprobs_arg: int,
|
||||
model_name: str,
|
||||
):
|
||||
encoded_embeds, encoded_embeds2 = example_prompt_embeds
|
||||
|
||||
# Test case: Logprobs using prompt_embeds
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -207,7 +223,6 @@ async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
assert len(logprobs.tokens) == 5
|
||||
|
||||
# Test case: Log probs with batch completion and prompt_embeds
|
||||
encoded_embeds2 = create_dummy_embeds()
|
||||
completion = await client_with_prompt_embeds.completions.create(
|
||||
model=model_name,
|
||||
prompt="", # Add empty prompt as required parameter
|
||||
@ -232,9 +247,12 @@ async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_prompt_logprobs_raises_error(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI):
|
||||
example_prompt_embeds,
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
):
|
||||
encoded_embeds, _ = example_prompt_embeds
|
||||
|
||||
with pytest.raises(BadRequestError, match="not compatible"):
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
await client_with_prompt_embeds.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="",
|
||||
|
@ -68,7 +68,7 @@ def default_server_args(with_tool_parser: bool):
|
||||
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
|
||||
default_server_args: list[str]):
|
||||
with monkeypatch_module.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
|
||||
default_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
@ -31,7 +31,7 @@ DEVICE_MLA_BACKENDS = {
|
||||
}
|
||||
|
||||
DEVICE_REGULAR_ATTN_BACKENDS = {
|
||||
"cuda": ["XFORMERS", "FLASHINFER"],
|
||||
"cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
|
||||
"hip": ["ROCM_FLASH"],
|
||||
"cpu": ["TORCH_SDPA"],
|
||||
}
|
||||
@ -86,7 +86,7 @@ def test_env(
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, None, block_size)
|
||||
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
|
||||
assert backend.get_name() == "TORCH_SDPA"
|
||||
|
||||
elif device == "hip":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
@ -125,7 +125,7 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1"
|
||||
expected = name
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
@ -133,7 +133,7 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_ATTN_VLLM_V1"
|
||||
expected = "TRITON_ATTN"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
elif device == "cuda":
|
||||
@ -160,7 +160,7 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = "CUTLASS_MLA_VLLM_V1"
|
||||
expected = "CUTLASS_MLA"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER_MLA":
|
||||
if block_size not in [32, 64]:
|
||||
@ -193,7 +193,7 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1"
|
||||
expected = name
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASH_ATTN_MLA":
|
||||
backend = get_attn_backend(16,
|
||||
@ -210,7 +210,7 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_MLA_VLLM_V1"
|
||||
expected = "TRITON_MLA"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER":
|
||||
backend = get_attn_backend(16,
|
||||
@ -218,25 +218,24 @@ def test_env(
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASHINFER_VLLM_V1"
|
||||
expected = "FLASHINFER"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
elif name == "XFORMERS":
|
||||
backend = get_attn_backend(32,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_VLLM_V1"
|
||||
expected = "XFORMERS"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
backend = get_attn_backend(16,
|
||||
elif name == "FLASH_ATTN":
|
||||
backend = get_attn_backend(32,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
expected = "FLASH_ATTN"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@ -252,7 +251,7 @@ def test_fp32_fallback(
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16)
|
||||
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
|
||||
assert backend.get_name() == "TORCH_SDPA"
|
||||
|
||||
elif device == "cuda":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
# TODO: When testing for v1, pipe in `use_v1` as an argument to
|
||||
# get_attn_backend
|
||||
|
||||
pytest.skip("Skipping as current backend selector does not " \
|
||||
"handle fallbacks when a backend is set via env var.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
|
||||
|
||||
|
@ -28,7 +28,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
# Test standard ROCm attention
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
||||
assert (backend.get_name() == "ROCM_FLASH"
|
||||
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
|
||||
or backend.get_name() == "TRITON_ATTN")
|
||||
|
||||
# MLA test for deepseek related
|
||||
|
||||
@ -40,8 +40,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
16,
|
||||
False,
|
||||
use_mla=True)
|
||||
assert (backend.get_name() == "TRITON_MLA"
|
||||
or backend.get_name() == "TRITON_MLA_VLLM_V1")
|
||||
assert backend.get_name() == "TRITON_MLA"
|
||||
|
||||
# If attention backend is None
|
||||
# If use_mla is true
|
||||
@ -53,8 +52,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
16,
|
||||
False,
|
||||
use_mla=True)
|
||||
assert (backend.get_name() == "TRITON_MLA"
|
||||
or backend.get_name() == "TRITON_MLA_VLLM_V1")
|
||||
assert backend.get_name() == "TRITON_MLA"
|
||||
|
||||
# change the attention backend to AITER MLA
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
|
||||
@ -64,8 +62,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
1,
|
||||
False,
|
||||
use_mla=True)
|
||||
assert (backend.get_name() == "ROCM_AITER_MLA"
|
||||
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
|
||||
assert backend.get_name() == "ROCM_AITER_MLA"
|
||||
|
||||
# If attention backend is None
|
||||
# If use_mla is true
|
||||
@ -79,5 +76,4 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
1,
|
||||
False,
|
||||
use_mla=True)
|
||||
assert (backend.get_name() == "ROCM_AITER_MLA"
|
||||
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
|
||||
assert backend.get_name() == "ROCM_AITER_MLA"
|
||||
|
@ -46,6 +46,8 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
|
||||
# o will have the same shape as q
|
||||
o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
|
||||
|
||||
lse = torch.zeros(B, H_Q, dtype=dtype, device="cuda")
|
||||
|
||||
b_seq_len = torch.full((B, ), seq_len, device="cuda")
|
||||
|
||||
attn_logits = torch.empty(
|
||||
@ -60,6 +62,7 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
|
||||
k_buffer,
|
||||
v_buffer,
|
||||
o,
|
||||
lse,
|
||||
req_to_token,
|
||||
b_seq_len,
|
||||
attn_logits,
|
||||
@ -72,12 +75,14 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
|
||||
v_buffer = v_buffer.view(CACHE_SIZE // PAGE_SIZE, PAGE_SIZE, H_KV, D_V)
|
||||
|
||||
o1 = torch.zeros_like(o)
|
||||
lse1 = torch.zeros_like(lse)
|
||||
|
||||
decode_attention_fwd(
|
||||
q,
|
||||
k_buffer,
|
||||
v_buffer,
|
||||
o1,
|
||||
lse1,
|
||||
req_to_page,
|
||||
b_seq_len,
|
||||
attn_logits,
|
||||
|
@ -60,7 +60,7 @@ TENSORS_SHAPES_FN = [
|
||||
@torch.inference_mode()
|
||||
def test_rotary_embedding(
|
||||
is_neox_style: bool,
|
||||
tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
|
||||
tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
|
@ -7,7 +7,7 @@ import torch.nn.functional as F
|
||||
from einops import rearrange, repeat
|
||||
|
||||
from vllm.model_executor.layers.mamba.ops.ssd_combined import (
|
||||
mamba_chunk_scan_combined)
|
||||
mamba_chunk_scan_combined_varlen)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.attention.backends.mamba2_attn import (
|
||||
_query_start_loc_to_chunk_indices_offsets)
|
||||
@ -185,9 +185,14 @@ def generate_continuous_batched_examples(example_lens_by_batch,
|
||||
IND_S = [x % full_length for x in IND_E]
|
||||
IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]
|
||||
|
||||
# varlen has implicit batch=1
|
||||
dt2 = dt2.squeeze(0)
|
||||
X2 = X2.squeeze(0)
|
||||
B2 = B2.squeeze(0)
|
||||
C2 = C2.squeeze(0)
|
||||
yield ([Y_min[s, IND_S[s]:IND_E[s]]
|
||||
for s in range(num_examples)] if return_naive_ref else None,
|
||||
cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
|
||||
cu_seqlens, seq_idx, (A, dt2, X2, B2, C2))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype",
|
||||
@ -198,7 +203,7 @@ def generate_continuous_batched_examples(example_lens_by_batch,
|
||||
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
itype):
|
||||
|
||||
# this tests the kernels on a single example (no batching)
|
||||
# this tests the kernels on a single example (bs=1)
|
||||
|
||||
# TODO: the bfloat16 case requires higher thresholds. To be investigated
|
||||
|
||||
@ -219,23 +224,40 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
|
||||
Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
|
||||
B, C, chunk_size)
|
||||
|
||||
cu_seqlens = torch.tensor((0, seqlen), device='cuda').cumsum(dim=0)
|
||||
seq_idx = torch.zeros(seqlen, dtype=torch.int32, device=cu_seqlens.device)
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
|
||||
# varlen has implicit batch=1
|
||||
X = X.squeeze(0)
|
||||
dt = dt.squeeze(0)
|
||||
A = A.squeeze(0)
|
||||
B = B.squeeze(0)
|
||||
C = C.squeeze(0)
|
||||
Y = torch.empty_like(X)
|
||||
final_state = mamba_chunk_scan_combined(X,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
return_final_states=True,
|
||||
out=Y)
|
||||
final_state = mamba_chunk_scan_combined_varlen(X,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
out=Y)
|
||||
|
||||
# just test the last in sequence
|
||||
torch.testing.assert_close(Y[:, -1], Y_min[:, -1], atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(Y[-1], Y_min[0, -1], atol=atol, rtol=rtol)
|
||||
|
||||
# just test the last head
|
||||
# NOTE, in the kernel we always cast states to fp32
|
||||
torch.testing.assert_close(final_state[:, -1],
|
||||
torch.testing.assert_close(final_state[:, -1].to(torch.float32),
|
||||
final_state_min[:, -1].to(torch.float32),
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
@ -300,7 +322,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
|
||||
Y = torch.empty_like(X)
|
||||
new_states = mamba_chunk_scan_combined(
|
||||
new_states = mamba_chunk_scan_combined_varlen(
|
||||
X,
|
||||
dt,
|
||||
A,
|
||||
@ -312,7 +334,6 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=states,
|
||||
out=Y,
|
||||
)
|
||||
@ -321,7 +342,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
for i in range(num_examples):
|
||||
|
||||
# just test one dim and dstate
|
||||
Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
|
||||
Y_eg = Y[cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
|
||||
Y_min_eg = Y_min[i][:, 0, 0]
|
||||
torch.testing.assert_close(Y_eg, Y_min_eg, atol=atol, rtol=rtol)
|
||||
|
||||
@ -386,7 +407,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
Y_ref = torch.empty_like(X)
|
||||
state_ref = mamba_chunk_scan_combined(
|
||||
state_ref = mamba_chunk_scan_combined_varlen(
|
||||
X,
|
||||
dt,
|
||||
A,
|
||||
@ -398,7 +419,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=None,
|
||||
out=Y_ref,
|
||||
)
|
||||
@ -414,27 +434,27 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(chunked_seqlens), device=device),
|
||||
chunked_seqlens,
|
||||
output_size=chunked_cu_seqlens[-1]).unsqueeze(0).to(torch.int32)
|
||||
output_size=chunked_cu_seqlens[-1]).to(torch.int32)
|
||||
chunked_input_seq_len = chunked_cu_seqlens[-1]
|
||||
X_chunked = torch.zeros_like(X)[:, :chunked_input_seq_len, ...]
|
||||
dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...]
|
||||
B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...]
|
||||
C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...]
|
||||
X_chunked = torch.zeros_like(X)[:chunked_input_seq_len, ...]
|
||||
dt_chunked = torch.zeros_like(dt)[:chunked_input_seq_len, ...]
|
||||
B_chunked = torch.zeros_like(B)[:chunked_input_seq_len, ...]
|
||||
C_chunked = torch.zeros_like(C)[:chunked_input_seq_len, ...]
|
||||
for i in range(num_sequences):
|
||||
# fmt: off
|
||||
chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...] # noqa: E501
|
||||
chunk_f = lambda x, i: x[cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...] # noqa: E501
|
||||
|
||||
X_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i) # noqa: E501
|
||||
dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i) # noqa: E501
|
||||
B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i) # noqa: E501
|
||||
C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i) # noqa: E501
|
||||
X_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i) # noqa: E501
|
||||
dt_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i) # noqa: E501
|
||||
B_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i) # noqa: E501
|
||||
C_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i) # noqa: E501
|
||||
# fmt: on
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
|
||||
Y_partial = torch.empty_like(X_chunked)
|
||||
partial_state = mamba_chunk_scan_combined(
|
||||
partial_state = mamba_chunk_scan_combined_varlen(
|
||||
X_chunked,
|
||||
dt_chunked,
|
||||
A,
|
||||
@ -446,7 +466,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
seq_idx=chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=None,
|
||||
out=Y_partial,
|
||||
)
|
||||
@ -461,29 +480,28 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
remaining_chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(remaining_chunked_seqlens), device=device),
|
||||
remaining_chunked_seqlens,
|
||||
output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0).to(
|
||||
torch.int32)
|
||||
output_size=remaining_chunked_cu_seqlens[-1]).to(torch.int32)
|
||||
remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
|
||||
# fmt: off
|
||||
remaining_X_chunked = torch.zeros_like(X)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_dt_chunked = torch.zeros_like(dt)[:remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_B_chunked = torch.zeros_like(B)[:remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_C_chunked = torch.zeros_like(C)[:remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
for i in range(num_sequences):
|
||||
remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...] # noqa: E501
|
||||
remaining_chunk_f = lambda x, i: x[cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...] # noqa: E501
|
||||
|
||||
remaining_X_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i) # noqa: E501
|
||||
remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i) # noqa: E501
|
||||
remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i) # noqa: E501
|
||||
remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i) # noqa: E501
|
||||
remaining_X_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i) # noqa: E501
|
||||
remaining_dt_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i) # noqa: E501
|
||||
remaining_B_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i) # noqa: E501
|
||||
remaining_C_chunked[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i) # noqa: E501
|
||||
|
||||
# assert input chunking is correct
|
||||
concat_chunk_f = lambda pt1, pt2, i: torch.cat([
|
||||
pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
|
||||
pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
|
||||
pt1[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
|
||||
pt2[remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
|
||||
],
|
||||
dim=1)
|
||||
concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1) # noqa: E501
|
||||
dim=0)
|
||||
concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=0) # noqa: E501
|
||||
# fmt: on
|
||||
|
||||
assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X)
|
||||
@ -498,7 +516,7 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
remaining_chunked_cu_seqlens[-1])
|
||||
|
||||
Y_chunked = torch.empty_like(remaining_X_chunked)
|
||||
state_chunked = mamba_chunk_scan_combined(
|
||||
state_chunked = mamba_chunk_scan_combined_varlen(
|
||||
remaining_X_chunked,
|
||||
remaining_dt_chunked,
|
||||
A,
|
||||
@ -510,7 +528,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
seq_idx=remaining_chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=partial_state,
|
||||
out=Y_chunked,
|
||||
)
|
||||
@ -518,17 +535,17 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
|
||||
# kernel chunked is same as kernel overall
|
||||
for i in range(num_sequences):
|
||||
Y_seq = Y[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
Y_ref_seq = Y_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
Y_seq = Y[cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
Y_ref_seq = Y_ref[cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
torch.testing.assert_close(
|
||||
Y_seq[:, :chunked_seqlens[i], ...],
|
||||
Y_ref_seq[:, :chunked_seqlens[i], ...],
|
||||
Y_seq[:chunked_seqlens[i], ...],
|
||||
Y_ref_seq[:chunked_seqlens[i], ...],
|
||||
atol=atol,
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} output part1 " + x) # noqa: B023
|
||||
torch.testing.assert_close(
|
||||
Y_seq[:, chunked_seqlens[i]:, ...],
|
||||
Y_ref_seq[:, chunked_seqlens[i]:, ...],
|
||||
Y_seq[chunked_seqlens[i]:, ...],
|
||||
Y_ref_seq[chunked_seqlens[i]:, ...],
|
||||
atol=atol,
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} output part2 " + x) # noqa: B023
|
||||
|
@ -222,7 +222,8 @@ if (has_flashinfer_cutlass_fused_moe()
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501
|
||||
FlashInferExperts)
|
||||
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
|
||||
FlashInferCutlassMoEPrepareAndFinalize)
|
||||
FlashInferCutlassMoEPrepareAndFinalize,
|
||||
create_flashinfer_prepare_finalize)
|
||||
|
||||
register_prepare_and_finalize(
|
||||
FlashInferCutlassMoEPrepareAndFinalize,
|
||||
@ -373,7 +374,7 @@ def make_prepare_finalize(
|
||||
assert prepare_finalize is not None
|
||||
return prepare_finalize
|
||||
elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
|
||||
return FlashInferCutlassMoEPrepareAndFinalize(
|
||||
return create_flashinfer_prepare_finalize(
|
||||
use_dp=moe.moe_parallel_config.dp_size > 1)
|
||||
else:
|
||||
return MoEPrepareAndFinalizeNoEP()
|
||||
|
@ -138,7 +138,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
|
||||
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
hidden_states=td.hidden_states,
|
||||
router_logits=score,
|
||||
use_grouped_topk=False,
|
||||
@ -206,7 +206,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
hidden_states=td.hidden_states,
|
||||
router_logits=score,
|
||||
use_grouped_topk=False,
|
||||
|
@ -11,11 +11,12 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
|
||||
native_w8a8_block_matmul)
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
cutlass_scaled_mm, get_col_major_tma_aligned_tensor,
|
||||
per_token_group_quant_fp8, w8a8_triton_block_scaled_mm)
|
||||
cutlass_scaled_mm, per_token_group_quant_fp8, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_gemm
|
||||
from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
|
||||
from vllm.utils.deep_gemm import (fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8)
|
||||
|
||||
if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
|
||||
@ -90,8 +91,7 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
|
||||
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
|
||||
out_dtype)
|
||||
out = w8a8_triton_block_scaled_mm(A_fp8, B_fp8, As, Bs, block_size,
|
||||
out_dtype)
|
||||
out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
|
||||
|
||||
rel_diff = (torch.mean(
|
||||
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
|
||||
|
@ -20,11 +20,9 @@ from vllm.platforms import current_platform
|
||||
(8, 513, 64), # Non-divisible (native only)
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [42])
|
||||
@pytest.mark.parametrize("use_ue8m0", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
group_size: int, seed: int,
|
||||
use_ue8m0: bool) -> None:
|
||||
group_size: int, seed: int) -> None:
|
||||
"""Test QuantFP8 group quantization with various configurations.
|
||||
|
||||
Tests both CUDA and native implementations, column-major scales,
|
||||
@ -40,8 +38,7 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
group_shape = GroupShape(1, group_size)
|
||||
quant_op = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=False,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=False)
|
||||
|
||||
# 1. Test native implementation (always available)
|
||||
x_quant_native, scales_native = quant_op.forward_native(x.clone())
|
||||
@ -51,15 +48,9 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
# 2. Test column-major scales configuration
|
||||
quant_op_col = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=True,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=True)
|
||||
_, scales_col = quant_op_col.forward_native(x.clone())
|
||||
assert scales_col.shape == (batch_size, expected_num_groups)
|
||||
assert scales_col.stride(0) == 1
|
||||
assert scales_col.stride(1) == batch_size
|
||||
|
||||
# Test column-major scales consistency
|
||||
assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8)
|
||||
assert scales_col.shape == (expected_num_groups, batch_size)
|
||||
|
||||
# 3. Test CUDA implementation (only for divisible dimensions)
|
||||
if is_divisible:
|
||||
@ -77,9 +68,8 @@ def test_quantfp8_group_functionality(batch_size: int, hidden_dim: int,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", [42])
|
||||
@pytest.mark.parametrize("use_ue8m0", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
|
||||
def test_quantfp8_group_multidimensional(seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
group_size = 64
|
||||
@ -92,8 +82,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
|
||||
group_shape = GroupShape(1, group_size)
|
||||
quant_op = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=False,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=False)
|
||||
|
||||
x_quant, scales = quant_op.forward_native(x_3d.clone())
|
||||
assert x_quant.shape == x_3d.shape
|
||||
@ -102,8 +91,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
|
||||
# Test column_major_scales with multi-dim
|
||||
quant_op_col = QuantFP8(static=False,
|
||||
group_shape=group_shape,
|
||||
column_major_scales=True,
|
||||
use_ue8m0=use_ue8m0)
|
||||
column_major_scales=True)
|
||||
_, scales_col = quant_op_col.forward_native(x_3d.clone())
|
||||
assert scales_col.shape == (batch1, hidden_dim // group_size, batch2)
|
||||
|
||||
|
@ -165,7 +165,7 @@ def onednn_gemm_test_helper(primitive_cache_size: int,
|
||||
def test_onednn_int8_scaled_gemm(
|
||||
n: int,
|
||||
k: int,
|
||||
m_list: tuple[int],
|
||||
m_list: tuple[int, ...],
|
||||
per_tensor_a_scale: bool,
|
||||
per_tensor_b_scale: bool,
|
||||
use_bias: bool,
|
||||
@ -196,7 +196,7 @@ def test_onednn_int8_scaled_gemm(
|
||||
def test_onednn_gemm(
|
||||
n: int,
|
||||
k: int,
|
||||
m_list: tuple[int],
|
||||
m_list: tuple[int, ...],
|
||||
use_bias: bool,
|
||||
use_stride: bool,
|
||||
dtype: torch.dtype,
|
||||
|
@ -524,14 +524,14 @@ def make_backend(backend_name: str) -> AttentionBackend:
|
||||
|
||||
* Backend instance
|
||||
'''
|
||||
if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
|
||||
if backend_name == STR_XFORMERS_ATTN_VAL:
|
||||
from vllm.v1.attention.backends.xformers import (
|
||||
XFormersAttentionBackend)
|
||||
return XFormersAttentionBackend()
|
||||
if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
|
||||
if backend_name == STR_FLASH_ATTN_VAL:
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
return FlashAttentionBackend()
|
||||
if backend_name == "TRITON_ATTN_VLLM_V1":
|
||||
if backend_name == "TRITON_ATTN":
|
||||
from vllm.v1.attention.backends.triton_attn import (
|
||||
TritonAttentionBackend)
|
||||
return TritonAttentionBackend()
|
||||
@ -539,7 +539,7 @@ def make_backend(backend_name: str) -> AttentionBackend:
|
||||
from vllm.v1.attention.backends.flex_attention import (
|
||||
FlexAttentionBackend)
|
||||
return FlexAttentionBackend()
|
||||
if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
|
||||
if backend_name == "TORCH_SDPA":
|
||||
from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
|
||||
return TorchSDPABackend()
|
||||
if backend_name == "FLASHINFER":
|
||||
|
@ -91,8 +91,7 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
|
||||
@pytest.mark.parametrize("enable_lora", [False, True])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||
llama_3p2_1b_files,
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
llama_3p2_1b_files):
|
||||
if num_gpus_available < tp_size:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
|
||||
from vllm.model_executor.layers.layernorm import (RMSNorm,
|
||||
dispatch_rocm_rmsnorm_func,
|
||||
fused_add_rms_norm, rms_norm)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
|
||||
@ -109,6 +111,34 @@ def test_enabled_ops_invalid(env: str):
|
||||
RMSNorm(1024).enabled()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_rocm() or not current_platform.is_fp8_fnuz(),
|
||||
reason="AITER is a feature exclusive for ROCm and FP8_FNUZ")
|
||||
@pytest.mark.parametrize("use_cutlass", [True, False])
|
||||
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
|
||||
@pytest.mark.parametrize("use_rocm_aiter_gemm_w8a8_blockscale", ["0", "1"])
|
||||
def test_w8a8_blockscale_dispatch(use_cutlass: bool, use_rocm_aiter: str,
|
||||
use_rocm_aiter_gemm_w8a8_blockscale: str,
|
||||
monkeypatch):
|
||||
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER_LINEAR",
|
||||
use_rocm_aiter_gemm_w8a8_blockscale)
|
||||
|
||||
use_aiter_and_is_supported = (bool(int(use_rocm_aiter)) and bool(
|
||||
int(use_rocm_aiter_gemm_w8a8_blockscale)))
|
||||
block_scale_func = dispatch_w8a8_blockscale_func(
|
||||
use_cutlass, use_aiter_and_is_supported=use_aiter_and_is_supported)
|
||||
if use_cutlass:
|
||||
assert block_scale_func == cutlass_scaled_mm
|
||||
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
|
||||
use_rocm_aiter_gemm_w8a8_blockscale):
|
||||
assert block_scale_func == (
|
||||
torch.ops.vllm.rocm_aiter_gemm_w8a8_blockscale)
|
||||
else:
|
||||
assert block_scale_func == w8a8_block_fp8_matmul
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
|
||||
def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
|
||||
|
132 tests/models/multimodal/generation/test_qwen2_5_vl.py Normal file
@ -0,0 +1,132 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
|
||||
from ....conftest import VIDEO_ASSETS
|
||||
|
||||
models = ["Qwen/Qwen2.5-VL-3B-Instruct"]
|
||||
target_dtype = "bfloat16"
|
||||
|
||||
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
|
||||
def qwen2_5_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
prompts = [VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
|
||||
# Basic validation that we got a response
|
||||
assert len(outputs) == 1
|
||||
output_ids, output_text = outputs[0]
|
||||
|
||||
# Ensure we got some output
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("video_pruning_rate", [0.0, 0.75])
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
This test validates that:
|
||||
1. The model handles batched video inputs correctly with EVS
|
||||
2. Both pruning configurations work with multiple videos
|
||||
3. The model doesn't crash when processing multiple videos simultaneously
|
||||
"""
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
# Test batched videos
|
||||
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0],
|
||||
sampled_vids[0]] # Use same video twice for testing
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
|
||||
# Basic validation that we got responses for both videos
|
||||
assert len(outputs) == 2
|
||||
|
||||
for output_ids, output_text in outputs:
|
||||
# Ensure we got some output for each video
|
||||
assert len(output_ids) > 0
|
||||
assert len(output_text) > 0
|
||||
|
||||
# Ensure the output is a string
|
||||
assert isinstance(output_text, str)
|
@ -101,7 +101,7 @@ class VLMTestInfo(NamedTuple):
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
|
||||
torch.Tensor]] = None
|
||||
list[torch.Tensor]]] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
@ -137,12 +137,12 @@ class VLMTestInfo(NamedTuple):
|
||||
# Default expandable params per test; these defaults can be overridden in
|
||||
# instances of this object; the complete set of test cases for the model
|
||||
# is all combinations of .models + all fields below
|
||||
max_tokens: Union[int, tuple[int]] = 128
|
||||
num_logprobs: Union[int, tuple[int]] = 5
|
||||
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
|
||||
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
||||
max_tokens: int = 128
|
||||
num_logprobs: int = 5
|
||||
dtype: str = "auto"
|
||||
distributed_executor_backend: Optional[str] = None
|
||||
# Only expanded in video tests
|
||||
num_video_frames: Union[int, tuple[int]] = 16
|
||||
num_video_frames: int = 16
|
||||
|
||||
# Fixed image sizes / image size factors; most tests use image_size_factors
|
||||
# The values provided for these two fields will be stacked and expanded
|
||||
|
@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
|
||||
MM_DATA_PATCHES = {
|
||||
# GLM4.1V and Qwen3-VL requires video metadata to be included in the input
|
||||
"glm4v": glm4_1v_patch_mm_data,
|
||||
"glm4v_moe": glm4_1v_patch_mm_data,
|
||||
"qwen3_vl": qwen3_vl_patch_mm_data,
|
||||
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
|
||||
}
|
||||
|
@ -19,6 +19,8 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
|
||||
supports_multimodal)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
InputProcessingContext)
|
||||
@ -88,6 +90,7 @@ def resize_mm_data(
|
||||
|
||||
|
||||
def create_batched_mm_kwargs(
|
||||
model_cls: type[SupportsMultiModal],
|
||||
model_config: ModelConfig,
|
||||
processor: BaseMultiModalProcessor,
|
||||
size_factors: tuple[float, ...] = (1.0, 0.5, 0.25),
|
||||
@ -127,16 +130,22 @@ def create_batched_mm_kwargs(
|
||||
mm_data=resized_mm_data,
|
||||
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=processor_inputs.tokenization_kwargs,
|
||||
)["mm_kwargs"]
|
||||
)["mm_kwargs"].require_data()
|
||||
items = [
|
||||
item for modality in supported_mm_limits
|
||||
for item in mm_kwargs[modality]
|
||||
]
|
||||
return group_mm_kwargs_by_modality(items)
|
||||
return group_mm_kwargs_by_modality(
|
||||
items,
|
||||
merge_by_field_config=model_cls.merge_by_field_config,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
|
||||
def initialize_dummy_model(
|
||||
model_cls: type[nn.Module],
|
||||
model_config: ModelConfig,
|
||||
):
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
@ -198,8 +207,12 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
hf_overrides=hf_overrides_fn,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype)
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
assert supports_multimodal(model_cls)
|
||||
|
||||
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
|
||||
inputs_parse_methods = []
|
||||
@ -228,7 +241,7 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
|
||||
with initialize_dummy_model(model_cls, model_config) as model:
|
||||
for modality, _, mm_kwargs in create_batched_mm_kwargs(
|
||||
model_config, processor):
|
||||
model_cls, model_config, processor):
|
||||
for method_name in inputs_parse_methods:
|
||||
print(f"Testing `{method_name}` with modality={modality} "
|
||||
f"and mm_kwargs{list(mm_kwargs.keys())}")
|
||||
|
@ -196,6 +196,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501
trust_remote_code=True),
"CwmForCausalLM": _HfExamplesInfo("facebook/cwm", # noqa: E501
trust_remote_code=True,
is_available_online=False),
"DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
"DeciLMForCausalLM": _HfExamplesInfo("nvidia/Llama-3_3-Nemotron-Super-49B-v1", # noqa: E501
trust_remote_code=True),
@ -273,6 +276,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
is_available_online=False),
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
is_available_online=False),
"LongcatFlashForCausalLM": _HfExamplesInfo
("meituan-longcat/LongCat-Flash-Chat", trust_remote_code=True),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
min_transformers_version="4.55.3",
@ -526,7 +531,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
trust_remote_code=True),
"NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy",
"NemotronH_Nano_VL_V2": _HfExamplesInfo("nano_vl_dummy",
is_available_online=False,
trust_remote_code=True),
"Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
@ -639,6 +644,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model="zai-org/GLM-4.5",
min_transformers_version="4.54",
is_available_online=False),
"LongCatFlashMTPModel": _HfExamplesInfo(
"meituan-longcat/LongCat-Flash-Chat",
trust_remote_code=True,
speculative_model="meituan-longcat/LongCat-Flash-Chat"),
"MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True,
speculative_model="XiaomiMiMo/MiMo-7B-RL"),
@ -84,7 +84,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM(
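For reference, a minimal sketch (not taken from the repository) of how a test can pin the attention backend through this environment variable before building an engine; the model name is only a placeholder:

import pytest
from vllm import LLM

def run_with_backend(model_id: str, backend: str, monkeypatch: pytest.MonkeyPatch):
    # Scope the env var change so other tests are not affected.
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", backend)  # e.g. "TRITON_ATTN"
        llm = LLM(model=model_id, enforce_eager=True)  # enforce_eager keeps the run cheap
        return llm.generate("Hello")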
@ -428,9 +428,8 @@ def dummy_hf_overrides(
num_hidden_layers = (3 if model_arch
== "Gemma3nForConditionalGeneration" else 1)

text_config.update({
update_dict = {
"num_layers": num_layers,
"num_hidden_layers": num_hidden_layers,
"num_experts": num_experts,
"num_experts_per_tok": 2,
"num_local_experts": num_experts,
@ -440,7 +439,14 @@ def dummy_hf_overrides(
"n_routed_experts": num_experts,
# For Gemma-3n
"num_kv_shared_layers": 1,
})
}

# Update num_hidden_layers for non-Longcat architectures
if model_arch != "LongcatFlashForCausalLM" \
and model_arch != "LongCatFlashMTPModel":
update_dict["num_hidden_layers"] = num_hidden_layers

text_config.update(update_dict)

if hasattr(hf_config, "vision_config"):
hf_config.vision_config.update({
@ -5,7 +5,6 @@ import base64
import mimetypes
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import TYPE_CHECKING, NamedTuple

import numpy as np
import pytest
@ -15,9 +14,6 @@ from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions

if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalPlaceholderDict

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
@ -218,18 +214,13 @@ async def test_fetch_video_http_with_dynamic_loader(
assert metadata_sync["video_backend"] == "opencv_dynamic"

# Used for `test_argsort_mm_positions`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"
expected_modality_idxs: list[tuple[str, int]]

def test_argsort_mm_positions():

test_cases = [
# yapf: disable
@pytest.mark.parametrize(
"case",
[
# Single modality
## Internally sorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
@ -242,7 +233,7 @@ def test_argsort_mm_positions():
],
),
## Internally unsorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=3, length=2),
@ -257,7 +248,7 @@ def test_argsort_mm_positions():

# Two modalities
## Internally sorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=7, length=4),
@ -276,7 +267,7 @@ def test_argsort_mm_positions():
],
),
## Interleaved, internally sorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=4),
@ -295,7 +286,7 @@ def test_argsort_mm_positions():
],
),
## Interleaved, internally unsorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=8, length=2),
@ -316,7 +307,7 @@ def test_argsort_mm_positions():

# Three modalities
## Internally sorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=15, length=7),
@ -341,7 +332,7 @@ def test_argsort_mm_positions():
],
),
## Interleaved, internally sorted
TestCase(
dict(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
@ -363,8 +354,8 @@ def test_argsort_mm_positions():
("image", 2),
],
),
## Interleaved, internally sunorted
TestCase(
## Interleaved, internally unsorted
dict(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
@ -386,9 +377,13 @@ def test_argsort_mm_positions():
("image", 1),
],
),
]
],
)
# yapf: enable
def test_argsort_mm_positions(case):
mm_positions = case["mm_positions"]
expected_modality_idxs = case["expected_modality_idxs"]

for mm_positions, expected_modality_idxs in test_cases:
modality_idxs = argsort_mm_positions(mm_positions)
modality_idxs = argsort_mm_positions(mm_positions)

assert modality_idxs == expected_modality_idxs
assert modality_idxs == expected_modality_idxs
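The hunk above replaces one test that looped over test_cases with a parametrized test, so each case is reported separately. A small, hypothetical illustration of the same refactor pattern (names below are invented for the example):

import pytest

CASES = [
    ("abc", 3),
    ("de", 2),
]

@pytest.mark.parametrize("case", CASES)
def test_length(case):
    # One pytest test per case instead of one loop over all cases.
    text, expected_len = case
    assert len(text) == expected_len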
@ -18,9 +18,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@ -745,35 +742,3 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
perplexity = llm.generate_prompt_perplexity([prompt])[0]
print(perplexity)
assert perplexity <= exp_perplexity

def test_compressed_tensors_fp8_block_enabled(vllm_runner):
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
with vllm_runner(model_path) as llm:

fp8_dtype = current_platform.fp8_dtype()

def check_model(model):
layer = model.model.layers[0]

qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
W8A8BlockFp8LinearOp)

assert qkv_proj.weight.dtype is fp8_dtype
assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight.shape) == 2
assert len(qkv_proj.weight_scale.shape) == 2

input_quant_op = \
qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
assert isinstance(input_quant_op, QuantFP8)
assert input_quant_op._forward_method == input_quant_op.forward_cuda

llm.apply_model(check_model)

output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
@ -97,7 +97,6 @@ def test_auto_task(model_id, expected_runner_type, expected_convert_type,

assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks

# Can remove once --task option is fully deprecated
@ -120,7 +119,6 @@ def test_score_task(model_id, expected_runner_type, expected_convert_type,

assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks

# Can remove once --task option is fully deprecated
@ -137,7 +135,6 @@ def test_transcription_task(model_id, expected_runner_type,

assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks

@pytest.mark.parametrize(
@ -96,7 +96,7 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s

# Test the select_experts method
topk_weights, topk_ids = FusedMoE.select_experts(
topk_weights, topk_ids, _ = FusedMoE.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
top_k=top_k,
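The call site above now unpacks a third return value from FusedMoE.select_experts and discards it. A self-contained sketch of the same top-k routing and unpacking pattern (the helper below is illustrative only, not vLLM's implementation):

import torch

def select_experts_sketch(router_logits: torch.Tensor, top_k: int):
    # Softmax over experts, then keep the k highest-probability experts per token.
    weights, ids = torch.topk(torch.softmax(router_logits, dim=-1), top_k, dim=-1)
    extra = None  # stands in for whatever the real third return value carries
    return weights, ids, extra

logits = torch.randn(4, 8)  # 4 tokens, 8 experts
topk_weights, topk_ids, _ = select_experts_sketch(logits, top_k=2)
assert topk_ids.shape == (4, 2)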
@ -1,61 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import LLM, envs
from vllm.sampling_params import SamplingParams

if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)

@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
# TODO TPU will appear busy if we fan-out test params here
@pytest.mark.parametrize("n_prompts", [1])
def test_logprobs(model_name: str, n_prompts: int):
"""
Request top logprobs with different sampling settings and check
that results contains the requested number, ordered ascendingly.
"""

def check_num_logprobs(logprobs, expected_num: int):
for step in logprobs:
prev_logp = 1.0
# order by rank
sorted_step = dict(
sorted(step.items(), key=lambda item: item[1].rank))

if len(step) != expected_num:
print("watch out", sorted_step)

# check results are ordered by prob value
# assert len(step) == expected_num
for rankno, (tid, logp) in enumerate(sorted_step.items()):
assert logp.logprob <= prev_logp
prev_logp = logp.logprob
assert logp.rank == rankno + 1

llm = LLM(model_name,
enforce_eager=False,
max_num_seqs=1,
max_model_len=128,
max_num_batched_tokens=128)
prompts = [
"Write a short story about a robot that dreams for the first time."
] * n_prompts
greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64,\
logprobs=4)
regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
logprobs=4)
topkp_sampling_params = SamplingParams(temperature=0.4, max_tokens=64,\
logprobs=4, top_k=12, top_p=0.5)

for sp in [greedy_sampling_params, regular_sampling_params, \
topkp_sampling_params]:
output = llm.generate(prompts, sp)
for o in output:
check_num_logprobs(o.outputs[0].logprobs, 4)
@ -69,6 +69,8 @@ def test_triton_placeholder_language():
assert lang.constexpr is None
assert lang.dtype is None
assert lang.int64 is None
assert lang.int32 is None
assert lang.tensor is None

def test_triton_placeholder_language_from_parent():
@ -1131,14 +1131,14 @@ def has_module_attribute(module_name, attribute_name):

def get_attn_backend_list_based_on_platform() -> list[str]:
if current_platform.is_cuda():
return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"]
return ["FLASH_ATTN", "TRITON_ATTN", "TREE_ATTN"]
elif current_platform.is_rocm():
attn_backend_list = ["TRITON_ATTN_VLLM_V1"]
attn_backend_list = ["TRITON_ATTN"]
try:
import aiter # noqa: F401
attn_backend_list.append("FLASH_ATTN_VLLM_V1")
attn_backend_list.append("FLASH_ATTN")
except Exception:
print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed")
print("Skip FLASH_ATTN on ROCm as aiter is not installed")

return attn_backend_list
else:
@ -23,15 +23,16 @@ from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import (
convert_ids_list_to_tokens)
from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache,
MemorySnapshot, PlaceholderModule, StoreBoolean,
bind_kv_cache, common_broadcastable_dtype,
current_stream, deprecate_kwargs, get_open_port,
get_tcp_uri, is_lossless_cast, join_host_port,
make_zmq_path, make_zmq_socket, memory_profiling,
merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values)

# isort: off
from vllm.utils import (
CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
memory_profiling, merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
# isort: on
from ..utils import create_new_process_for_each_test, error_on_warning

@ -1032,3 +1033,15 @@ def test_load_config_file(tmp_path):
# Assert that the processed arguments match the expected output
assert processed_args == expected_args
os.remove(str(config_file_path))

def test_unique_filepath():
temp_dir = tempfile.mkdtemp()
path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
paths = set()
for i in range(10):
path = unique_filepath(path_fn)
path.write_text("test")
paths.add(path)
assert len(paths) == 10
assert len(list(Path(temp_dir).glob("*.txt"))) == 10
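A hypothetical sketch of what a unique_filepath helper consistent with this test could look like; the real implementation lives in vllm.utils and may differ:

from pathlib import Path
from typing import Callable

def unique_filepath_sketch(fn: Callable[[int], Path]) -> Path:
    # Probe increasing indices until fn(i) names a path that does not exist yet.
    i = 0
    while True:
        path = fn(i)
        if not path.exists():
            return path
        i += 1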
@ -21,16 +21,15 @@ from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
from vllm.v1.kv_cache_interface import FullAttentionSpec

BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
_Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN,
"FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
]

# Remove flashinfer from the list if it's not available
try:
import flashinfer # noqa: F401
except ImportError:
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)

def _convert_dtype_to_torch(dtype):
@ -214,7 +213,7 @@ def run_attention_backend(
builder_cls, impl_cls = get_attention_backend(actual_backend)

# Mock flashinfer's get_per_layer_parameters if needed
if actual_backend == _Backend.FLASHINFER_VLLM_V1:
if actual_backend == _Backend.FLASHINFER:
import unittest.mock

from vllm.v1.attention.backends.utils import PerLayerParameters
@ -434,7 +433,7 @@ def _test_backend_correctness(
# [num_blocks, 2, block_size, num_kv_heads, head_size]
# Select the appropriate KV cache format for each backend
kv_cache_for_backend = kv_cache
if backend_name == _Backend.FLASHINFER_VLLM_V1:
if backend_name == _Backend.FLASHINFER:
kv_cache_for_backend = kv_cache.transpose(0, 1)

# For FlashInfer default to HND layout and
@ -518,8 +517,8 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str):

SLIDING_WINDOW_BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
"FLEX_ATTENTION_SLOW"
]
@ -15,8 +15,8 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec

BACKENDS_TO_TEST = [
_Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA_VLLM_V1
_Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA
]

# Remove CUTLASS_MLA from the list if not using sm100
@ -120,30 +120,30 @@ def get_attention_backend(backend_name: _Backend):
Tuple of (backend_builder_class, backend_impl_class)
"""
backend_map = {
_Backend.FLASH_ATTN_VLLM_V1:
_Backend.FLASH_ATTN:
("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
if current_platform.is_cuda() else
"vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
),
_Backend.FLASHINFER_VLLM_V1:
_Backend.FLASHINFER:
"vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION:
"vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
_Backend.TRITON_ATTN_VLLM_V1:
_Backend.TRITON_ATTN:
"vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.TREE_ATTN:
"vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS_VLLM_V1:
_Backend.XFORMERS:
"vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
_Backend.CUTLASS_MLA:
"vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
_Backend.FLASHMLA_VLLM_V1:
_Backend.FLASHMLA:
"vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA:
"vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
_Backend.FLASHINFER_MLA:
"vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
_Backend.TRITON_MLA_VLLM_V1:
_Backend.TRITON_MLA:
"vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
}
@ -89,7 +89,7 @@ backend_configs = {
# Triton Attention
"TritonAttn":
BackendConfig(name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
}),
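A sketch of how a BackendConfig entry like the one above might be applied in practice, assuming LLM accepts a compilation_config mapping in the same way the -O.cudagraph_mode CLI flag does; the model name is only a placeholder:

import os
from vllm import LLM

# The env var selects the attention backend; the compilation config sets the
# CUDA graph mode for the compiled model.
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN"
llm = LLM(
    model="facebook/opt-125m",
    compilation_config={"cudagraph_mode": "FULL_AND_PIECEWISE"},
)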
@ -9,11 +9,14 @@ from ...utils import create_new_process_for_each_test

@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend",
["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

if attn_backend == "FLASHINFER":
pytest.skip("This test is failing with FlashInfer backend and "
"needs investigation. See issue #25679.")

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
@ -176,12 +176,11 @@ def test_eagle_correctness(
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")

if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")

method, model_name, spec_model_name, tp_size = model_setup
@ -12,6 +12,7 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
STOP_STRINGS,
DummyOutputProcessorTestVectors,
MockEngineCore)
from vllm import PoolingParams
from vllm.logprobs import PromptLogprobs, SampleLogprobs
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
@ -998,3 +999,35 @@ async def test_cumulative_output_collector_n():
third = [k for k in result.outputs if k.index == 2]
assert len(third) == 1
assert third[0].text == "c"

@pytest.mark.parametrize("runner", ["generate", "pooling"])
def test_abort_requests(runner: str, dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer,
log_stats=True)
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
arrival_time=0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
sampling_params=SamplingParams() if runner == "generate" else None,
pooling_params=PoolingParams(
task="embed") if runner == "pooling" else None,
) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]

for request in requests:
if runner == "generate":
output_kind = request.sampling_params.output_kind
else:
output_kind = request.pooling_params.output_kind
queue = RequestOutputCollector(output_kind=output_kind)
output_processor.add_request(request, None, queue=queue)

for request in requests:
output_processor.abort_requests([request.request_id])
@ -26,6 +26,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
NixlConnectorWorker, NixlKVConnectorStats)
from vllm.distributed.kv_transfer.kv_transfer_state import (
ensure_kv_transfer_shutdown, has_kv_transfer_group)
from vllm.forward_context import ForwardContext
from vllm.platforms.interface import Platform
from vllm.sampling_params import SamplingParams
@ -35,6 +37,26 @@ from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
from .utils import create_request, create_scheduler, create_vllm_config

@pytest.fixture(scope="module", autouse=True)
def clear_kv_transfer():
"""
The test cases in this file use `VLLM_ENABLE_V1_MULTIPROCESSING=0`,
causing the global variable `_KV_CONNECTOR_AGENT`
to be assigned but never deleted.

Since the current pytest process does not terminate and instead
continues running tests from other files,
this global variable remains in memory and interferes
with test cases in other modules.

So we use this fixture to ensure that the global variable
`_KV_CONNECTOR_AGENT` is properly cleaned up after each test.
"""
yield
if has_kv_transfer_group():
ensure_kv_transfer_shutdown()

class FakeNixlWrapper:
"""Mock implementation of NixlWrapper for testing.
@ -8,7 +8,8 @@ import ray
from vllm.config import ModelDType
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
from vllm.v1.metrics.ray_wrappers import (RayPrometheusMetric,
RayPrometheusStatLogger)

@pytest.fixture(scope="function", autouse=True)
@ -65,3 +66,39 @@ def test_engine_log_metrics_ray(
# Create the actor and call the async method
actor = EngineTestActor.remote() # type: ignore[attr-defined]
ray.get(actor.run.remote())

def test_sanitized_opentelemetry_name():
"""Test the metric name sanitization logic for Ray."""

# Only a-z, A-Z, 0-9, _, test valid characters are preserved
valid_name = "valid_metric_123_abcDEF"
assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
valid_name) == valid_name

# Test dash, dot, are replaced
name_with_dash_dot = "metric-name.test"
expected = "metric_name_test"
assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
name_with_dash_dot) == expected

# Test colon is replaced with underscore
name_with_colon = "metric:name"
expected = "metric_name"
assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
name_with_colon) == expected

# Test multiple invalid characters are replaced
name_with_invalid = "metric:name@with#special%chars"
expected = "metric_name_with_special_chars"
assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
name_with_invalid) == expected

# Test mixed valid and invalid characters
complex_name = "vllm:engine_stats/time.latency_ms-99p"
expected = "vllm_engine_stats_time_latency_ms_99p"
assert RayPrometheusMetric._get_sanitized_opentelemetry_name(
complex_name) == expected

# Test empty string
assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == ""
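The assertions above imply a simple sanitization rule: every character outside [a-zA-Z0-9_] becomes an underscore. A standalone sketch of that rule (the real helper is RayPrometheusMetric._get_sanitized_opentelemetry_name, whose implementation may differ):

import re

def sanitize_metric_name_sketch(name: str) -> str:
    # Replace any character that is not a letter, digit, or underscore.
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)

assert sanitize_metric_name_sketch("vllm:engine_stats/time.latency_ms-99p") == \
    "vllm_engine_stats_time_latency_ms_99p"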
@ -72,8 +72,10 @@ def _create_allowed_token_ids(

def _create_bad_words_token_ids(
batch_size: int, vocab_size: int,
bad_words_lengths: list[tuple[int]]) -> dict[int, list[list[int]]]:
batch_size: int,
vocab_size: int,
bad_words_lengths: tuple[int, ...],
) -> dict[int, list[list[int]]]:
bad_words_token_ids = {}
for batch_idx in range(batch_size):
token_ids_single_batch = []
@ -402,7 +404,7 @@ def test_sampler_allowed_token_ids(device: str, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("bad_words_lengths", [(1, ), (1, 3), (2, 2)])
def test_sampler_bad_words(device: str, batch_size: int,
bad_words_lengths: list[tuple[int]]):
bad_words_lengths: tuple[int, ...]):
"""
Test to verify that when the bad words restriction is present, tokens
are penalized based on their match with the bad words.
@ -30,7 +30,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(
method: str,
num_speculative_tokens: int,
speculative_token_tree: Optional[list[tuple[int]]] = None,
speculative_token_tree: Optional[list[tuple[int, ...]]] = None,
) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
runner="generate",
@ -314,12 +314,11 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,

monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")

if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

# Setup draft model mock
@ -400,16 +399,15 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):

monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")

if (attn_backend == "TREE_ATTN"):
pytest.skip("TREE_ATTN is tested separately in test_propose_tree"
"because it requires special input mocking.")

if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

# Use GPU device
@ -510,12 +508,12 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
device=device)
sampling_metadata = mock.MagicMock()

if attn_backend == "FLASH_ATTN_VLLM_V1":
if attn_backend == "FLASH_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.FLASH_ATTN_VLLM_V1)
elif attn_backend == "TRITON_ATTN_VLLM_V1":
_Backend.FLASH_ATTN)
elif attn_backend == "TRITON_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TRITON_ATTN_VLLM_V1)
_Backend.TRITON_ATTN)
elif attn_backend == "TREE_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TREE_ATTN)
@ -41,12 +41,11 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch,
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")

if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")

llm = LLM(
@ -9,11 +9,13 @@ from vllm.v1.spec_decode.ngram_proposer import (

def test_find_longest_matched_ngram_and_propose_tokens():
tokens = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
assert _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
min_ngram=2,
max_ngram=2,
max_model_len=1024,
k=2) is None
result = _find_longest_matched_ngram_and_propose_tokens(
origin_tokens=tokens,
min_ngram=2,
max_ngram=2,
max_model_len=1024,
k=2)
assert len(result) == 0

tokens = np.array([1, 2, 3, 4, 1, 2, 3])
np.testing.assert_array_equal(
@ -62,7 +64,7 @@ def test_find_longest_matched_ngram_and_propose_tokens():

def test_ngram_proposer():

def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
# Dummy model config. Just to set max_model_len.
model_config = ModelConfig(model="facebook/opt-125m")
return NgramProposer(
@ -75,36 +77,120 @@ def test_ngram_proposer():
)))

# No match.
result = ngram_proposer(
min_n=2, max_n=2,
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
assert result is None
token_ids_cpu = np.array([[1, 2, 3, 4, 5]])
result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert len(result[0]) == 0

# No match for 4-gram.
result = ngram_proposer(
min_n=4, max_n=4,
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
assert result is None
token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
result = get_ngram_proposer(min_n=4, max_n=4, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert len(result[0]) == 0

# No match for 4-gram but match for 3-gram.
result = ngram_proposer(
min_n=3, max_n=4,
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
assert np.array_equal(result, np.array([4, 1]))
token_ids_cpu = np.array([[1, 2, 3, 4, 1, 2, 3]])
result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert np.array_equal(result, np.array([[4, 1]]))

# Match for both 4-gram and 3-gram.
# In this case, the proposer should return the 4-gram match.
result = ngram_proposer(min_n=3, max_n=4, k=2).propose(
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
token_ids_cpu = np.array([[2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]])
result = get_ngram_proposer(min_n=3, max_n=4, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert np.array_equal(result, np.array([[1, 2]])) # Not [5, 1]]

# Match for 2-gram and 3-gram, but not 4-gram.
result = ngram_proposer(min_n=2, max_n=4, k=2).propose(
context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
token_ids_cpu = np.array([[3, 4, 5, 2, 3, 4, 1, 2, 3, 4]])
result = get_ngram_proposer(min_n=2, max_n=4, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert np.array_equal(result, np.array([[1, 2]])) # Not [5, 2]]

# Multiple 3-gram matched, but always pick the first one.
result = ngram_proposer(
min_n=3, max_n=3, k=2).propose(context_token_ids=np.array(
[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]))
assert np.array_equal(result, np.array([100, 1]))
token_ids_cpu = np.array(
[[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]])
result = get_ngram_proposer(min_n=3, max_n=3, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert np.array_equal(result, np.array([[100, 1]]))

# check empty input
token_ids_cpu = np.array([[]])
result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
sampled_token_ids=[[0]],
req_ids=["0"],
num_tokens_no_spec=np.array([len(c) for c in token_ids_cpu]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert len(result[0]) == 0

# check multibatch input
# first request has 5 tokens and a match
# second request has 3 tokens and no match. Padded with -1 for max len 5
token_ids_cpu = np.array([[1, 2, 3, 1, 2], [4, 5, 6, -1, -1]])
result = get_ngram_proposer(min_n=2, max_n=2, k=2).propose(
sampled_token_ids=[[0], [1]],
req_ids=["0", "1"],
num_tokens_no_spec=np.array([5, 3]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert len(result[0]) == 2
assert np.array_equal(result[0], np.array([3, 1]))
assert np.array_equal(result[1], np.array([]))

# test if 0 threads available: can happen if TP size > CPU count
ngram_proposer = get_ngram_proposer(min_n=2, max_n=2, k=2)
ngram_proposer.num_numba_thread_available = 0
# set max_model_len to 2 * threshold to ensure multithread is used
num_tokens_threshold = ngram_proposer.num_tokens_threshold
ngram_proposer.max_model_len = 2 * num_tokens_threshold
# using multibatch test
middle_integer = num_tokens_threshold // 2
input_1 = [_ for _ in range(num_tokens_threshold)]
input_1 += [middle_integer, middle_integer + 1]
input_2 = [-1] * len(input_1)
input_2[:3] = [4, 5, 6]
token_ids_cpu = np.array([input_1, input_2])
result = ngram_proposer.propose(
sampled_token_ids=[[0], [1]],
req_ids=["0", "1"],
num_tokens_no_spec=np.array([len(input_1), 3]),
token_ids_cpu=token_ids_cpu,
spec_decode_unsupported_reqs=(),
)
assert len(result[0]) == 2
assert np.array_equal(result[0],
np.array([middle_integer + 2, middle_integer + 3]))
assert np.array_equal(result[1], np.array([]))
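A pure-Python illustration of the matching rule these assertions exercise, for one fixed n-gram size only (the real proposer is numba-accelerated and also prefers longer n-grams, from max_ngram down to min_ngram):

def propose_from_ngram_sketch(tokens: list[int], n: int, k: int) -> list[int]:
    # Find the earliest previous occurrence of the trailing n-gram and propose
    # the (up to) k tokens that followed it.
    if len(tokens) <= n:
        return []
    suffix = tokens[-n:]
    for start in range(0, len(tokens) - n):
        if tokens[start:start + n] == suffix:
            return tokens[start + n:start + n + k]
    return []

# Mirrors the single-request 3-gram case above: [1, 2, 3] last occurred at the
# start, so the two tokens that followed it, [4, 1], are proposed.
assert propose_from_ngram_sketch([1, 2, 3, 4, 1, 2, 3], n=3, k=2) == [4, 1]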
@ -278,7 +278,7 @@ def test_tree_attn_correctness() -> None:
block_table=block_table,
slot_mapping=branch_slot_mapping,
seqlen_k=sequence_position + q_len,
backend=_Backend.FLASH_ATTN_VLLM_V1,
backend=_Backend.FLASH_ATTN,
).view(batch_size, -1, num_heads, dim_per_head)

# Compare the outputs.
@ -1,17 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import Mock

import torch

from vllm.v1.attention.backends.flash_attn import (
FlashAttentionBackend, FlashAttentionMetadataBuilder)
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionBackend, FlexAttentionMetadataBuilder)
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import (AttentionGroup,
initialize_kv_cache_for_kv_sharing)
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups

def new_kv_cache_spec():
@ -37,56 +30,17 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
new_kv_cache_spec()),
]

attn_groups = [
# KV cache group 0 has two attention groups
[
AttentionGroup(
backend=FlashAttentionBackend,
metadata_builder=Mock(spec=FlashAttentionMetadataBuilder),
layer_names=["model.layers.0"],
),
AttentionGroup(
backend=FlexAttentionBackend,
metadata_builder=Mock(spec=FlexAttentionMetadataBuilder),
layer_names=["model.layers.1"],
),
],
]

# Only layers 0 and 1 will have KV caches allocated
kv_caches = {
"model.layers.0": torch.zeros(1, 2, 3),
"model.layers.1": torch.ones(1, 2, 3),
}

initialize_kv_cache_for_kv_sharing(
add_kv_sharing_layers_to_kv_cache_groups(
shared_kv_cache_layers=shared_kv_cache_layers,
kv_cache_groups=kv_cache_groups,
kv_caches=kv_caches,
attn_groups=attn_groups,
)

# Check that the KV caches were shared correctly
assert kv_caches["model.layers.2"].data_ptr(
) == kv_caches["model.layers.0"].data_ptr()
assert kv_caches["model.layers.3"].data_ptr(
) == kv_caches["model.layers.1"].data_ptr()

# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 1
assert kv_cache_groups[0].layer_names == [
"model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
]

# Check that the layers were added to the attention groups
assert len(attn_groups) == 1 and len(attn_groups[0]) == 2
assert attn_groups[0][0].layer_names == [
"model.layers.0", "model.layers.2"
]
assert attn_groups[0][1].layer_names == [
"model.layers.1", "model.layers.3"
]

def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
"""
@ -103,48 +57,17 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
new_kv_cache_spec()),
]

attn_groups = [
# KV cache group 0 has a single attention group
# as all layers have the same flash attention backend
[
AttentionGroup(
backend=FlashAttentionBackend,
metadata_builder=Mock(spec=FlashAttentionMetadataBuilder),
layer_names=["model.layers.0", "model.layers.1"],
),
],
]

kv_caches = {
"model.layers.0": torch.zeros(1, 2, 3),
"model.layers.1": torch.ones(1, 2, 3),
}

initialize_kv_cache_for_kv_sharing(
add_kv_sharing_layers_to_kv_cache_groups(
shared_kv_cache_layers=shared_kv_cache_layers,
kv_cache_groups=kv_cache_groups,
kv_caches=kv_caches,
attn_groups=attn_groups,
)

# Check that the KV caches were shared correctly
assert kv_caches["model.layers.2"].data_ptr(
) == kv_caches["model.layers.0"].data_ptr()
assert kv_caches["model.layers.3"].data_ptr(
) == kv_caches["model.layers.1"].data_ptr()

# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 1
assert kv_cache_groups[0].layer_names == [
"model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
]

# Check that the layers were added to the attention groups
assert len(attn_groups) == 1 and len(attn_groups[0]) == 1
assert attn_groups[0][0].layer_names == [
"model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
]

def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
"""
@ -162,23 +85,11 @@ def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()),
]

kv_caches = {
"model.layers.0": torch.zeros(1, 2, 3),
"model.layers.1": torch.ones(1, 2, 3),
}

initialize_kv_cache_for_kv_sharing(
add_kv_sharing_layers_to_kv_cache_groups(
shared_kv_cache_layers=shared_kv_cache_layers,
kv_cache_groups=kv_cache_groups,
kv_caches=kv_caches,
)

# Check that the KV caches were shared correctly
assert kv_caches["model.layers.2"].data_ptr(
) == kv_caches["model.layers.0"].data_ptr()
assert kv_caches["model.layers.3"].data_ptr(
) == kv_caches["model.layers.1"].data_ptr()

# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 2
assert kv_cache_groups[0].layer_names == [
@ -54,26 +54,3 @@ def test_v1_llm_by_default(monkeypatch):
print(llm.generate("Hello my name is"))
assert hasattr(llm.llm_engine, "engine_core")
m.delenv("VLLM_USE_V1")

def test_v1_attn_backend(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")

# Fall back to V0.
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")

# Reject if V1.
m.setenv("VLLM_USE_V1", "1")
with pytest.raises(NotImplementedError):
AsyncEngineArgs(model=MODEL).create_engine_config()
m.delenv("VLLM_USE_V1")

m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
@ -2,9 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata,
AttentionMetadataBuilder,
AttentionState, AttentionType)
AttentionMetadata, AttentionType)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend

@ -13,7 +11,5 @@ __all__ = [
"AttentionBackend",
"AttentionMetadata",
"AttentionType",
"AttentionMetadataBuilder",
"AttentionState",
"get_attn_backend",
]
@ -2,10 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import ABC, abstractmethod
from contextlib import contextmanager
from dataclasses import dataclass, fields
from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple,
Type, TypeVar)
from typing import Generic, List, Optional, Protocol, Tuple, Type, TypeVar

import torch

@ -34,6 +31,14 @@ class AttentionBackend(ABC):
# makes sure the output tensor is allocated inside the cudagraph.
accept_output_buffer: bool = False

# Whether this backend supports receiving pre-quantized query input.
# If True, the attention layer will handle query quantization instead
# of the backend, allowing torch.compile to fuse quantization with
# previous operations.
# Needs to be worked through for all backends
# https://github.com/vllm-project/vllm/issues/25584
supports_quant_query_input: bool = False

@staticmethod
@abstractmethod
def get_name() -> str:
@ -49,18 +54,13 @@ class AttentionBackend(ABC):
def get_metadata_cls() -> Type["AttentionMetadata"]:
raise NotImplementedError

@staticmethod
@abstractmethod
def get_state_cls() -> Type["AttentionState"]:
raise NotImplementedError

@classmethod
def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata":
return cls.get_metadata_cls()(*args, **kwargs)

@staticmethod
@abstractmethod
def get_builder_cls() -> Type["AttentionMetadataBuilder"]:
def get_builder_cls(): # -> Type["AttentionMetadataBuilder"]:
raise NotImplementedError

@staticmethod
@ -77,149 +77,18 @@ class AttentionBackend(ABC):
def get_kv_cache_stride_order() -> Tuple[int, ...]:
raise NotImplementedError

@staticmethod
@abstractmethod
def swap_blocks(
src_kv_cache: torch.Tensor,
dst_kv_cache: torch.Tensor,
src_to_dst: torch.Tensor,
) -> None:
raise NotImplementedError

@staticmethod
@abstractmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dists: torch.Tensor,
) -> None:
raise NotImplementedError

@classmethod
def full_cls_name(cls) -> tuple[str, str]:
return (cls.__module__, cls.__qualname__)

@dataclass
class AttentionMetadata:
"""Attention metadata for prefill and decode batched together."""
# Total number of prefill requests.
num_prefills: int
# Number of prefill tokens.
num_prefill_tokens: int
# Number of decode tokens. Note that it is equivalent to the number of
# decode requests.
num_decode_tokens: int
# (num_tokens,). The indices of the token slots that input tokens will be
# stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
# is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
# in block 0, and 1st slot in block 1, respectively.
slot_mapping: torch.Tensor

# Enable/disable KV scales calculation. This is so that we can disable the
# calculation until after prefill and cuda graph capture.
enable_kv_scales_calculation: bool

@property
@abstractmethod
def prefill_metadata(self) -> Optional["AttentionMetadata"]:
"""Return the attention metadata that's required to run prefill
attention."""
pass

@property
@abstractmethod
def decode_metadata(self) -> Optional["AttentionMetadata"]:
"""Return the attention metadata that's required to run decode
attention."""
pass

def asdict_zerocopy(self,
skip_fields: Optional[Set[str]] = None
) -> Dict[str, Any]:
"""Similar to dataclasses.asdict, but avoids deepcopying."""
if skip_fields is None:
skip_fields = set()
# Note that if we add dataclasses as fields, they will need
# similar handling.
return {
field.name: getattr(self, field.name)
for field in fields(self) if field.name not in skip_fields
}
pass

T = TypeVar("T", bound=AttentionMetadata)

class AttentionState(ABC, Generic[T]):
"""Holds attention backend-specific objects reused during the
lifetime of the model runner."""

@abstractmethod
def __init__(self, runner: Any):
...

@abstractmethod
@contextmanager
def graph_capture(self, max_batch_size: int):
"""Context manager used when capturing CUDA graphs."""
yield

@abstractmethod
def graph_clone(self, batch_size: int) -> "AttentionState[T]":
"""Clone attention state to save in CUDA graph metadata."""
...

@abstractmethod
def graph_capture_get_metadata_for_batch(
self,
batch_size: int,
is_encoder_decoder_model: bool = False) -> T:
"""Get attention metadata for CUDA graph capture of batch_size."""
...

@abstractmethod
def get_graph_input_buffers(
self,
attn_metadata: T,
is_encoder_decoder_model: bool = False) -> Dict[str, Any]:
"""Get attention-specific input buffers for CUDA graph capture."""
...

@abstractmethod
def prepare_graph_input_buffers(
self,
input_buffers: Dict[str, Any],
attn_metadata: T,
is_encoder_decoder_model: bool = False) -> None:
"""In-place modify input buffers dict for CUDA graph replay."""
...

@abstractmethod
def begin_forward(self, model_input) -> None:
"""Prepare state for forward pass."""
...

class AttentionMetadataBuilder(ABC, Generic[T]):
"""Abstract class for attention metadata builders."""

@abstractmethod
def __init__(self, input_builder) -> None:
"""Create the builder, remember some configuration and parameters."""
raise NotImplementedError

@abstractmethod
def prepare(self) -> None:
"""Prepare for one batch."""
raise NotImplementedError

@abstractmethod
def build(self, seq_lens: List[int], query_lens: List[int],
cuda_graph_pad_size: int, batch_size: int) -> T:
"""Build attention metadata with on-device tensors."""
raise NotImplementedError

class AttentionLayer(Protocol):

_q_scale: torch.Tensor
@ -1,559 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Attention backend utils"""
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
|
||||
AttentionState)
|
||||
from vllm.attention.backends.abstract import AttentionType
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import async_tensor_h2d, make_tensor_with_pad

logger = init_logger(__name__)

PAD_SLOT_ID = -1

# Switch to numpy implementation of compute_slot_mapping
# if we have at least this many elements. Could be tuned further.
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256


def is_block_tables_empty(block_tables: Union[None, Dict]):
    """
    Check if block_tables is None or a dictionary with all None values.
    """
    if block_tables is None:
        return True
    return (isinstance(block_tables, dict)
            and all(value is None for value in block_tables.values()))


def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int,
                                   context_len: int, sliding_window: int):
    """
    Compute the start index of slot mapping.
    """
    start_idx = 0
    if is_prompt and sliding_window is not None:
        start_idx = max(0, query_len - sliding_window)
    return start_idx


def _compute_slot_mapping_python(slot_mapping: List[int],
                                 block_table: List[int], range_start: int,
                                 range_end: int, block_size: int):
    for i in range(range_start, range_end):
        block_number = block_table[i // block_size]
        block_offset = i % block_size
        slot = block_number * block_size + block_offset
        slot_mapping.append(slot)


def _compute_slot_mapping_numpy(slot_mapping: List[int],
                                block_table: List[int], range_start: int,
                                range_end: int, block_size: int):
    block_table_array = np.array(block_table)
    idx = np.arange(range_start, range_end)
    block_offset = idx % block_size
    idx //= block_size
    seq_slot_mapping_array = block_table_array[idx]
    seq_slot_mapping_array *= block_size
    seq_slot_mapping_array += block_offset
    slot_mapping.extend(seq_slot_mapping_array)


def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int],
                         seq_id: int, seq_len: int, context_len: int,
                         start_idx: int, block_size: int,
                         block_tables: Dict[int, List[int]]):
    """
    Compute slot mapping.
    """
    if is_profile_run:
        # During memory profiling, the block tables are not
        # initialized yet. In this case, we just use a dummy
        # slot mapping.
        # In embeddings, the block tables are {seq_id: None}.
        slot_mapping.extend([PAD_SLOT_ID] * seq_len)
        return

    # Mask the [0, start_idx) tokens of the prompt with
    # PAD_SLOT_ID, where start_idx is max(0, seq_len -
    # sliding_window). For example, if the prompt len is 10,
    # sliding window is 8, and block size is 4, the first two
    # tokens are masked and the slot mapping will be
    # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
    padding_mask_len = max(0, start_idx - context_len)
    slot_mapping.extend([PAD_SLOT_ID] * padding_mask_len)

    range_start = max(start_idx, context_len)
    range_end = seq_len
    numel = range_end - range_start
    block_table = block_tables[seq_id]

    # numpy implementation will be faster than python if we have
    # many elements, otherwise it will be slower.
    if numel < _COMPUTE_SLOT_MAPPING_NUMPY_NUMEL:
        _compute_slot_mapping_python(slot_mapping, block_table, range_start,
                                     range_end, block_size)
    else:
        _compute_slot_mapping_numpy(slot_mapping, block_table, range_start,
                                    range_end, block_size)
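As a quick illustration of what the slot-mapping helpers above compute, here is a self-contained Python sketch with made-up block-table values (not part of the diff): token position i lands in physical block block_table[i // block_size] at offset i % block_size.

# Minimal sketch of the slot-mapping arithmetic above (illustrative values only).
block_size = 4
block_table = [7, 2, 9]          # hypothetical physical block ids for one sequence
context_len, seq_len = 2, 10     # tokens 2..9 are newly scheduled

slot_mapping = []
for i in range(context_len, seq_len):
    block_number = block_table[i // block_size]
    block_offset = i % block_size
    slot_mapping.append(block_number * block_size + block_offset)

print(slot_mapping)  # [30, 31, 8, 9, 10, 11, 36, 37]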
TAttentionMetadata = TypeVar("TAttentionMetadata", bound='AttentionMetadata')


class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

    _metadata_cls: Type[TAttentionMetadata]

    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner

        self.sliding_window = input_builder.sliding_window
        self.block_size = input_builder.block_size

    def prepare(self):
        self.slot_mapping: List[int] = []
        self.prefill_seq_lens: List[int] = []
        self.context_lens: List[int] = []
        self.block_tables: List[List[int]] = []
        self.curr_seq_lens: List[int] = []
        self.num_prefills = 0
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0

    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
        is_prompt = inter_data.is_prompt
        block_tables = inter_data.block_tables

        for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
             curr_sliding_window_block) in zip(
                 inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
                 inter_data.orig_seq_lens, inter_data.seq_lens,
                 inter_data.query_lens, inter_data.context_lens,
                 inter_data.curr_sliding_window_blocks):
            self.context_lens.append(context_len)
            if is_prompt:
                self.num_prefills += 1
                self.num_prefill_tokens += token_len
                self.prefill_seq_lens.append(seq_len)
            else:
                assert query_len == 1, (
                    "seq_len: {}, context_len: {}, query_len: {}".format(
                        seq_len, context_len, query_len))
                self.num_decode_tokens += query_len
                self.curr_seq_lens.append(curr_seq_len)

            # Compute block table.
            # TODO(sang): Combine chunked prefill and prefix caching by
            # only allowing multiple of block_size chunk size.
            # NOTE: This only works for oooooooxxx style attention.
            block_table = []
            if inter_data.prefix_cache_hit:
                block_table = block_tables[seq_id]
            elif ((chunked_prefill_enabled or not is_prompt)
                  and block_tables is not None):
                if curr_sliding_window_block == 0:
                    block_table = block_tables[seq_id]
                else:
                    block_table = block_tables[seq_id][
                        -curr_sliding_window_block:]
            self.block_tables.append(block_table)

            # Compute slot mapping.
            is_profile_run = is_block_tables_empty(block_tables)
            start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
                                                       context_len,
                                                       self.sliding_window)
            compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
                                 seq_len, context_len, start_idx,
                                 self.block_size, inter_data.block_tables)

    def build(self, seq_lens: List[int], query_lens: List[int],
              cuda_graph_pad_size: int, batch_size: int):
        """Build attention metadata with on-device tensors.

        Args:
            seq_lens: The maybe padded sequence lengths of the input sequences.
            query_lens: The query lengths of the input sequences.
            cuda_graph_pad_size: The padding size for cuda graph.
                                 -1 if cuda graph is not used.
            batch_size: The maybe padded batch size.
        """
        for inter_data in self.input_builder.inter_data_list:
            self._add_seq_group(inter_data,
                                self.input_builder.chunked_prefill_enabled)

        device = self.runner.device
        use_captured_graph = cuda_graph_pad_size != -1

        max_query_len = max(query_lens)
        max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
        max_decode_seq_len = max(self.curr_seq_lens, default=0)
        num_decode_tokens = self.num_decode_tokens
        query_start_loc = list(accumulate(query_lens, initial=0))
        seq_start_loc = list(accumulate(seq_lens, initial=0))

        if use_captured_graph:
            self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
            self.block_tables.extend([] * cuda_graph_pad_size)
            num_decode_tokens = batch_size

            # The shape of graph_block_tables is
            # [max batch size, max context len // block size].
            input_block_tables = self.runner.graph_block_tables[:batch_size]
            for i, block_table in enumerate(self.block_tables):
                if block_table:
                    input_block_tables[i, :len(block_table)] = block_table
            block_tables = torch.from_numpy(input_block_tables).to(
                device, non_blocking=True)
        else:
            block_tables = make_tensor_with_pad(
                self.block_tables,
                pad=0,
                dtype=torch.int,
                device=device,
            )
        assert max_query_len > 0, "query_lens: {}".format(query_lens)

        assert device is not None
        context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                               device, self.runner.pin_memory)
        seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                           self.runner.pin_memory)
        slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                               device, self.runner.pin_memory)
        query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
                                                  device,
                                                  self.runner.pin_memory)
        seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                device, self.runner.pin_memory)

        return self._metadata_cls(  # type: ignore
            num_prefills=self.num_prefills,
            slot_mapping=slot_mapping_tensor,
            enable_kv_scales_calculation=True,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=max_query_len,
            max_prefill_seq_len=max_prefill_seq_len,
            max_decode_seq_len=max_decode_seq_len,
            query_start_loc=query_start_loc_tensor,
            seq_start_loc=seq_start_loc_tensor,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=use_captured_graph,
        )


class CommonAttentionState(AttentionState):

    def __init__(self, runner):
        self.runner = runner
        self._is_graph_capturing = False

    @contextmanager
    def graph_capture(self, max_batch_size: int):
        self._is_graph_capturing = True

        self._graph_slot_mapping = torch.full((max_batch_size, ),
                                              PAD_SLOT_ID,
                                              dtype=torch.long,
                                              device=self.runner.device)
        self._graph_seq_lens = torch.ones(max_batch_size,
                                          dtype=torch.int32,
                                          device=self.runner.device)
        self._graph_block_tables = torch.from_numpy(
            self.runner.graph_block_tables).to(device=self.runner.device)

        yield

        self._is_graph_capturing = False
        del self._graph_slot_mapping
        del self._graph_seq_lens
        del self._graph_block_tables

    def graph_clone(self, batch_size: int) -> "CommonAttentionState":
        assert self._is_graph_capturing
        return self.__class__(self.runner)

    def graph_capture_get_metadata_for_batch(
            self, batch_size: int, is_encoder_decoder_model: bool = False):
        assert self._is_graph_capturing
        attn_metadata = self.runner.attn_backend.make_metadata(
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=batch_size,
            slot_mapping=self._graph_slot_mapping[:batch_size],
            enable_kv_scales_calculation=True,
            seq_lens=None,
            seq_lens_tensor=self._graph_seq_lens[:batch_size],
            max_query_len=1,
            max_decode_query_len=1,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.runner.max_model_len,
            query_start_loc=None,
            seq_start_loc=None,
            context_lens_tensor=None,
            block_tables=self._graph_block_tables[:batch_size],
            use_cuda_graph=True,
        )
        if is_encoder_decoder_model:
            # The encoder decoder model works only with XFormers and
            # Flash Attention backend. Assert the same.
            assert self.runner.attn_backend.get_name() in \
                ["XFORMERS", "FLASH_ATTN"], \
                f"Expected attn_backend name to be either 'XFORMERS' or " \
                f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'"
            self._update_captured_metadata_for_enc_dec_model(
                batch_size=batch_size, attn_metadata=attn_metadata)

        return attn_metadata

    def get_graph_input_buffers(
            self,
            attn_metadata,
            is_encoder_decoder_model: bool = False) -> Dict[str, Any]:
        input_buffers = {
            "slot_mapping": attn_metadata.slot_mapping,
            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
            "block_tables": attn_metadata.decode_metadata.block_tables,
        }
        if is_encoder_decoder_model:
            # The encoder decoder model works only with XFormers and
            # Flash Attention backend. Assert the same.
            assert self.runner.attn_backend.get_name() in \
                ["XFORMERS", "FLASH_ATTN"], \
                f"Expected attn_backend name to be either 'XFORMERS' or " \
                f"'FLASH_ATTN', but got '{self.runner.attn_backend.get_name()}'"
            self._add_additional_input_buffers_for_enc_dec_model(
                attn_metadata=attn_metadata, input_buffers=input_buffers)
        return input_buffers

    def prepare_graph_input_buffers(
            self,
            input_buffers,
            attn_metadata,
            is_encoder_decoder_model: bool = False) -> None:
        input_buffers["seq_lens_tensor"].copy_(
            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
        input_buffers["block_tables"].copy_(
            attn_metadata.decode_metadata.block_tables, non_blocking=True)
        if is_encoder_decoder_model:
            # The encoder decoder model works only with XFormers and
            # Flash Attention backend. Assert the same.
            assert self.runner.attn_backend.get_name() in\
                ["XFORMERS", "FLASH_ATTN"], \
                f"Expected attn_backend name to be either 'XFORMERS' or "\
                f"'FLASH_ATTN', but "\
                f"got '{self.runner.attn_backend.get_name()}'"
            self._prepare_input_buffers_for_enc_dec_model(
                attn_metadata, input_buffers)

    def begin_forward(self, model_input) -> None:
        return

    def _update_captured_metadata_for_enc_dec_model(self, batch_size: int,
                                                    attn_metadata):
        """
        Updates the attention metadata parameters for CUDA graph capture in an
        encoder-decoder model.

        This method modifies attention-related tensors and metadata required
        for CUDA graph capture in encoder-decoder models. Specifically, it
        updates the cross-attention and encoder sequence tensors in the
        AttentionMetadata object.
        """
        # During decode phase the cross_slot_mapping will be empty. Hence set
        # an empty tensor for CUDA Graph capture.
        attn_metadata.cross_slot_mapping = torch.tensor(
            [], dtype=torch.int).cuda()
        attn_metadata.cross_block_tables = torch.full(
            (batch_size, self.runner.get_max_block_per_batch()),
            1,
            dtype=torch.int).cuda()
        attn_metadata.encoder_seq_lens = torch.full((batch_size, ),
                                                    1,
                                                    dtype=torch.int).cuda()
        attn_metadata.encoder_seq_lens_tensor = torch.full(
            (batch_size, ), 1, dtype=torch.int).cuda()
        attn_metadata.max_encoder_seq_len = self.runner.max_model_len
        attn_metadata.num_encoder_tokens = 0

    def _add_additional_input_buffers_for_enc_dec_model(
            self, attn_metadata, input_buffers: Dict[str, Any]):
        """
        Saves additional input buffers specific to the encoder-decoder model
        from the attention metadata.

        This method extracts and stores encoder-decoder related input buffers
        from the `attn_metadata` into the `input_buffers` dictionary. The
        buffers include encoder sequence lengths, cross-slot mappings, and
        cross-block tables, which are essential for the encoder-decoder model
        during CUDA graph replay.
        """
        input_buffers["encoder_seq_lens_tensor"] = (
            attn_metadata.decode_metadata.encoder_seq_lens_tensor)
        input_buffers["cross_slot_mapping"] = (
            attn_metadata.decode_metadata.cross_slot_mapping)
        input_buffers["cross_block_tables"] = (
            attn_metadata.decode_metadata.cross_block_tables)

    def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata,
                                                 input_buffers: Dict[str,
                                                                     Any]):
        """
        Populates input buffers with data from the encoder-decoder model's
        attention metadata.

        This method fills the input buffers with encoder-decoder specific
        tensors. It copies data from the `attn_metadata` and keyword arguments
        (`kwargs`) into corresponding buffers in the `input_buffers` dictionary.
        The copied data includes attention-related metadata as well as input
        IDs and positional information for the encoder.
        """
        input_buffers["encoder_seq_lens_tensor"].copy_(
            attn_metadata.decode_metadata.encoder_seq_lens_tensor,
            non_blocking=True)
        input_buffers["cross_slot_mapping"].copy_(
            attn_metadata.decode_metadata.cross_slot_mapping,
            non_blocking=True)
        input_buffers["cross_block_tables"].copy_(
            attn_metadata.decode_metadata.cross_block_tables,
            non_blocking=True)


def is_all_encoder_attn_metadata_set(attn_metadata):
    '''
    All attention metadata required for encoder attention is set.
    '''
    return ((attn_metadata.encoder_seq_lens is not None)
            and (attn_metadata.encoder_seq_lens_tensor is not None)
            and (attn_metadata.max_encoder_seq_len is not None))


def is_all_cross_attn_metadata_set(attn_metadata):
    '''
    All attention metadata required for enc/dec cross-attention is set.

    Superset of encoder attention required metadata.
    '''
    return (attn_metadata.is_all_encoder_attn_metadata_set
            and (attn_metadata.cross_slot_mapping is not None)
            and (attn_metadata.cross_block_tables is not None))


def get_seq_len_block_table_args(
    attn_metadata,
    is_prompt: bool,
    attn_type: str,
) -> tuple:
    '''
    The particular choice of sequence-length- and block-table-related
    attributes which should be extracted from attn_metadata is dependent
    on the type of attention operation.

    Decoder attn -> select entirely decoder self-attention-related fields
    Encoder/decoder cross-attn -> select encoder sequence lengths &
                                  cross-attn block-tables fields
    Encoder attn -> select encoder sequence lengths fields & no block tables

    Arguments:

    * attn_metadata: Attention metadata structure associated with attention op
    * is_prompt: True if prefill, False otherwise
    * attn_type: encoder attention, decoder self-attention,
                 encoder/decoder cross-attention

    Returns:

    * Appropriate sequence-lengths tensor
    * Appropriate max sequence-length scalar
    * Appropriate block tables (or None)
    '''

    if attn_type == AttentionType.DECODER:
        # Decoder self-attention
        # Choose max_seq_len based on whether we are in prompt_run
        if is_prompt:
            max_seq_len = attn_metadata.max_prefill_seq_len
        else:
            max_seq_len = attn_metadata.max_decode_seq_len
        return (attn_metadata.seq_lens_tensor, max_seq_len,
                attn_metadata.block_tables)
    elif attn_type == AttentionType.ENCODER_DECODER:
        # Enc/dec cross-attention KVs match encoder sequence length;
        # cross-attention utilizes special "cross" block tables
        return (attn_metadata.encoder_seq_lens_tensor,
                attn_metadata.max_encoder_seq_len,
                attn_metadata.cross_block_tables)
    elif attn_type == AttentionType.ENCODER:
        # No block tables associated with encoder attention
        return (attn_metadata.encoder_seq_lens_tensor,
                attn_metadata.max_encoder_seq_len, None)
    else:
        raise AttributeError(f"Invalid attention type {str(attn_type)}")


def get_num_prefill_decode_query_kv_tokens(
    attn_metadata,
    attn_type: str,
) -> Tuple[int, int, int]:
    """
    Calculate the number of prefill and decode tokens for query, key/value
    based on the attention metadata and the specified attention type.

    Args:
        attn_metadata (AttentionMetadata): Attention Metadata object.
        attn_type (AttentionType): The type of attention being used.
    Returns:
        Tuple[int, int, int]: A tuple containing three integers:
            - The number of prefill query tokens.
            - The number of prefill key/value tokens.
            - The number of decode query tokens.

    Raises:
        AssertionError: If the number of encoder tokens in `attn_metadata`
        is `None` when required for the calculations.
    """
    num_prefill_query_tokens = 0
    num_decode_query_tokens = 0
    num_prefill_kv_tokens = 0
    if attn_type == AttentionType.ENCODER:
        # Encoder attention is only invoked during prefill phase.
        # The same input serves as both query and key.
        assert attn_metadata.num_encoder_tokens is not None
        num_prefill_query_tokens = attn_metadata.num_encoder_tokens
        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
        num_decode_query_tokens = 0
    elif attn_type == AttentionType.ENCODER_DECODER:
        assert attn_metadata.num_encoder_tokens is not None
        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
        # The key is the encoder/cross-attention.
        num_prefill_kv_tokens = attn_metadata.num_encoder_tokens
        num_decode_query_tokens = attn_metadata.num_decode_tokens
    else:  # attn_type == AttentionType.DECODER or
        # attn_type == AttentionType.ENCODER_ONLY
        num_prefill_query_tokens = attn_metadata.num_prefill_tokens
        num_prefill_kv_tokens = attn_metadata.num_prefill_tokens
        num_decode_query_tokens = attn_metadata.num_decode_tokens

    return (num_prefill_query_tokens, num_prefill_kv_tokens,
            num_decode_query_tokens)


@dataclass
class MLADims:
@@ -22,7 +22,10 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    GroupShape)
from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.platforms import _Backend, current_platform
from vllm.utils import GiB_bytes, direct_register_custom_op
@@ -247,6 +250,13 @@ class Attention(nn.Module, AttentionLayerBase):
                "This may be caused by insufficient memory to allocate "
                "kv cache.") from e

        # for attn backends supporting query quantization
        self.query_quant = None
        if self.kv_cache_dtype.startswith(
                "fp8") and self.attn_backend.supports_quant_query_input:
            self.query_quant = QuantFP8(static=True,
                                        group_shape=GroupShape.PER_TENSOR)

    def forward(
        self,
        query: torch.Tensor,
@@ -270,11 +280,22 @@ class Attention(nn.Module, AttentionLayerBase):
            attn_metadata = get_forward_context().attn_metadata
            if attn_metadata.enable_kv_scales_calculation:
                self.calc_kv_scales(query, key, value)

        output_dtype = query.dtype
        if self.query_quant is not None:
            # quantizing with a simple torch operation enables
            # torch.compile to fuse this into previous ops
            # which reduces overheads during decoding.
            # Otherwise queries are quantized using custom ops
            # which causes decoding overheads
            assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
            query, _ = self.query_quant(query, self._q_scale)

        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
            output = torch.zeros(output_shape,
                                 dtype=query.dtype,
                                 dtype=output_dtype,
                                 device=query.device)
            hidden_size = output_shape[-1]
            # We skip reshaping query, key and value tensors for the MLA
@@ -343,7 +364,7 @@ class Attention(nn.Module, AttentionLayerBase):
            self.impl.process_weights_after_loading(act_dtype)

        # FlashInfer requires attention sinks to be float32
        if (self.backend == _Backend.FLASHINFER_VLLM_V1
        if (self.backend == _Backend.FLASHINFER
                and hasattr(self.impl, 'sinks')):
            from vllm.v1.attention.backends.flashinfer import FlashInferImpl
            assert isinstance(self.impl, FlashInferImpl)
@@ -399,21 +420,17 @@ class MultiHeadAttention(nn.Module):

        self.attn_backend = backend if backend in {
            _Backend.TORCH_SDPA,
            _Backend.TORCH_SDPA_VLLM_V1,
            _Backend.XFORMERS,
            _Backend.PALLAS_VLLM_V1,
            _Backend.PALLAS,
            _Backend.ROCM_AITER_FA,
            _Backend.FLASH_ATTN,
            _Backend.FLASH_ATTN_VLLM_V1,
        } else _Backend.TORCH_SDPA

        if (self.attn_backend == _Backend.XFORMERS
                and not check_xformers_availability()):
            self.attn_backend = _Backend.TORCH_SDPA

        if self.attn_backend in {
                _Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1
        }:
        if self.attn_backend == _Backend.FLASH_ATTN:
            if use_upstream_fa:
                from flash_attn import flash_attn_varlen_func
                self._flash_attn_varlen_func = flash_attn_varlen_func
@@ -447,11 +464,7 @@ class MultiHeadAttention(nn.Module):
        key = torch.repeat_interleave(key, num_repeat, dim=2)
        value = torch.repeat_interleave(value, num_repeat, dim=2)

        if self.attn_backend in {
                _Backend.FLASH_ATTN,
                _Backend.FLASH_ATTN_VLLM_V1,
        }:

        if self.attn_backend == _Backend.FLASH_ATTN:
            cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
                                        step=q_len,
                                        dtype=torch.int32,
@@ -478,8 +491,7 @@ class MultiHeadAttention(nn.Module):
                key,
                value,
                scale=self.scale)
        elif (self.attn_backend == _Backend.TORCH_SDPA
              or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1):
        elif self.attn_backend == _Backend.TORCH_SDPA:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            out = F.scaled_dot_product_attention(query,
@@ -487,7 +499,7 @@ class MultiHeadAttention(nn.Module):
                                                 value,
                                                 scale=self.scale)
            out = out.transpose(1, 2)
        elif self.attn_backend == _Backend.PALLAS_VLLM_V1:
        elif self.attn_backend == _Backend.PALLAS:
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            from torch_xla.experimental.custom_kernel import flash_attention
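The hunks above route fp8-KV-cache models through a static per-tensor query quantization before the attention call. As a rough, hedged illustration of the arithmetic involved — this is plain PyTorch, not vLLM's QuantFP8 op, and the scale and shapes are made up — a minimal sketch:

import torch

def quant_fp8_per_tensor(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Per-tensor static FP8 (e4m3) quantization: scale, clamp, cast.
    finfo = torch.finfo(torch.float8_e4m3fn)
    q = (x.float() / scale).clamp(finfo.min, finfo.max)
    return q.to(torch.float8_e4m3fn)

q = torch.randn(8, 32, 128)     # [tokens, heads, head_dim] (hypothetical)
q_scale = torch.tensor(0.05)    # hypothetical static scale
q_fp8 = quant_fp8_per_tensor(q, q_scale)
print(q_fp8.dtype)              # torch.float8_e4m3fn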
@@ -18,12 +18,14 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
    final attention output.

    Args:
        output: [ B, H, D ]
        lses : [ N, B, H ]
        cp, batch, q_heads, v_head_dim
    Return:
        output: [ B, H, D ]
        lse : [ B, H ]
        outputs_ptr (triton.PointerType):
            Pointer to input tensor of shape [ B, H, D ]
        lses_ptr (triton.PointerType):
            Pointer to input tensor of shape [ N, B, H ]
        new_output_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H, D ]
        vlse_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H ]
    """
    batch_idx = tl.program_id(axis=0).to(tl.int64)
    head_idx = tl.program_id(axis=1).to(tl.int64)
@@ -81,19 +83,19 @@ class CPTritonContext:
        self.inner_kernel[grid](*regular_args)


def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
                     ctx: CPTritonContext):
    """
    Apply the all-gathered lses to correct each local rank's attention
    output. we still need perform a cross-rank reduction to obtain the
    final attention output.
def correct_attn_out(
        out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
        ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]:
    """Correct the attention output using the all-gathered lses.

    Args:
        output: [ B, H, D ]
        lses : [ N, B, H ]
    Return:
        output: [ B, H, D ]
        lse : [ B, H ]
        out: Tensor of shape [ B, H, D ]
        lses: Tensor of shape [ N, B, H ]
        cp_rank: Current rank in the context-parallel group
        ctx: Triton context to avoid recompilation

    Returns:
        Tuple of (out, lse) with corrected attention and final log-sum-exp.
    """
    if ctx is None:
        ctx = CPTritonContext()
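For readers of the diff, the correction above is the standard log-sum-exp merge of partial attention outputs: each rank's output is rescaled by exp(lse_rank - lse_total) and then summed across ranks. A plain-PyTorch reference sketch (shapes follow the docstring; the actual Triton kernel may differ in detail):

import torch

def correct_attn_out_ref(out: torch.Tensor, lses: torch.Tensor, cp_rank: int):
    # out:  [B, H, D] partial attention output of this rank
    # lses: [N, B, H] log-sum-exp values gathered from all N ranks
    lse_total = torch.logsumexp(lses, dim=0)        # [B, H]
    factor = torch.exp(lses[cp_rank] - lse_total)   # [B, H]
    return out * factor.unsqueeze(-1), lse_total

N, B, H, D = 2, 2, 4, 8
outs = torch.randn(N, B, H, D)
lses = torch.randn(N, B, H)
corrected = [correct_attn_out_ref(outs[r], lses, r)[0] for r in range(N)]
merged = sum(corrected)  # the cross-rank reduction mentioned in the docstring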
@@ -474,12 +474,14 @@ def _decode_grouped_att_m_fwd(
def _fwd_kernel_stage2(
    Mid_O,
    o,
    lse,
    B_Seqlen,
    stride_mid_ob,
    stride_mid_oh,
    stride_mid_os,
    stride_obs,
    stride_oh,
    stride_lse_bs,
    NUM_KV_SPLITS: tl.constexpr,
    BLOCK_DV: tl.constexpr,
    Lv: tl.constexpr,
@@ -525,12 +527,18 @@ def _fwd_kernel_stage2(
        acc / e_sum,
        mask=mask_d,
    )
    lse_val = e_max + tl.log(e_sum)
    tl.store(
        lse + cur_batch * stride_lse_bs + cur_head,
        lse_val,
    )


def _decode_softmax_reducev_fwd(
    logits,
    q,
    o,
    lse,
    v_buffer,
    b_seq_len,
    num_kv_splits,
@@ -555,12 +563,14 @@ def _decode_softmax_reducev_fwd(
    _fwd_kernel_stage2[grid](
        logits,
        o,
        lse,
        b_seq_len,
        logits.stride(0),
        logits.stride(1),
        logits.stride(2),
        o.stride(0),
        o.stride(1),
        lse.stride(0),
        NUM_KV_SPLITS=NUM_KV_SPLITS,
        BLOCK_DV=BLOCK_DV,
        Lv=Lv,
@@ -575,6 +585,7 @@ def decode_attention_fwd_normal(
    k_buffer,
    v_buffer,
    o,
    lse,
    req_to_token,
    b_seq_len,
    attn_logits,
@@ -595,7 +606,7 @@ def decode_attention_fwd_normal(
        page_size,
        logit_cap,
    )
    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
    _decode_softmax_reducev_fwd(attn_logits, q, o, lse, v_buffer, b_seq_len,
                                num_kv_splits)


@@ -604,6 +615,7 @@ def decode_attention_fwd_grouped(
    k_buffer,
    v_buffer,
    o,
    lse,
    req_to_token,
    b_seq_len,
    attn_logits,
@@ -624,7 +636,7 @@ def decode_attention_fwd_grouped(
        page_size,
        logit_cap,
    )
    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len,
    _decode_softmax_reducev_fwd(attn_logits, q, o, lse, v_buffer, b_seq_len,
                                num_kv_splits)


@@ -633,6 +645,7 @@ def decode_attention_fwd(
    k_buffer,
    v_buffer,
    o,
    lse,
    req_to_token,
    b_seq_len,
    attn_logits,
@@ -651,6 +664,7 @@ def decode_attention_fwd(
        k_buffer,
        v_buffer,
        o,
        lse,
        req_to_token,
        b_seq_len,
        attn_logits,
@@ -666,6 +680,7 @@ def decode_attention_fwd(
        k_buffer,
        v_buffer,
        o,
        lse,
        req_to_token,
        b_seq_len,
        attn_logits,
@@ -137,7 +137,7 @@ def triton_reshape_and_cache_flash(

    # heuristics instead of autotuning
    TILE_SIZE = min(2048, triton.next_power_of_2(n))
    if torch.version.hip:
    if torch.version.hip or torch.version.xpu:
        num_stages = 4
        num_warps = 8
    else:  # cuda
@@ -186,6 +186,14 @@ def _cached_get_attn_backend(
    # Check the environment variable and override if specified
    backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
    if backend_by_env_var is not None:
        if backend_by_env_var.endswith("_VLLM_V1"):
            logger.warning(
                "The suffix '_VLLM_V1' in the environment variable "
                "%s is no longer necessary as V0 backends have been "
                "deprecated. Please remove this suffix from your "
                "environment variable setting.", STR_BACKEND_ENV_VAR)
            backend_by_env_var = backend_by_env_var.removesuffix(
                "_VLLM_V1")
        selected_backend = backend_name_to_enum(backend_by_env_var)
        if selected_backend is None:
            raise ValueError(
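The selector hunk above now tolerates the deprecated "_VLLM_V1" suffix by warning and stripping it. A self-contained sketch of just that suffix handling (standalone helper, not the vLLM function itself):

import logging

logger = logging.getLogger("attn-backend-selector")

def normalize_backend_name(name: str) -> str:
    # Mirror of the suffix handling above, in isolation.
    if name.endswith("_VLLM_V1"):
        logger.warning("The '_VLLM_V1' suffix is no longer necessary; "
                       "please drop it from the environment variable.")
        name = name.removesuffix("_VLLM_V1")
    return name

assert normalize_backend_name("TRITON_ATTN_VLLM_V1") == "TRITON_ATTN"
assert normalize_backend_name("FLASH_ATTN") == "FLASH_ATTN"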
@@ -270,6 +270,7 @@ class VllmConfig:
                    f"{model_config.dtype} is not supported for quantization "
                    f"method {model_config.quantization}. Supported dtypes: "
                    f"{supported_dtypes}")
            quant_config.maybe_update_config(model_config.model)
            return quant_config
        return None

@@ -364,9 +365,11 @@ class VllmConfig:
                self.compilation_config.cudagraph_mode = \
                    CUDAGraphMode.FULL_AND_PIECEWISE

            # pooling model does not support full cudagraphs
            # pooling models and encoder-decoder models
            # do not support full cudagraphs
            if self.model_config is not None and \
                self.model_config.pooler_config is not None:
                (self.model_config.pooler_config is not None
                 or self.model_config.is_encoder_decoder):
                self.compilation_config.cudagraph_mode = \
                    CUDAGraphMode.PIECEWISE
        else:
@@ -384,19 +387,7 @@ class VllmConfig:
        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.cpu_offload_gb > 0 and \
            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
                and not envs.VLLM_USE_V1:
            logger.warning(
                "CPU offload is not supported with `torch.compile` in v0 yet."
                " Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        if self.cache_config.kv_sharing_fast_prefill:
            if not envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not supported "
                    "in V0 currently.")

            if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
@@ -410,14 +401,6 @@ class VllmConfig:
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")

        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                and self.compilation_config.level
                != CompilationLevel.NO_COMPILATION):
            logger.warning(
                "LoRA for V0 is not supported with `torch.compile` yet. "
                "Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        disable_chunked_prefill_reasons: list[str] = []

        if self.model_config:
@@ -545,23 +528,6 @@ class VllmConfig:
                # local attention.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True

        def has_blocked_weights():
            if self.quant_config is not None:
                if hasattr(self.quant_config, "weight_block_size"):
                    return self.quant_config.weight_block_size is not None
                elif hasattr(self.quant_config, "has_blocked_weights"):
                    return self.quant_config.has_blocked_weights()
            return False

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "none" not in custom_ops and "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

    def update_sizes_for_sequence_parallelism(self,
                                              possible_sizes: list) -> list:
        # remove the sizes that not multiple of tp_size when
@@ -621,57 +587,27 @@ class VllmConfig:
        """

        # calculate the default `batch_size_capture_list`
        if not envs.VLLM_USE_V1:
            batch_size_capture_list = []
            if self.scheduler_config is not None and \
                self.model_config is not None and \
                    not self.model_config.enforce_eager:

                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    possible_sizes = self.update_sizes_for_sequence_parallelism(
                        possible_sizes)

                # find the minimum size that is larger than max_num_seqs,
                # which then becomes the max_batchsize_to_capture
                larger_sizes = [
                    x for x in possible_sizes
                    if x >= self.scheduler_config.max_num_seqs
                ]
                if larger_sizes:
                    max_batchsize_to_capture = larger_sizes[0]
                else:
                    max_batchsize_to_capture = possible_sizes[-1]

                # filter out the sizes that are
                # larger than max_batchsize_to_capture
                batch_size_capture_list = [
                    size for size in possible_sizes
                    if size <= max_batchsize_to_capture
                ]
        else:
            batch_size_capture_list = []
            if self.model_config is not None and \
                not self.model_config.enforce_eager:
                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
                if len(cuda_graph_sizes) == 1:
                    batch_size_capture_list = [1, 2, 4] + [
                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
                    ]
                elif len(cuda_graph_sizes) > 1:
                    batch_size_capture_list = sorted(cuda_graph_sizes)
                else:
                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    batch_size_capture_list = \
                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
                max_num_tokens = self.scheduler_config.max_num_batched_tokens
                batch_size_capture_list = [
                    size for size in batch_size_capture_list
                    if size <= max_num_tokens
        batch_size_capture_list = []
        if self.model_config is not None and \
            not self.model_config.enforce_eager:
            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
            if len(cuda_graph_sizes) == 1:
                batch_size_capture_list = [1, 2, 4] + [
                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
                ]
            elif len(cuda_graph_sizes) > 1:
                batch_size_capture_list = sorted(cuda_graph_sizes)
            else:
                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
            if self.parallel_config.tensor_parallel_size > 1 and \
                self.compilation_config.pass_config.enable_sequence_parallelism:
                batch_size_capture_list = \
                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
            max_num_tokens = self.scheduler_config.max_num_batched_tokens
            batch_size_capture_list = [
                size for size in batch_size_capture_list
                if size <= max_num_tokens
            ]

        self.compilation_config.init_with_cudagraph_sizes(
            batch_size_capture_list)
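The hunk above keeps only the V1 path for building the CUDA-graph capture list. A standalone sketch of that surviving logic (hypothetical helper name and inputs, not the VllmConfig method itself):

def capture_sizes(cuda_graph_sizes: list[int], max_num_tokens: int) -> list[int]:
    # One entry -> expand to [1, 2, 4, 8, 16, ...]; several entries -> use as-is.
    if len(cuda_graph_sizes) == 1:
        sizes = [1, 2, 4] + list(range(8, cuda_graph_sizes[0] + 1, 8))
    elif len(cuda_graph_sizes) > 1:
        sizes = sorted(cuda_graph_sizes)
    else:
        raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
    # Drop sizes the scheduler can never batch.
    return [s for s in sizes if s <= max_num_tokens]

print(capture_sizes([512], max_num_tokens=64))
# [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]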
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
from pydantic import TypeAdapter, field_validator
from pydantic.dataclasses import dataclass

import vllm.envs as envs
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.config.utils import config
from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
    don't all have access to full configuration - that would create a cycle as
    the `PassManager` is set as a property of config."""

    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
    enable_fusion: bool = False
    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
    enable_attn_fusion: bool = False
    """Whether to enable the custom attention+quant fusion pass."""
    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
    enable_noop: bool = False
    """Whether to enable the custom no-op elimination pass."""
    enable_sequence_parallelism: bool = False
    """Whether to enable sequence parallelism."""
@@ -14,7 +14,6 @@ from pydantic import (ConfigDict, SkipValidation, field_validator,
                      model_validator)
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from typing_extensions import assert_never

import vllm.envs as envs
from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
@@ -64,13 +63,12 @@ ConvertType = Literal["none", "embed", "classify", "reward"]
ConvertOption = Literal["auto", ConvertType]
TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
                     "score", "reward", "transcription", "draft"]
_ResolvedTask = Literal["generate", "transcription", "encode", "embed",
                        "classify", "reward", "draft"]
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits",
                       "processed_logprobs"]
HfOverrides = Union[dict[str, Any], Callable[[type], type]]
HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
                                             PretrainedConfig]]
ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]

_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
@@ -285,6 +283,7 @@ class ModelConfig:
    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
    interleave_mm_strings: InitVar[Optional[bool]] = None
    skip_mm_profiling: InitVar[Optional[bool]] = None
    video_pruning_rate: InitVar[Optional[float]] = None

    def compute_hash(self) -> str:
        """
@@ -313,6 +312,7 @@ class ModelConfig:
        factors.append(self.override_generation_config)
        factors.append(self.rope_scaling)
        factors.append(self.rope_theta)
        factors.append(self.video_pruning_rate)

        # hf_config can control how the model looks!
        try:
@@ -340,17 +340,19 @@ class ModelConfig:
        return hashlib.sha256(str(factors).encode()).hexdigest()

    def __post_init__(
            self,
            # Multimodal config init vars
            limit_mm_per_prompt: Optional[dict[str, int]],
            media_io_kwargs: Optional[dict[str, dict[str, Any]]],
            mm_processor_kwargs: Optional[dict[str, Any]],
            mm_processor_cache_gb: Optional[float],
            mm_processor_cache_type: Optional[MMCacheType],
            mm_shm_cache_max_object_size_mb: Optional[int],
            mm_encoder_tp_mode: Optional[MMEncoderTPMode],
            interleave_mm_strings: Optional[bool],
            skip_mm_profiling: Optional[bool]) -> None:
        self,
        # Multimodal config init vars
        limit_mm_per_prompt: Optional[dict[str, int]],
        media_io_kwargs: Optional[dict[str, dict[str, Any]]],
        mm_processor_kwargs: Optional[dict[str, Any]],
        mm_processor_cache_gb: Optional[float],
        mm_processor_cache_type: Optional[MMCacheType],
        mm_shm_cache_max_object_size_mb: Optional[int],
        mm_encoder_tp_mode: Optional[MMEncoderTPMode],
        interleave_mm_strings: Optional[bool],
        skip_mm_profiling: Optional[bool],
        video_pruning_rate: Optional[float],
    ) -> None:
        # Set the default seed to 0 in V1.
        # NOTE(woosuk): In V0, we set the default seed to None because the
        # driver worker shares the same process as the user process, and thus
@@ -534,9 +536,6 @@ class ModelConfig:
                    f"You can pass `--convert {convert_option} to adapt "
                    "it into a pooling model.")

        self.supported_tasks = self._get_supported_tasks(
            architectures, self.runner_type, self.convert_type)

        # Note: Initialize these attributes early because transformers fallback
        # may fail to load dynamic modules in child processes
        model_info, arch = registry.inspect_model_cls(architectures, self)
@@ -617,6 +616,7 @@ class ModelConfig:
            mm_encoder_tp_mode=mm_encoder_tp_mode,
            interleave_mm_strings=interleave_mm_strings,
            skip_mm_profiling=skip_mm_profiling,
            video_pruning_rate=video_pruning_rate,
        )

        mm_config_kwargs = {
@@ -834,27 +834,6 @@ class ModelConfig:

        return convert_type

    def _get_supported_generation_tasks(
        self,
        architectures: list[str],
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry

        if registry.is_transcription_only_model(architectures, self):
            return ["transcription"]

        # TODO: Use get_supported_generation_tasks once V0 is removed
        supported_tasks = list[_ResolvedTask]()
        if (registry.is_text_generation_model(architectures, self)
                or convert_type in _RUNNER_CONVERTS["generate"]):
            supported_tasks.append("generate")

        if registry.is_transcription_model(architectures, self):
            supported_tasks.append("transcription")

        return supported_tasks

    def _get_default_pooling_task(
        self,
        architectures: list[str],
@@ -872,42 +851,6 @@ class ModelConfig:

        return "embed"

    def _get_supported_pooling_tasks(
        self,
        architectures: list[str],
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        registry = self.registry

        # TODO: Use get_supported_pooling_tasks once V0 is removed
        supported_tasks = list[_ResolvedTask]()
        if (registry.is_pooling_model(architectures, self)
                or convert_type in _RUNNER_CONVERTS["pooling"]):
            supported_tasks.append("encode")

            extra_task = (self._get_default_pooling_task(architectures)
                          if convert_type == "none" else convert_type)
            supported_tasks.append(extra_task)

        return supported_tasks

    def _get_supported_tasks(
        self,
        architectures: list[str],
        runner_type: RunnerType,
        convert_type: ConvertType,
    ) -> list[_ResolvedTask]:
        if runner_type == "generate":
            return self._get_supported_generation_tasks(
                architectures, convert_type)
        if runner_type == "pooling":
            return self._get_supported_pooling_tasks(architectures,
                                                     convert_type)
        if runner_type == "draft":
            return ["draft"]

        assert_never(runner_type)

    def _parse_quant_hf_config(self, hf_config: PretrainedConfig):
        quant_cfg = getattr(hf_config, "quantization_config", None)
        if quant_cfg is None:
@@ -1131,7 +1074,8 @@ class ModelConfig:
        if not hasattr(self.hf_text_config, "model_type"):
            return False
        elif self.hf_text_config.model_type in \
            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'):
            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp',
             'kimi_k2', 'longcat_flash'):
            return self.hf_text_config.kv_lora_rank is not None
        elif self.hf_text_config.model_type == 'eagle':
            # if the model is an EAGLE module, check for the
@@ -1257,6 +1201,9 @@ class ModelConfig:
              or self.hf_config.model_type == "qwen3_next_mtp"):
            total_num_hidden_layers = getattr(self.hf_text_config,
                                              "num_nextn_predict_layers", 0)
        elif (self.hf_config.model_type == "longcat_flash_mtp"):
            total_num_hidden_layers = getattr(self.hf_text_config,
                                              "num_nextn_predict_layers", 1)
        else:
            total_num_hidden_layers = getattr(self.hf_text_config,
                                              "num_hidden_layers", 0)
@@ -78,6 +78,11 @@ class MultiModalConfig:
    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
    video_pruning_rate: Optional[float] = None
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
    """

    def compute_hash(self) -> str:
        """
@@ -118,3 +123,7 @@ class MultiModalConfig:
        """
        kwargs = self.mm_processor_kwargs or {}
        return kwargs | dict(inference_kwargs)

    def is_multimodal_pruning_enabled(self):
        return (self.video_pruning_rate is not None
                and self.video_pruning_rate > 0)
@@ -31,7 +31,8 @@ logger = init_logger(__name__)

SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                            "mlp_speculator", "draft_model", "deepseek_mtp",
                            "ernie_mtp", "qwen3_next_mtp", "mimo_mtp"]
                            "ernie_mtp", "qwen3_next_mtp", "mimo_mtp",
                            "longcat_flash_mtp"]


@config
@@ -186,6 +187,13 @@ class SpeculativeConfig:
                "n_predict": n_predict,
                "architectures": ["Qwen3NextMTP"]
            })
        if hf_config.model_type == "longcat_flash":
            hf_config.model_type = "longcat_flash_mtp"
            n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
            hf_config.update({
                "n_predict": n_predict,
                "architectures": ["LongCatFlashMTPModel"]
            })

        return hf_config

@@ -201,12 +209,9 @@ class SpeculativeConfig:

        if self.model is None and self.num_speculative_tokens is not None:
            # TODO(Shangming): Refactor mtp configuration logic when supporting
            # mtp acceleration for more models besides deepseek_v3
            if self.target_model_config and \
                (self.target_model_config.hf_text_config.model_type \
                    == "deepseek_v3" or
                    self.target_model_config.hf_text_config.model_type in
                    ("mimo","ernie4_5_moe", "qwen3_next")):
            if (self.target_model_config
                    and self.target_model_config.hf_text_config.model_type
                    in ("deepseek_v3", "mimo", "ernie4_5_moe", "qwen3_next")):
                # use the draft model from the same model:
                self.model = self.target_model_config.model
                # Align the quantization of draft model for cases such as
@@ -216,8 +221,9 @@ class SpeculativeConfig:
            elif self.method in ("ngram", "[ngram]"):
                self.model = "ngram"
            else:
                raise ValueError("num_speculative_tokens was provided without "
                                 "speculative model.")
                raise ValueError(
                    "num_speculative_tokens was provided but without "
                    "speculative model.")

        # Automatically configure the method for ngram when "model" is used
        # instead of "method"
@@ -332,6 +338,15 @@ class SpeculativeConfig:
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            elif (self.draft_model_config.hf_config.model_type
                  in ("longcat_flash_mtp")):
                self.method = "longcat_flash_mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
                        "LongCat MTP models only have " \
                        "one layer. Might need some code changes " \
                        "to support multiple layers."
                    )
            else:
                self.method = "draft_model"
                raise NotImplementedError(
@@ -525,7 +540,7 @@ class SpeculativeConfig:
                "speculative decoding is > 1, but got "
                f"{self.disable_by_batch_size=}")

        eagle3_target_supported = ["llama", "qwen", "gpt_oss"]
        eagle3_target_supported = ["llama", "qwen", "minicpm", "gpt_oss"]
        if self.method == "eagle3" and self.target_model_config and not any(
                supported_model in
                self.target_model_config.hf_text_config.model_type
@@ -548,7 +563,7 @@ class SpeculativeConfig:

    def use_eagle(self) -> bool:
        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
                               "qwen3_next_mtp")
                               "qwen3_next_mtp", "longcat_flash_mtp")

    def __repr__(self) -> str:
        method = self.method
@@ -10,9 +10,15 @@ from vllm.distributed import get_dp_group
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.utils import has_deep_ep, has_pplx
from vllm.utils.flashinfer import has_flashinfer_all2all

from .base_device_communicator import All2AllManagerBase, Cache

if has_flashinfer_all2all():
    from flashinfer.comm import Mapping
    from flashinfer.comm.mnnvl import MnnvlConfig
    from flashinfer.comm.trtllm_alltoall import MnnvlMoe

logger = init_logger(__name__)


@@ -47,24 +53,22 @@ class NaiveAll2AllManager(All2AllManagerBase):

    def dispatch(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
        cu_tokens_across_dp_cpu = get_forward_context(
        ).dp_metadata.cu_tokens_across_dp_cpu
        sizes = get_forward_context(
        ).dp_metadata.get_chunk_sizes_across_dp_rank()
        hidden_states, router_logits = get_dp_group().all_gatherv(
            [hidden_states, router_logits],
            dim=0,
            sizes=sizes,
        )

        hidden_states = self.naive_multicast(hidden_states,
                                             cu_tokens_across_dp_cpu)
        router_logits = self.naive_multicast(router_logits,
                                             cu_tokens_across_dp_cpu)
        return hidden_states, router_logits

    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
        cu_tokens_across_dp_cpu = get_forward_context(
        ).dp_metadata.cu_tokens_across_dp_cpu
        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
            self.dp_rank - 1]
        end = cu_tokens_across_dp_cpu[self.dp_rank]

        all_hidden_states = self.dp_group.all_reduce(hidden_states)
        hidden_states = all_hidden_states[start:end, :]
        sizes = get_forward_context(
        ).dp_metadata.get_chunk_sizes_across_dp_rank()
        hidden_states = get_dp_group().reduce_scatterv(hidden_states,
                                                       dim=0,
                                                       sizes=sizes)
        return hidden_states

    def destroy(self):
@@ -300,4 +304,95 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):

    # DeepEP LL uses RDMA so no SMs are used for communication
    def max_sms_used(self) -> Optional[int]:
        return 0
        return 0


class FlashInferAllToAllManager(All2AllManagerBase):
    """
    All2All communication based on flashinfer kernels.
    """

    def __init__(self, cpu_group):
        assert has_flashinfer_all2all(
        ), "flashinfer all2all module not found. Please install/check flashinfer"  # noqa
        super().__init__(cpu_group)
        logger.debug(
            "Initialize for flashinfer All2All "
            "rank=%d, world size=%d", self.rank, self.world_size)
        self.initialized = False
        self.alltoall_info = None

    def initialize(
        self,
        world_size: int,
        rank: int,
        gpus_per_node: int,
    ):
        """Initialize workspace"""
        if self.initialized:
            return

        self.cleanup()
        logger.debug("making map: "
                     "rank=%d, world size=%d", rank, world_size)
        self.mapping = Mapping(
            world_size,
            rank,
            gpus_per_node,
            tp_size=world_size,
        )

        from vllm.distributed.device_communicators.mnnvl_compat import (
            CustomCommunicator)
        dp_config = MnnvlConfig(
            comm_backend=CustomCommunicator(get_dp_group().cpu_group),
            fabric_page_size=1 << 29,  # 512MB
            allocation_granularity=0  # Auto-detect
        )

        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(
            self.mapping, dp_config)
        self.prepare_workspace_tensor = MnnvlMoe.get_moe_prepare_workspace(
            self.mapping, dp_config)

        self.world_size = world_size
        self.rank = rank
        self.gpus_per_node = gpus_per_node
        self.initialized = True

        logger.info("FlashInfer All2All initialized for rank %s, size %s",
                    rank, world_size)

    def ensure_alltoall_workspace_initialized(self):
        """Ensure workspace is initialized"""
        if not has_flashinfer_all2all():
            return False

        if self.world_size <= 1:
            return False

        if not self.initialized:
            self.initialize(
                world_size=self.world_size,
                rank=self.rank,
                gpus_per_node=torch.cuda.device_count,
            )
        return self.initialized

    def get_handle(self, kwargs):
        return self

    def cleanup(self):
        """Clean up workspace"""
        if self.initialized and self.workspace_tensor is not None \
            and self.prepare_workspace_tensor is not None:
            try:
                del self.workspace_tensor
                del self.prepare_workspace_tensor
            except Exception as e:
                logger.warning("Failed to cleanup FlashInfer workspace: %s", e)
            finally:
                self.workspace_tensor = None
                self.prepare_workspace_tensor = None
                self.mapping = None
                self.initialized = False
@@ -114,6 +114,11 @@ class CudaCommunicator(DeviceCommunicatorBase):
                from .all2all import DeepEPLLAll2AllManager
                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
                logger.info("Using DeepEP Low-Latency all2all manager.")
            elif all2all_backend == "flashinfer_all2allv":
                from .all2all import FlashInferAllToAllManager
                self.all2all_manager = FlashInferAllToAllManager(
                    self.cpu_group)
                logger.info("Using Flashinfer all2allv manager.")
            else:
                raise ValueError(f"Unknown all2all backend: {all2all_backend}")
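The NaiveAll2AllManager change above swaps the cumulative-token bookkeeping for variable-size collectives: dispatch all-gathers per-rank chunks of different sizes, combine sums the partials and scatters each rank its own slice back. A single-process emulation of that size bookkeeping (hypothetical sizes; the real code uses torch.distributed collectives on a DP group):

import torch

sizes = [3, 1, 2]                               # tokens per DP rank (made up)
chunks = [torch.randn(n, 4) for n in sizes]     # each rank's hidden_states

# dispatch: all_gatherv ~ every rank sees the concatenation of all chunks
gathered = torch.cat(chunks, dim=0)             # [sum(sizes), hidden]

# ... expert computation would run on `gathered` on every rank ...
partials = [gathered.clone() for _ in sizes]    # pretend per-rank results

# combine: reduce_scatterv ~ sum the partials, keep only this rank's slice
summed = torch.stack(partials).sum(dim=0)
offsets = torch.tensor([0] + sizes).cumsum(0)
rank = 1
mine = summed[offsets[rank]:offsets[rank + 1]]  # [sizes[rank], hidden]
print(mine.shape)                               # torch.Size([1, 4])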
vllm/distributed/device_communicators/mnnvl_compat.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch.distributed as dist
from flashinfer.comm.mnnvl import CommBackend as CommBackend

from vllm.utils.flashinfer import has_flashinfer_all2all

assert has_flashinfer_all2all(), "Flashinfer alltoallv module cannot be found"


class CustomCommunicator(CommBackend):

    def __init__(self, group):
        self._group = group

    def Get_rank(self) -> int:
        return self._group.rank()

    def Get_size(self) -> int:
        return self._group.size()

    def allgather(self, data: int):
        gathered = [None] * self.Get_size()
        dist.all_gather_object(gathered, data, group=self._group)
        return gathered

    def Split(self, color: int, key: int) -> 'CustomCommunicator':
        return self
@@ -178,7 +178,7 @@ class RayPPCommunicator(Communicator):

    def recv(
        self,
        shape: tuple[int],
        shape: tuple[int, ...],
        dtype: "torch.dtype",
        peer_rank: int,
        allocator: TorchTensorAllocator,
@ -136,8 +136,8 @@ class ShmRingBuffer:
|
||||
self.shared_memory = shared_memory.SharedMemory(
|
||||
create=True, size=self.total_bytes_of_buffer)
|
||||
# initialize the metadata section to 0
|
||||
with memoryview(self.shared_memory.buf[self.metadata_offset:]
|
||||
) as metadata_buffer:
|
||||
with self.shared_memory.buf[self.
|
||||
metadata_offset:] as metadata_buffer:
|
||||
torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
|
||||
else:
|
||||
# we are opening an existing buffer
|
||||
@ -182,14 +182,14 @@ class ShmRingBuffer:
|
||||
def get_data(self, current_idx: int):
|
||||
start = self.data_offset + current_idx * self.max_chunk_bytes
|
||||
end = start + self.max_chunk_bytes
|
||||
with memoryview(self.shared_memory.buf[start:end]) as buf:
|
||||
with self.shared_memory.buf[start:end] as buf:
|
||||
yield buf
|
||||
|
||||
@contextmanager
|
||||
def get_metadata(self, current_idx: int):
|
||||
start = self.metadata_offset + current_idx * self.metadata_size
|
||||
end = start + self.metadata_size
|
||||
with memoryview(self.shared_memory.buf[start:end]) as buf:
|
||||
with self.shared_memory.buf[start:end] as buf:
|
||||
yield buf
|
||||
|
||||
|
||||
@ -387,22 +387,21 @@ class MessageQueue:
                    # Release the processor to other threads
                    sched_yield()

                    # if we wait for a long time, log a message
                    if (time.monotonic() - start_time
                            > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
                        logger.debug(
                            ("No available shared memory broadcast block found"
                             " in %s seconds. This typically happens when some"
                             " processes are hanging."),
                            VLLM_RINGBUFFER_WARNING_INTERVAL,
                        )
                        n_warning += 1

                    # if we time out, raise an exception
                    if (timeout is not None
                            and time.monotonic() - start_time > timeout):
                    elapsed = time.monotonic() - start_time
                    if timeout is not None and elapsed > timeout:
                        raise TimeoutError

                    # if we wait for a long time, log a message
                    if elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning:
                        logger.info(
                            "No available shared memory broadcast block found"
                            " in %s seconds. This typically happens when some"
                            " processes are hanging or doing some"
                            " time-consuming work (e.g. compilation)",
                            VLLM_RINGBUFFER_WARNING_INTERVAL)
                        n_warning += 1

                    continue
                # found a block that is either
                # (1) not written
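The MessageQueue hunk restructures the writer's spin loop: time.monotonic() is sampled once per iteration into elapsed, the timeout check now runs before the progress warning so a timed-out caller fails promptly, and the warning is rate-limited by growing the n_warning multiplier. A standalone sketch of that polling pattern (the block_is_free predicate and the interval constant are illustrative):

import logging
import time

logger = logging.getLogger(__name__)
WARNING_INTERVAL = 60  # seconds; illustrative stand-in for the vLLM setting


def wait_for_block(block_is_free, timeout=None):
    """Spin until block_is_free() is True, warning at most once per interval."""
    start_time = time.monotonic()
    n_warning = 1
    while not block_is_free():
        time.sleep(0)  # yield the processor, like sched_yield() above

        elapsed = time.monotonic() - start_time
        # Timeout is checked first so the caller is not kept spinning just to log.
        if timeout is not None and elapsed > timeout:
            raise TimeoutError

        # Fires after 1x, 2x, 3x, ... the interval, never more often.
        if elapsed > WARNING_INTERVAL * n_warning:
            logger.info(
                "No available shared memory broadcast block found in %s seconds.",
                WARNING_INTERVAL)
            n_warning += 1


wait_for_block(lambda: True)           # returns immediately
# wait_for_block(lambda: False, 0.1)   # would raise TimeoutError after ~0.1 s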
@ -431,7 +430,8 @@ class MessageQueue:
    @contextmanager
    def acquire_read(self,
                     timeout: Optional[float] = None,
                     cancel: Optional[Event] = None):
                     cancel: Optional[Event] = None,
                     indefinite: bool = False):
        assert self._is_local_reader, "Only readers can acquire read"
        start_time = time.monotonic()
        n_warning = 1
@ -451,25 +451,26 @@ class MessageQueue:
                    # Release the processor to other threads
                    self._read_spin_timer.spin()

                    # if we wait for a long time, log a message
                    if (time.monotonic() - start_time
                            > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
                        logger.debug(
                            ("No available shared memory broadcast block found"
                             " in %s seconds. This typically happens when some"
                             " processes are hanging."),
                            VLLM_RINGBUFFER_WARNING_INTERVAL,
                        )
                        n_warning += 1

                    if cancel is not None and cancel.is_set():
                        raise RuntimeError("cancelled")

                    # if we time out, raise an exception
                    if (timeout is not None
                            and time.monotonic() - start_time > timeout):
                    elapsed = time.monotonic() - start_time
                    if timeout is not None and elapsed > timeout:
                        raise TimeoutError

                    # if we wait for a long time, log a message
                    if not indefinite and (elapsed
                                           > VLLM_RINGBUFFER_WARNING_INTERVAL *
                                           n_warning):
                        logger.info(
                            "No available shared memory broadcast block found"
                            " in %s seconds. This typically happens when some"
                            " processes are hanging or doing some"
                            " time-consuming work (e.g. compilation).",
                            VLLM_RINGBUFFER_WARNING_INTERVAL)
                        n_warning += 1

                    continue
                # found a block that is not read by this reader
                # let caller read from the buffer
@ -503,10 +504,11 @@ class MessageQueue:

    def dequeue(self,
                timeout: Optional[float] = None,
                cancel: Optional[Event] = None):
                cancel: Optional[Event] = None,
                indefinite: bool = False):
        """ Read from message queue with optional timeout (in seconds) """
        if self._is_local_reader:
            with self.acquire_read(timeout, cancel) as buf:
            with self.acquire_read(timeout, cancel, indefinite) as buf:
                overflow = buf[0] == 1
                if not overflow:
                    # no need to know the size of serialized object
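The last three hunks thread a new indefinite flag from dequeue() into acquire_read(): a reader that is expected to block for a long time on purpose can suppress the periodic "no available block" warning, while timeout and cancel keep their existing behavior. A toy, self-contained sketch of how the flag is forwarded (TinyQueue and its Event-based readiness are illustrative, not the vLLM implementation):

import logging
import time
from contextlib import contextmanager
from threading import Event
from typing import Optional

logger = logging.getLogger(__name__)
WARNING_INTERVAL = 60  # seconds; illustrative


class TinyQueue:
    """Toy reader showing how `indefinite` is threaded through."""

    def __init__(self):
        self._ready = Event()

    @contextmanager
    def acquire_read(self,
                     timeout: Optional[float] = None,
                     cancel: Optional[Event] = None,
                     indefinite: bool = False):
        start_time = time.monotonic()
        n_warning = 1
        while not self._ready.is_set():
            if cancel is not None and cancel.is_set():
                raise RuntimeError("cancelled")
            elapsed = time.monotonic() - start_time
            if timeout is not None and elapsed > timeout:
                raise TimeoutError
            # indefinite=True silences the periodic warning for readers that
            # are expected to block for a long time on purpose.
            if not indefinite and elapsed > WARNING_INTERVAL * n_warning:
                logger.info("still waiting after %s seconds", WARNING_INTERVAL)
                n_warning += 1
            time.sleep(0.01)
        yield b"payload"

    def dequeue(self,
                timeout: Optional[float] = None,
                cancel: Optional[Event] = None,
                indefinite: bool = False):
        # The flag is simply forwarded, exactly as in the hunk above.
        with self.acquire_read(timeout, cancel, indefinite) as buf:
            return bytes(buf)


q = TinyQueue()
q._ready.set()
print(q.dequeue(timeout=1.0, indefinite=True))  # b'payload'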
Some files were not shown because too many files have changed in this diff.