Update Optional[x] to x | None and Union[x, y] to x | y (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-12 17:51:31 +01:00
Committed by: GitHub
Parent: 9bb38130cb
Commit: 8fcaaf6a16
944 changed files with 9490 additions and 10121 deletions
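
The change is mechanical throughout the diff: every `Optional[x]` annotation becomes `x | None`, every `Union[x, y]` becomes `x | y`, and where an import line is touched anyway, names like `Callable` move from `typing` to `collections.abc`. Because the PEP 604 syntax is evaluated natively on Python 3.10+, a few files also drop their `from __future__ import annotations` shim. A minimal before/after sketch of the pattern; the `fmt_size` function is invented for illustration and is not part of this commit:

```python
# Before: pre-PEP 604 spellings pulled from the typing module
from typing import Callable, Optional, Union

def fmt_size(value: Union[int, float],
             unit: Optional[str] = None,
             render: Optional[Callable[[str], str]] = None) -> str:
    text = f"{value} {unit or 'B'}"
    return render(text) if render else text

# After: PEP 604 unions (Python 3.10+); Callable now comes from collections.abc
from collections.abc import Callable

def fmt_size(value: int | float,
             unit: str | None = None,
             render: Callable[[str], str] | None = None) -> str:
    text = f"{value} {unit or 'B'}"
    return render(text) if render else text
```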

View File

@ -8,7 +8,6 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: Optional[str] = None
request_id: Optional[str] = None
language: str | None = None
request_id: str | None = None
@dataclass
@ -52,7 +51,7 @@ class RequestFuncOutput:
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -133,7 +132,7 @@ async def async_request_tgi(
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@ -204,7 +203,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@ -367,7 +366,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):

View File

@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import PreTrainedTokenizerBase
@ -80,7 +79,7 @@ def sample_requests_from_dataset(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -128,7 +127,7 @@ def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
prefix_len: int,
) -> list[Request]:
requests = []

View File

@ -7,7 +7,6 @@ import dataclasses
import json
import random
import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -24,7 +23,7 @@ def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

View File

@ -32,7 +32,6 @@ import uuid
import warnings
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import Optional
import datasets
import numpy as np
@ -316,7 +315,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@ -436,9 +435,9 @@ async def benchmark(
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
max_concurrency: Optional[int],
max_concurrency: int | None,
structured_output_ratio: float,
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]

View File

@ -6,7 +6,7 @@ import math
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any
def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self._max: int | None = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else:
self._max = max(self._max, v)
def avg(self) -> Union[float, str]:
def avg(self) -> float | str:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> Union[float, str]:
def max(self) -> float | str:
return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[Union[float, str]]:
def dump_avg_max(self) -> list[float | str]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
self.collect(time.monotonic_ns() - self.start_time)

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark

View File

@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark
@ -53,7 +52,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
@ -108,7 +107,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
@ -183,7 +182,7 @@ def bench(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:

View File

@ -3,10 +3,9 @@
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
out, _ = ops.rms_norm_dynamic_per_token_quant(

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from collections.abc import Callable
from unittest.mock import patch
import pandas as pd

View File

@ -22,8 +22,8 @@ Example:
import json
import os
import time
from collections.abc import Callable
from contextlib import nullcontext
from typing import Callable, Optional
import torch
import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single(
self,
sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
should_use_fn: Callable[[torch.Tensor], bool],
context,
num_warmup: int,
num_trials: int,
) -> Optional[float]:
) -> float | None:
"""Benchmark method with CUDA graph optimization."""
try:
# Create test tensor (2D: sequence_length x hidden_size)

View File

@ -6,11 +6,12 @@ import copy
import json
import pickle
import time
from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -158,7 +159,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor,
scaling: float,
add_inputs: Optional[bool],
add_inputs: bool | None,
):
"""
Torch group gemm reference implementation to test correctness of
@ -316,8 +317,8 @@ class BenchmarkContext:
lora_rank: int
sort_by_lora_id: bool
dtype: torch.dtype
seq_length: Optional[int] = None
num_slices: Optional[int] = None # num_slices for slice based ops
seq_length: int | None = None
num_slices: int | None = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self)
@ -561,7 +562,7 @@ class BenchmarkTensors:
}
def bench_fn_kwargs(
self, op_type: OpType, add_inputs: Optional[bool] = None
self, op_type: OpType, add_inputs: bool | None = None
) -> dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
@ -575,7 +576,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
self, op_type: OpType, expand_fn_add_inputs: bool | None
) -> bool:
"""
Test correctness of op_type implementation against a grouped gemm
@ -611,8 +612,8 @@ def bench_optype(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: Optional[bool] = None,
cuda_graph_nops: int | None = None,
expand_fn_add_inputs: bool | None = None,
test_correctness: bool = False,
) -> TMeasurement:
assert arg_pool_size >= 1
@ -679,7 +680,7 @@ def bench_torch_mm(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
cuda_graph_nops: Optional[int] = None,
cuda_graph_nops: int | None = None,
) -> TMeasurement:
"""
Benchmark basic torch.mm as a roofline.
@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
compare = TBenchmark.Compare(timers)
compare.print()

View File

@ -8,10 +8,9 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
import pandas as pd
import torch
@ -63,23 +62,23 @@ class BenchmarkTensors:
a: torch.Tensor
w_q: torch.Tensor
group_size: Optional[int]
group_size: int | None
wtype: ScalarType
w_g_s: torch.Tensor
w_g_zp: Optional[torch.Tensor]
w_ch_s: Optional[torch.Tensor]
w_tok_s: Optional[torch.Tensor]
w_g_zp: torch.Tensor | None
w_ch_s: torch.Tensor | None
w_tok_s: torch.Tensor | None
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
group_zero_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
output_type: torch.dtype | None
group_scale_type: torch.dtype | None
group_zero_type: torch.dtype | None
channel_scale_type: torch.dtype | None
token_scale_type: torch.dtype | None
def rand_data(shape, dtype=torch.float16, scale=1):
@ -93,8 +92,8 @@ def quantize_and_pack(
atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
stype: torch.dtype | None,
group_size: int | None,
zero_points: bool = False,
):
assert wtype.is_integer(), "TODO: support floating point weights"
@ -113,7 +112,7 @@ def quantize_and_pack(
def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
) -> list[BenchmarkTensors]:
m, n, k = shape
@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
def bench(

View File

@ -3,7 +3,6 @@
import random
import time
from typing import Optional
import torch
@ -37,7 +36,7 @@ def main(
seed: int,
do_profile: bool,
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
kv_cache_dtype: str | None = None,
) -> None:
current_platform.seed_everything(seed)

View File

@ -3,8 +3,8 @@
import argparse
import math
from collections.abc import Callable
from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch
import torch

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union
import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
residual: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -65,7 +64,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
residual: Optional[torch.Tensor] = None,
residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional
import nvtx
import torch
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
rotary_dim: int | None,
dtype: torch.dtype,
seed: int,
device: str,

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_decode(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
from typing import Optional
import flashinfer
import torch
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_prefill(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable
from typing import Any, Callable, Optional
from collections.abc import Callable, Iterable
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__(
self,
cuda_graph_params: Optional[CudaGraphBenchParams],
cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple, Optional, Union
from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution):
def __init__(
self,
min_val: Union[int, float],
max_val: Union[int, float],
min_val: int | float,
max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution):
def __init__(self, value: Union[int, float]) -> None:
def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution):
def __init__(
self,
mean: Optional[float] = None,
sigma: Optional[float] = None,
average: Optional[int] = None,
median_ratio: Optional[float] = None,
max_val: Optional[int] = None,
mean: float | None = None,
sigma: float | None = None,
average: int | None = None,
median_ratio: float | None = None,
max_val: int | None = None,
) -> None:
self.average = average
self.median_ratio = median_ratio

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple, Union
from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
@ -636,7 +636,7 @@ async def client_main(
if args.verbose:
curr_time_sec: float = time.perf_counter()
time_since_last_turn: Union[str, float] = "N/A"
time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
@ -928,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
runtime_left_sec: Union[str, float] = round(
runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:

View File

@ -13,7 +13,7 @@ import argparse
import json
import random
from statistics import mean
from typing import Any, Optional
from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid(
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: Optional[int],
min_content_len: Optional[int] = None,
max_content_len: Optional[int] = None,
min_turns: Optional[int] = None,
max_turns: Optional[int] = None,
model: Optional[str] = None,
max_items: int | None,
min_content_len: int | None = None,
max_content_len: int | None = None,
min_turns: int | None = None,
max_turns: int | None = None,
model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
from typing import Union
from cutlass_library import *
@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
**DataTypeNames, # type: ignore
**{
VLLMDataType.u4b8: "u4b8",
@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
},
}
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
**DataTypeTag, # type: ignore
**{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
},
}
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
**DataTypeSize, # type: ignore
**{
VLLMDataType.u4b8: 4,
@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
},
}
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4",
@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.bf16: "vllm::kBfloat16",
}
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.f32: "at::ScalarType::Float",
}
VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
**KernelScheduleTag, # type: ignore
**{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -9,7 +9,6 @@ from collections.abc import Iterable
from copy import deepcopy
from dataclasses import dataclass, fields
from functools import reduce
from typing import Optional, Union
import jinja2
from vllm_cutlass_library_extension import (
@ -259,7 +258,7 @@ class ScheduleConfig:
@dataclass(frozen=True)
class TypeConfig:
a: DataType
b: Union[DataType, VLLMDataType]
b: DataType | VLLMDataType
b_group_scale: DataType
b_group_zeropoint: DataType
b_channel_scale: DataType
@ -280,7 +279,7 @@ class PrepackTypeConfig:
class ImplConfig:
types: TypeConfig
schedules: list[ScheduleConfig]
heuristic: list[tuple[Optional[str], ScheduleConfig]]
heuristic: list[tuple[str | None, ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

View File

@ -16,7 +16,7 @@ Declare supported languages and capabilities:
??? code "supported_languages and supports_transcription_only"
```python
from typing import ClassVar, Mapping, Optional, Literal
from typing import ClassVar, Mapping, Literal
import numpy as np
import torch
from torch import nn
@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
language: Optional[str],
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: Optional[str],
to_language: str | None,
) -> PromptType:
# Example with a free-form instruction prompt
task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
language: Optional[str],
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: Optional[str],
to_language: str | None,
) -> PromptType:
if language is None:
raise ValueError("Language must be specified")
@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
??? code "validate_language()"
```python
@classmethod
def validate_language(cls, language: Optional[str]) -> Optional[str]:
def validate_language(cls, language: str | None) -> str | None:
if language is None:
logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
audio_duration_s: float,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
) -> Optional[int]:
) -> int | None:
# Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example
```

View File

@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
from collections.abc import Sequence
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
import torch
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
@abstractmethod
def update_state(
self,
batch_update: Optional["BatchUpdate"],
batch_update: "BatchUpdate" | None,
) -> None:
"""Called when there are new output tokens, prior
to each forward pass.
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
* Use the `BatchUpdate` members to update logits processor internal state
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.

View File

@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
??? code "Example custom logits processor definition"
``` python
from typing import Optional
import torch
from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
"""Never impacts greedy sampling"""
return False
def update_state(self, batch_update: Optional[BatchUpdate]):
def update_state(self, batch_update: BatchUpdate | None):
if not batch_update:
return

View File

@ -10,7 +10,7 @@ on HuggingFace model repository.
import os
from dataclasses import asdict
from typing import Any, NamedTuple, Optional
from typing import Any, NamedTuple
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
@ -30,11 +30,11 @@ question_per_audio_count = {
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: Optional[str] = None
prompt_token_ids: Optional[dict[str, list[int]]] = None
multi_modal_data: Optional[dict[str, Any]] = None
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
prompt: str | None = None
prompt_token_ids: dict[str, list[int]] | None = None
multi_modal_data: dict[str, Any] | None = None
stop_token_ids: list[int] | None = None
lora_requests: list[LoRARequest] | None = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

View File

@ -3,7 +3,7 @@
# ruff: noqa: E501
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
def get_finished(
self, finished_req_ids: set[str]
) -> tuple[Optional[set[str]], Optional[set[str]]]:
) -> tuple[set[str] | None, set[str] | None]:
if self._async_load:
meta = self._get_connector_metadata()
assert isinstance(meta, RogueSharedStorageConnectorMetadata)

View File

@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
------------------------------------------------------------
"""
from typing import Optional
import torch
from vllm import LLM, SamplingParams
@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
def is_argmax_invariant(self) -> bool:
return False
def update_state(self, batch_update: Optional[BatchUpdate]):
def update_state(self, batch_update: BatchUpdate | None):
process_dict_updates(
self.req_info,
batch_update,

View File

@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
------------------------------------------------------------
"""
from typing import Any, Optional
from typing import Any
import torch
@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
def new_req_logits_processor(
self,
params: SamplingParams,
) -> Optional[RequestLogitsProcessor]:
) -> RequestLogitsProcessor | None:
"""This method returns a new request-level logits processor, customized
to the `target_token` value associated with a particular request.
@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
Returns:
`Callable` request logits processor, or None
"""
target_token: Optional[Any] = params.extra_args and params.extra_args.get(
target_token: Any | None = params.extra_args and params.extra_args.get(
"target_token"
)
if target_token is None:

View File

@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
device, the first and third requests would not repeat the same token.
"""
from typing import Optional
import torch
from vllm import LLM, SamplingParams
@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
def new_req_logits_processor(
self,
params: SamplingParams,
) -> Optional[RequestLogitsProcessor]:
) -> RequestLogitsProcessor | None:
"""This method returns a new request-level logits processor, customized
to the `target_token` value associated with a particular request.

View File

@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
"""
import gc
from typing import Optional
import torch
from huggingface_hub import snapshot_download
@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest
def create_test_prompts(
lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
return [
# this is an example of using quantization without LoRA
(
@ -56,7 +55,7 @@ def create_test_prompts(
def process_requests(
engine: LLMEngine,
test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
@ -78,7 +77,7 @@ def process_requests(
def initialize_engine(
model: str, quantization: str, lora_repo: Optional[str]
model: str, quantization: str, lora_repo: str | None
) -> LLMEngine:
"""Initialize the LLMEngine."""

View File

@ -7,8 +7,6 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
"""
from typing import Optional
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest
def create_test_prompts(
lora_path: str,
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
@ -68,7 +66,7 @@ def create_test_prompts(
def process_requests(
engine: LLMEngine,
test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0

View File

@ -3,7 +3,6 @@
import argparse
import datetime
import os
from typing import Union
import albumentations
import numpy as np
@ -160,7 +159,7 @@ def load_example(
file_paths: list[str],
mean: list[float] = None,
std: list[float] = None,
indices: Union[list[int], None] = None,
indices: list[int] | None = None,
):
"""Build an input example by loading images in *file_paths*.

View File

@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from typing import Callable, Optional, TypedDict
from collections.abc import Callable
from typing import TypedDict
import torch
import zmq
@ -71,7 +72,7 @@ class WorkerExtension:
def rebuild_ipc(
handle: tuple[Callable, tuple], device_id: Optional[int] = None
handle: tuple[Callable, tuple], device_id: int | None = None
) -> torch.Tensor:
func, args = handle
list_args = list(args)
@ -109,7 +110,7 @@ class ColocateWorkerExtension:
self._zmq_ctx = zmq.Context()
socket = self._zmq_ctx.socket(zmq.REP)
socket.connect(zmq_handles[self.report_device_id()])
buffer: Optional[torch.Tensor] = None
buffer: torch.Tensor | None = None
while True:
payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
socket.recv_pyobj()

View File

@ -12,7 +12,7 @@ import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple, Optional
from typing import NamedTuple
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: list[str]
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
stop_token_ids: list[int] | None = None
lora_requests: list[LoRARequest] | None = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

View File

@ -9,7 +9,7 @@ using the chat template defined by the model.
import os
from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple, Optional
from typing import NamedTuple
from huggingface_hub import snapshot_download
from PIL.Image import Image
@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
image_data: list[Image]
stop_token_ids: Optional[list[int]] = None
chat_template: Optional[str] = None
lora_requests: Optional[list[LoRARequest]] = None
stop_token_ids: list[int] | None = None
chat_template: str | None = None
lora_requests: list[LoRARequest] | None = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@ -1251,7 +1251,7 @@ model_example_map = {
}
def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory

View File

@ -11,7 +11,7 @@ on HuggingFace model repository.
from argparse import Namespace
from dataclasses import asdict
from pathlib import Path
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
from PIL.Image import Image
@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):
QueryModality = Literal["text", "image", "text+image", "text+images"]
Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: Optional[str] = None
image: Optional[Image] = None
query: Optional[str] = None
documents: Optional[ScoreMultiModalParam] = None
prompt: str | None = None
image: Image | None = None
query: str | None = None
documents: ScoreMultiModalParam | None = None
def run_clip(query: Query) -> ModelRequestData:
@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
def run_encode(model: str, modality: QueryModality, seed: int | None):
query = get_query(modality)
req_data = model_example_map[model](query)
@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: Optional[int]):
def run_score(model: str, modality: QueryModality, seed: int | None):
query = get_query(modality)
req_data = model_example_map[model](query)

View File

@ -23,7 +23,7 @@ import logging
import os
import sys
from abc import ABC, abstractmethod
from typing import Callable, Optional
from collections.abc import Callable
import aiohttp
import requests
@ -49,12 +49,9 @@ class Proxy:
decode_instances: list[str],
model: str,
scheduling_policy: SchedulingPolicy,
custom_create_completion: Optional[
Callable[[Request], StreamingResponse]
] = None,
custom_create_chat_completion: Optional[
Callable[[Request], StreamingResponse]
] = None,
custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
custom_create_chat_completion: Callable[[Request], StreamingResponse]
| None = None,
):
self.prefill_instances = prefill_instances
self.decode_instances = decode_instances
@ -348,9 +345,9 @@ class ProxyServer:
def __init__(
self,
args: argparse.Namespace,
scheduling_policy: Optional[SchedulingPolicy] = None,
create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
scheduling_policy: SchedulingPolicy | None = None,
create_completion: Callable[[Request], StreamingResponse] | None = None,
create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
):
self.validate_parsed_serve_args(args)
self.port = args.port

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional, Union
from typing import Any
import msgspec
import zmq
@ -25,16 +25,16 @@ class KVCacheEvent(
class BlockStored(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
parent_block_hash: Optional[ExternalBlockHash]
parent_block_hash: ExternalBlockHash | None
token_ids: list[int]
block_size: int
lora_id: Optional[int]
medium: Optional[str]
lora_id: int | None
medium: str | None
class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: Optional[str]
medium: str | None
class AllBlocksCleared(KVCacheEvent):
@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):
class KVEventBatch(EventBatch):
events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
events: list[BlockStored | BlockRemoved | AllBlocksCleared]
def process_event(event_batch):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from typing import Optional
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -43,7 +42,7 @@ async def main():
)
prompt = "Who won the 2004 World Series?"
final_output: Optional[RequestOutput] = None
final_output: RequestOutput | None = None
async for output in engine_client.generate(
prompt=prompt,
sampling_params=sampling_params,

View File

@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
run: vllm serve BAAI/bge-reranker-base
"""
from typing import Union
import cohere
from cohere import Client, ClientV2
@ -25,7 +23,7 @@ documents = [
def cohere_rerank(
client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
client: Client | ClientV2, model: str, query: str, documents: list[str]
) -> dict:
return client.rerank(model=model, query=query, documents=documents)

View File

@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
import argparse
import base64
import io
from typing import Literal, Union
from typing import Literal
from openai import OpenAI
from openai._types import NOT_GIVEN, NotGiven
@ -29,7 +29,7 @@ def create_chat_embeddings(
*,
messages: list[ChatCompletionMessageParam],
model: str,
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
) -> CreateEmbeddingResponse:
"""
Convenience function for accessing vLLM's Chat Embeddings API,

View File

@ -1,21 +1,15 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import argparse
import asyncio
import enum
import os
from typing import TYPE_CHECKING, Any, Literal
from typing import Any, Literal
import openai
import pydantic
if TYPE_CHECKING:
from openai.types.chat import ChatCompletionChunk
from openai.types.chat import ChatCompletionChunk
ConstraintsFormat = Literal[
"choice",

View File

@ -84,12 +84,6 @@ ignore = [
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
"UP027",
"UP035",
"UP038",
"UP045",
]
[tool.ruff.format]
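
Dropping these ignore entries is what keeps the migration from regressing: the pyupgrade rules that were parked until Python 3.10+ became the floor now run on every lint pass, so reintroducing the old spellings gets flagged. As I read the ruff rule codes, UP007/UP045 cover `Union`/`Optional` annotations, UP035 covers imports such as `typing.Callable` that moved to `collections.abc`, and UP038 covers tuple arguments to `isinstance`; check the ruff docs for the authoritative scopes. A small sketch of code the now-active rules accept, with the flagged alternatives noted in comments (the `coerce` function is invented for illustration):

```python
from collections.abc import Callable  # `from typing import Callable` would now trip UP035

def coerce(value: int | str | None, parse: Callable[[str], int] | None = None) -> int:
    # Writing these as Optional[int], Union[int, str], or isinstance(value, (int, str))
    # would be flagged by the un-ignored UP rules.
    if value is None:
        return 0
    if isinstance(value, int | str):  # isinstance accepts PEP 604 unions on 3.10+
        return value if isinstance(value, int) else (parse or int)(value)
    raise TypeError(type(value))
```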

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Any, NamedTuple, Optional, cast
from typing import Any, NamedTuple, cast
import numpy as np
import pytest
@ -185,8 +185,8 @@ def _collect_mm_samples(
output_len: int = 5,
base_items_per_request: int = 2,
num_mm_items_range_ratio: float = 0.0,
limit_mm_per_prompt: Optional[dict[str, int]] = None,
bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
limit_mm_per_prompt: dict[str, int] | None = None,
bucket_config: dict[tuple[int, int, int], float] | None = None,
enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
if limit_mm_per_prompt is None:

View File

@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
"""
import os
from typing import TYPE_CHECKING, Any, Callable, Optional
from collections.abc import Callable
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
VLLM_CI_NO_SKIP: bool = False
VLLM_CI_DTYPE: Optional[str] = None
VLLM_CI_HEAD_DTYPE: Optional[str] = None
VLLM_CI_HF_DTYPE: Optional[str] = None
VLLM_CI_DTYPE: str | None = None
VLLM_CI_HEAD_DTYPE: str | None = None
VLLM_CI_HF_DTYPE: str | None = None
environment_variables: dict[str, Callable[[], Any]] = {
# A model family has many models with the same architecture.

View File

@ -2,9 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
from collections.abc import Sequence
from collections.abc import Callable, Sequence
from copy import deepcopy
from typing import Callable, Union
from torch import fx
from torch._ops import OpOverload
@ -44,7 +43,7 @@ class TestBackend:
Inductor config is default-initialized from VllmConfig.CompilationConfig.
"""
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
self.custom_passes = list(passes)
compile_config = get_current_vllm_config().compilation_config
self.inductor_config = compile_config.inductor_compile_config

View File

@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
"""
from dataclasses import dataclass
from typing import Any, Optional
from typing import Any
import pytest
import torch
@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
For tractable computation:
@ -217,7 +217,7 @@ class LlamaModel(nn.Module):
def forward(
self,
input_ids: Optional[torch.Tensor],
input_ids: torch.Tensor | None,
positions: torch.Tensor,
) -> torch.Tensor:
hidden_states = self.embedding_tokens(input_ids)

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import dataclasses
import pytest

View File

@ -1,11 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import logging
import tempfile
from typing import Any, Union
from typing import Any
import pytest
import torch
@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
def run_model(
compile_config: Union[int, CompilationConfig],
compile_config: int | CompilationConfig,
model: str,
model_kwargs: dict[str, Any],
):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Optional
import pytest
import torch._dynamo
@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8
# globals needed for string-import custom Dynamo backend field
backend: Optional[TestBackend] = None
backend_unfused: Optional[TestBackend] = None
backend: TestBackend | None = None
backend_unfused: TestBackend | None = None
class AttentionQuantPatternModel(torch.nn.Module):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import torch
@ -10,7 +9,7 @@ from vllm.config import CompilationLevel
class MyMod(torch.nn.Module):
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
if cache is not None:
return x + cache
return x * 2
@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
)
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# this is the function to be compiled
return self.model(x, cache)
def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1

View File

@ -21,7 +21,7 @@ import threading
from collections.abc import Generator
from contextlib import nullcontext
from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
from typing import Any, Callable, TypedDict, TypeVar, cast
import numpy as np
import pytest
@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M")
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
_PromptMultiModalInput = list[_M] | list[list[_M]]
PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
@ -267,7 +267,7 @@ class HfRunner:
return "cpu" if current_platform.is_cpu() else current_platform.device_type
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
def wrap_device(self, x: _T, device: str | None = None) -> _T:
if x is None or isinstance(x, (bool,)):
return x
@ -287,14 +287,14 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
model_kwargs: Optional[dict[str, Any]] = None,
model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
# Set this to avoid hanging issue
default_torch_num_threads: Optional[int] = None,
default_torch_num_threads: int | None = None,
) -> None:
init_ctx = (
nullcontext()
@ -319,7 +319,7 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
model_kwargs: Optional[dict[str, Any]] = None,
model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
@ -406,11 +406,11 @@ class HfRunner:
def get_inputs(
self,
prompts: Union[list[str], list[list[int]]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
prompts: list[str] | list[list[int]],
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
if images is not None:
assert len(prompts) == len(images)
@ -420,9 +420,7 @@ class HfRunner:
if audios is not None:
assert len(prompts) == len(audios)
all_inputs: list[
Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
] = []
all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
for i, prompt in enumerate(prompts):
if isinstance(prompt, str):
processor_kwargs: dict[str, Any] = {
@ -494,10 +492,10 @@ class HfRunner:
def generate(
self,
prompts: Union[list[str], list[list[int]]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
prompts: list[str] | list[list[int]],
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
all_inputs = self.get_inputs(
@ -522,11 +520,11 @@ class HfRunner:
def generate_greedy(
self,
prompts: Union[list[str], list[list[int]]],
prompts: list[str] | list[list[int]],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
outputs = self.generate(
@ -546,9 +544,9 @@ class HfRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
outputs = self.generate(
prompts,
@ -574,9 +572,9 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[list[torch.Tensor]]:
all_inputs = self.get_inputs(
@ -624,7 +622,7 @@ class HfRunner:
def _hidden_states_to_logprobs(
self,
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
num_logprobs: Optional[int],
num_logprobs: int | None,
) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states)
@ -652,10 +650,10 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
num_logprobs: Optional[int],
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
num_logprobs: int | None,
images: PromptImageInput | None = None,
audios: PromptAudioInput | None = None,
videos: PromptVideoInput | None = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs(
@ -734,20 +732,20 @@ class VllmRunner:
model_name: str,
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer_name: Optional[str] = None,
tokenizer_name: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
seed: Optional[int] = 0,
max_model_len: Optional[int] = 1024,
seed: int | None = 0,
max_model_len: int | None = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: Optional[bool] = False,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: Optional[int] = None,
default_torch_num_threads: int | None = None,
**kwargs,
) -> None:
init_ctx = (
@ -785,10 +783,10 @@ class VllmRunner:
def get_inputs(
self,
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
prompts: list[str] | list[torch.Tensor] | list[list[int]],
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
) -> list[dict[str, Any]]:
if any(
x is not None and len(x) != len(prompts) for x in [images, videos, audios]
@ -824,11 +822,11 @@ class VllmRunner:
def generate(
self,
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
prompts: list[str] | list[torch.Tensor] | list[list[int]],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@ -871,11 +869,11 @@ class VllmRunner:
self,
prompts: list[str],
sampling_params: SamplingParams,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
images: PromptImageInput | None = None,
audios: PromptAudioInput | None = None,
videos: PromptVideoInput | None = None,
**kwargs: Any,
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
req_outputs = self.llm.generate(
@ -894,11 +892,11 @@ class VllmRunner:
def generate_greedy(
self,
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
prompts: list[str] | list[torch.Tensor] | list[list[int]],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
@ -916,15 +914,15 @@ class VllmRunner:
self,
prompts: list[str],
max_tokens: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[list[int]] = None,
stop: Optional[list[str]] = None,
num_logprobs: int | None,
num_prompt_logprobs: int | None = None,
images: PromptImageInput | None = None,
audios: PromptAudioInput | None = None,
videos: PromptVideoInput | None = None,
stop_token_ids: list[int] | None = None,
stop: list[str] | None = None,
**kwargs: Any,
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
@ -957,7 +955,7 @@ class VllmRunner:
perplexities = []
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
token_datas = cast(list[dict[int, Logprob] | None], output[3])
assert token_datas[0] is None
token_log_probs = []
for token_data in token_datas[1:]:
@ -976,10 +974,10 @@ class VllmRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
concurrency_limit: Optional[int] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
concurrency_limit: int | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@ -1002,9 +1000,9 @@ class VllmRunner:
def embed(
self,
prompts: list[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
*args,
**kwargs,
) -> list[list[float]]:
@ -1023,8 +1021,8 @@ class VllmRunner:
def score(
self,
text_1: Union[str, list[str]],
text_2: Union[str, list[str]],
text_1: list[str] | str,
text_2: list[str] | str,
*args,
**kwargs,
) -> list[float]:
@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
class LocalAssetServer:
address: str
port: int
server: Optional[http.server.ThreadingHTTPServer]
thread: Optional[threading.Thread]
server: http.server.ThreadingHTTPServer | None
thread: threading.Thread | None
def __init__(self, address: str = "127.0.0.1") -> None:
self.address = address

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional
from typing import Any
import pytest
@ -15,8 +15,8 @@ def _test_stopping(
llm: LLM,
expected_output: str,
expected_reason: Any,
stop: Optional[list[str]] = None,
stop_token_ids: Optional[list[int]] = None,
stop: list[str] | None = None,
stop_token_ids: list[int] | None = None,
include_in_output: bool = False,
) -> None:
output = llm.generate(

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Optional, Union
import msgspec
import msgspec.msgpack
@ -78,8 +77,8 @@ class MockSubscriber:
def __init__(
self,
pub_endpoints: Union[str, list[str]],
replay_endpoints: Optional[Union[str, list[str]]] = None,
pub_endpoints: str | list[str],
replay_endpoints: str | list[str] | None = None,
topic: str = "",
decode_type=SampleBatch,
):
@ -111,7 +110,7 @@ class MockSubscriber:
self.last_seq = -1
self.decoder = msgspec.msgpack.Decoder(type=decode_type)
def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
"""Receive a single message with timeout"""
if not self.sub.poll(timeout):
return None

View File

@ -5,9 +5,8 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
from __future__ import annotations
from typing import Any, Callable
from collections.abc import Callable
from typing import Any
import pytest
import ray

View File

@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
from typing import Literal, NamedTuple
import pytest
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: Optional[str] = None
load_format: str | None = None
@dataclass
@ -54,7 +54,7 @@ class CPTestSettings:
dcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False]:

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
from typing import Literal, NamedTuple
import pytest
@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):
class EPTestOptions(NamedTuple):
trust_remote_code: bool
tokenizer_mode: Optional[str]
load_format: Optional[str] = None
hf_overrides: Optional[str] = None
tokenizer_mode: str | None
load_format: str | None = None
hf_overrides: str | None = None
@dataclass
@ -40,9 +40,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
tokenizer_mode: str | None = None,
load_format: str | None = None,
hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[
@ -72,9 +72,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
tokenizer_mode: str | None = None,
load_format: str | None = None,
hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[

View File

@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
from typing import Literal, NamedTuple
import pytest
@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):
class PPTestOptions(NamedTuple):
multi_node_only: bool
load_format: Optional[str] = None
load_format: str | None = None
@dataclass
@ -52,7 +52,7 @@ class PPTestSettings:
pp_base: int = 2,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[
@ -76,7 +76,7 @@ class PPTestSettings:
pp_base: int = 2,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[

View File

@ -1,16 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from typing_extensions import LiteralString
from ..utils import compare_two_settings, create_new_process_for_each_test
if TYPE_CHECKING:
from typing_extensions import LiteralString
@pytest.mark.parametrize(
"PP_SIZE, MODEL_NAME",

View File

@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
from typing import Literal, NamedTuple
import pytest
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
class SPTestOptions(NamedTuple):
multi_node_only: bool
load_format: Optional[str] = None
load_format: str | None = None
@dataclass
@ -53,7 +53,7 @@ class SPTestSettings:
pp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@ -84,7 +84,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@ -115,7 +115,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
load_format: str | None = None,
):
parallel_setups = []
for fusion_val in [False, True]:

View File

@ -5,7 +5,7 @@ import json
from argparse import ArgumentError
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Annotated, Literal, Optional, Union
from typing import Annotated, Literal
import pytest
@ -115,9 +115,9 @@ class NestedConfig:
class DummyConfig:
regular_bool: bool = True
"""Regular bool with default True"""
optional_bool: Optional[bool] = None
optional_bool: bool | None = None
"""Optional bool with default None"""
optional_literal: Optional[Literal["x", "y"]] = None
optional_literal: Literal["x", "y"] | None = None
"""Optional literal with default None"""
tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
"""Tuple with variable length"""
@ -127,7 +127,7 @@ class DummyConfig:
"""List with variable length"""
list_literal: list[Literal[1, 2]] = field(default_factory=list)
"""List with literal choices"""
list_union: list[Union[str, type[object]]] = field(default_factory=list)
list_union: list[str | type[object]] = field(default_factory=list)
"""List with union type"""
literal_literal: Literal[Literal[1], Literal[2]] = 1
"""Literal of literals with default 1"""
@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
("type_hint", "expected"),
[
(Annotated[int, "annotation"], {int}),
(Optional[int], {int, type(None)}),
(Annotated[Optional[int], "annotation"], {int, type(None)}),
(Optional[Annotated[int, "annotation"]], {int, type(None)}),
(int | None, {int, type(None)}),
(Annotated[int | None, "annotation"], {int, type(None)}),
(Annotated[int, "annotation"] | None, {int, type(None)}),
],
ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
)
def test_get_type_hints(type_hint, expected):
assert get_type_hints(type_hint) == expected

View File

@ -3,7 +3,7 @@
import asyncio
import random
from typing import Callable
from collections.abc import Callable
import openai
import pytest

View File

@ -3,7 +3,6 @@
# imports for structured outputs tests
import json
from typing import Optional
import jsonschema
import openai # use the official client for correctness check
@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
params: dict = {
"messages": [

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
from typing import Union
import openai # use the official client for correctness check
import pytest
@ -166,7 +165,7 @@ async def test_function_tool_use(
client: openai.AsyncOpenAI,
model_name: str,
stream: bool,
tool_choice: Union[str, dict],
tool_choice: str | dict,
enable_thinking: bool,
):
if not stream:

View File

@ -4,7 +4,6 @@
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Optional
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -38,13 +37,13 @@ class MockModelConfig:
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
tokenizer_revision: Optional[str] = None
tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
logits_processor_pattern: Optional[str] = None
diff_sampling_param: Optional[dict] = None
logits_processor_pattern: str | None = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: Optional[list[str]] = None
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
@ -56,7 +55,7 @@ class MockModelConfig:
class MockLoRAResolver(LoRAResolver):
async def resolve_lora(
self, base_model_name: str, lora_name: str
) -> Optional[LoRARequest]:
) -> LoRARequest | None:
if lora_name == "test-lora":
return LoRARequest(
lora_name="test-lora",

View File

@ -1,16 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import asyncio
from contextlib import suppress
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
import pytest_asyncio
from openai import OpenAI
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import RemoteOpenAIServer
if TYPE_CHECKING:
from openai import OpenAI
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Union
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@ -84,10 +83,10 @@ class StreamingToolReconstructor:
def run_tool_extraction(
tool_parser: ToolParser,
model_output: str,
request: Union[ChatCompletionRequest, None] = None,
request: ChatCompletionRequest | None = None,
streaming: bool = False,
assert_one_tool_per_delta: bool = True,
) -> tuple[Union[str, None], list[ToolCall]]:
) -> tuple[str | None, list[ToolCall]]:
if streaming:
reconstructor = run_tool_extraction_streaming(
tool_parser,
@ -105,7 +104,7 @@ def run_tool_extraction(
def run_tool_extraction_nonstreaming(
tool_parser: ToolParser,
model_output: str,
request: Union[ChatCompletionRequest, None] = None,
request: ChatCompletionRequest | None = None,
) -> ExtractedToolCallInformation:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return tool_parser.extract_tool_calls(model_output, request)
@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
def run_tool_extraction_streaming(
tool_parser: ToolParser,
model_deltas: Iterable[str],
request: Union[ChatCompletionRequest, None] = None,
request: ChatCompletionRequest | None = None,
assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model")

View File

@ -4,8 +4,6 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
from typing import Optional
import openai
import pytest
@ -103,14 +101,14 @@ async def test_matryoshka(
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
if model_info.is_matryoshka:
valid_dimensions: list[Optional[int]] = [None]
valid_dimensions: list[int | None] = [None]
if model_info.matryoshka_dimensions is not None:
valid_dimensions += model_info.matryoshka_dimensions[:2]
for dimensions in valid_dimensions:
await make_request_and_correctness_test(dimensions)
invalid_dimensions: list[Optional[int]] = [-1]
invalid_dimensions: list[int | None] = [-1]
if model_info.matryoshka_dimensions is not None:
assert 5 not in model_info.matryoshka_dimensions
invalid_dimensions.append(5)

View File

@ -5,7 +5,6 @@ import multiprocessing
import socket
import threading
import time
from typing import Optional
from unittest.mock import patch
import pytest
@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
assert len(manager.processes) == 3
# Create a result capture for the thread
result: dict[str, Optional[Exception]] = {"exception": None}
result: dict[str, Exception | None] = {"exception": None}
def run_with_exception_capture():
try:
@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
assert len(manager.processes) == 3
# Create a result capture for the thread
result: dict[str, Optional[Exception]] = {"exception": None}
result: dict[str, Exception | None] = {"exception": None}
def run_with_exception_capture():
try:

View File

@ -3,7 +3,7 @@
import warnings
from collections.abc import Mapping
from typing import Literal, Optional
from typing import Literal
import pytest
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
@ -152,9 +152,9 @@ def audio_url():
def _assert_mm_data_is_image_input(
mm_data: Optional[MultiModalDataDict],
mm_data: MultiModalDataDict | None,
image_count: int,
skipped_image_indices: Optional[list] = None,
skipped_image_indices: list | None = None,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"image"}
@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(
def _assert_mm_uuids(
mm_uuids: Optional[MultiModalUUIDDict],
mm_uuids: MultiModalUUIDDict | None,
media_count: int,
expected_uuids: list[Optional[str]],
expected_uuids: list[str | None],
modality: str = "image",
) -> None:
if len(expected_uuids) > 0:
@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]
def _assert_mm_data_inputs(
mm_data: Optional[MultiModalDataDict],
mm_data: MultiModalDataDict | None,
data_count: MultiModalDataCounts,
skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int]
skipped_media_indices: dict[str, list] | None = None, # modality -> list[int]
) -> None:
assert mm_data is not None
assert set(data_count.keys()) == (set(mm_data.keys()))

View File

@ -3,7 +3,6 @@
import io
from dataclasses import dataclass
from typing import Optional
from unittest.mock import AsyncMock, MagicMock
import pybase64
@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
@dataclass
class MockModelConfig:
max_model_len: int = 100
encoder_config: Optional[dict] = None
encoder_config: dict | None = None
class MockTokenizerResult:

View File

@ -12,7 +12,6 @@ import json
import os
import time
from collections.abc import Generator
from typing import Optional, Union
import aiohttp
import numpy as np
@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
INVALID = -9999999
def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
def download_and_cache_file(url: str, filename: str | None = None) -> str:
"""Download and cache a file from a URL."""
if filename is None:
filename = os.path.join("/tmp", url.split("/")[-1])
@ -81,9 +80,9 @@ async def call_vllm_api(
prompt: str,
temperature: float,
max_tokens: int,
stop: Optional[list[str]] = None,
url: Optional[str] = None,
seed: Optional[int] = None,
stop: list[str] | None = None,
url: str | None = None,
seed: int | None = None,
) -> str:
"""Call vLLM's OpenAI-compatible completions endpoint."""
data = {
@ -112,8 +111,8 @@ def evaluate_gsm8k(
host: str = "http://127.0.0.1",
port: int = 8000,
temperature: float = 0.0,
seed: Optional[int] = 42,
) -> dict[str, Union[float, int]]:
seed: int | None = 42,
) -> dict[str, float | int]:
"""
Evaluate GSM8K accuracy using vLLM serve endpoint.

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -27,8 +26,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
soft_cap: Optional[float] = None,
sliding_window: int | None = None,
soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
sliding_window: Optional[int],
sliding_window: int | None,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
num_blocks: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Optional
import pytest
import torch
@ -50,7 +49,7 @@ def ref_masked_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
attn_mask: Optional[torch.Tensor] = None,
attn_mask: torch.Tensor | None = None,
) -> torch.Tensor:
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
if attn_mask is not None:
@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
scale: float,
alibi_slopes: Optional[torch.Tensor],
alibi_slopes: torch.Tensor | None,
) -> None:
num_query_heads = query.shape[1]
num_kv_heads = value_cache.shape[1]
@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
alibi_bias: Optional[list[torch.Tensor]],
alibi_bias: list[torch.Tensor] | None,
dtype: torch.dtype,
) -> torch.Tensor:
num_seqs = len(cu_seq_lens) - 1

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -85,7 +84,7 @@ def test_cascade(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
num_blocks: int,
fa_version: int,
) -> None:

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import random
from typing import Optional
import pytest
import torch
@ -17,7 +16,7 @@ def cal_diff(
y: torch.Tensor,
name: str,
use_fp8: bool = False,
diff_threshold: Optional[float] = None,
diff_threshold: float | None = None,
) -> None:
x, y = x.double(), y.double()
cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -34,8 +33,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
soft_cap: Optional[float] = None,
sliding_window: int | None = None,
soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
num_blocks: int,
sliding_window: Optional[int],
sliding_window: int | None,
fa_version: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")
if not is_fa_version_supported(fa_version):
@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
sliding_window: Optional[int],
sliding_window: int | None,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
num_blocks: int,
fa_version: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")
if not is_fa_version_supported(fa_version):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import flashinfer
import pytest
@ -26,8 +25,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
soft_cap: Optional[float] = None,
sliding_window: int | None = None,
soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
sliding_window: Optional[int],
soft_cap: float | None,
sliding_window: int | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
sliding_window: Optional[int],
soft_cap: float | None,
sliding_window: int | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
) -> None:
pytest.skip("TODO: fix the accuracy issue")
torch.set_default_device("cuda")
@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
head_size: int,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
) -> None:
# test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda")

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import flashinfer
import pytest
@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
@torch.inference_mode
def test_flashinfer_trtllm_decode_with_baseline(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_lens: tuple[int, int],
num_heads: tuple[int, int],
@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
kv_layout: str,
block_size: int,
window_left: int,
soft_cap: Optional[float],
soft_cap: float | None,
has_sinks: bool,
) -> None:
torch.set_default_device("cuda")
@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
@torch.inference_mode
def test_flashinfer_trtllm_prefill_with_baseline(
dtype: torch.dtype,
quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_lens: tuple[int, int],
num_heads: tuple[int, int],
@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
kv_layout: str,
block_size: int,
window_left: int,
soft_cap: Optional[float],
soft_cap: float | None,
has_sinks: bool,
) -> None:
torch.set_default_device("cuda")

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -20,7 +19,7 @@ def merge_attn_states_torch(
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
output_lse: torch.Tensor | None = None, # [NUM_HEADS, NUM_TOKENS]
):
p_lse = prefix_lse
s_lse = suffix_lse

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -32,8 +31,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
soft_cap: Optional[float] = None,
sliding_window: int | None = None,
soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@ -98,12 +97,12 @@ def test_triton_unified_attn(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
sliding_window: Optional[int],
sliding_window: int | None,
dtype: torch.dtype,
block_size: int,
soft_cap: Optional[float],
soft_cap: float | None,
num_blocks: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional, Union
import pytest
import torch
@ -31,13 +30,13 @@ EPS = 1e-6
## Helpers
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
def ref_rms_norm(
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
) -> tuple[torch.Tensor, torch.Tensor | None]:
if residual is not None:
residual = residual.clone()
out, residual = rms_norm_layer.forward_native(x, residual)
@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
residual: torch.Tensor | None,
scale_ub: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
if scale_ub is not None:
assert quant_dtype == torch.float8_e4m3fn
@ -76,9 +75,9 @@ def ref_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
residual: torch.Tensor | None,
scale_ub: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
return ref_dynamic_per_token_quant(
rms_norm_layer, x, quant_dtype, residual, scale_ub
)
@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
residual: torch.Tensor | None,
scale_ub: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
if residual is not None:
residual = residual.clone()
out, scales = ops.rms_norm_dynamic_per_token_quant(
@ -103,9 +102,9 @@ def ops_impl(
weight: torch.Tensor,
x: torch.Tensor,
quant_dtype: torch.dtype,
residual: Optional[torch.Tensor],
scale_ub: Optional[torch.Tensor],
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
residual: torch.Tensor | None,
scale_ub: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)

View File

@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from itertools import product
from typing import Callable, Optional
import pytest
import torch
@ -68,7 +68,7 @@ def test_rotary_embedding(
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
rotary_dim: int | None,
dtype: torch.dtype,
seed: int,
device: str,

View File

@ -4,8 +4,6 @@
Tests for miscellaneous utilities
"""
from typing import Optional
import pytest
import torch
@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
rot,
positions: torch.Tensor,
query: torch.Tensor,
key: Optional[torch.Tensor] = None,
key: torch.Tensor | None = None,
):
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import pytest
import torch
@ -19,11 +18,11 @@ from vllm.platforms import current_platform
def causal_conv1d_ref(
x: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
initial_states: Optional[torch.Tensor] = None,
bias: torch.Tensor | None = None,
initial_states: torch.Tensor | None = None,
return_final_states: bool = False,
final_states_out: Optional[torch.Tensor] = None,
activation: Optional[str] = "silu",
final_states_out: torch.Tensor | None = None,
activation: str | None = "silu",
):
"""
x: (batch, dim, seqlen)
@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
def causal_conv1d_opcheck_fn(
x: torch.Tensor,
weight: torch.Tensor,
bias: Optional[torch.Tensor] = None,
cu_seq_len: Optional[torch.Tensor] = None,
cache_indices: Optional[torch.Tensor] = None,
has_initial_state: Optional[torch.Tensor] = None,
conv_states: Optional[torch.Tensor] = None,
activation: Optional[str] = "silu",
bias: torch.Tensor | None = None,
cu_seq_len: torch.Tensor | None = None,
cache_indices: torch.Tensor | None = None,
has_initial_state: torch.Tensor | None = None,
conv_states: torch.Tensor | None = None,
activation: str | None = "silu",
pad_slot_id: int = PAD_SLOT_ID,
):
"""

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Any, Optional, Union
from typing import Any
import torch
@ -35,7 +35,7 @@ from .mk_objects import (
from .parallel_utils import ProcessGroupInfo
def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
if t is None:
return f"{name} : None"
else:
@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
@dataclass
class Config:
Ms: Union[list[int], int]
Ms: list[int] | int
K: int
N: int
E: int
topks: Union[list[int], int]
topks: list[int] | int
dtype: torch.dtype
quant_config: Optional[TestMoEQuantConfig]
quant_config: TestMoEQuantConfig | None
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
fused_moe_chunk_size: Optional[int]
fused_moe_chunk_size: int | None
world_size: int
torch_trace_dir_path: Optional[str] = None
torch_trace_dir_path: str | None = None
def __post_init__(self):
if self.quant_config is None:
@ -93,7 +93,7 @@ class Config:
return self.Ms
@property
def quant_dtype(self) -> Union[torch.dtype, str, None]:
def quant_dtype(self) -> torch.dtype | str | None:
assert self.quant_config is not None
return self.quant_config.quant_dtype
@ -112,7 +112,7 @@ class Config:
return self.quant_config.per_out_ch_quant
@property
def quant_block_shape(self) -> Optional[list[int]]:
def quant_block_shape(self) -> list[int] | None:
assert self.quant_config is not None
return self.quant_config.block_shape
@ -209,7 +209,7 @@ class Config:
info = prepare_finalize_info(self.prepare_finalize_type)
return info.backend
def is_valid(self) -> tuple[bool, Optional[str]]:
def is_valid(self) -> tuple[bool, str | None]:
# Check prepare-finalize and fused-experts compatibility
if self.is_batched_prepare_finalize():
if not self.is_batched_fused_experts():
@ -280,10 +280,10 @@ class Config:
class WeightTensors:
w1: torch.Tensor
w2: torch.Tensor
w1_scale: Optional[torch.Tensor]
w2_scale: Optional[torch.Tensor]
w1_gs: Optional[torch.Tensor] = None
w2_gs: Optional[torch.Tensor] = None
w1_scale: torch.Tensor | None
w2_scale: torch.Tensor | None
w1_gs: torch.Tensor | None = None
w2_gs: torch.Tensor | None = None
def describe(self):
s = ""
@ -351,11 +351,11 @@ class WeightTensors:
@dataclass
class RankTensors:
hidden_states: torch.Tensor
hidden_states_scale: Optional[torch.Tensor]
hidden_states_scale: torch.Tensor | None
topk_weights: torch.Tensor
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor]
expert_map: torch.Tensor | None
def describe(self):
s = ""
@ -370,7 +370,7 @@ class RankTensors:
@staticmethod
def make_hidden_states(
config: Config,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""
Return hidden_states
"""

View File

@ -4,7 +4,6 @@
import copy
from enum import Enum
from itertools import product
from typing import Optional
import torch
from tqdm import tqdm
@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str):
import pandas as pd
def add_to_results(
config: Config, success: Result, results_df: Optional[pd.DataFrame] = None
config: Config, success: Result, results_df: pd.DataFrame | None = None
):
config_dict = asdict(config)
config_dict["prepare_finalize_type"] = config_dict[
@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str):
product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
)
results_df: Optional[pd.DataFrame] = None
results_df: pd.DataFrame | None = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations
):

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Optional, Union
import torch
@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
@dataclass
class TestMoEQuantConfig:
quant_dtype: Union[torch.dtype, str, None]
quant_dtype: torch.dtype | str | None
per_out_ch_quant: bool
per_act_token_quant: bool
block_shape: Optional[list[int]]
block_shape: list[int] | None
@dataclass
class PrepareFinalizeInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
backend: Optional[str]
backend: str | None
supports_apply_weight_on_input: bool = True
@dataclass
class ExpertInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
supports_chunking: bool
supports_expert_map: bool
@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []
standard_format = mk.FusedMoEActivationFormat.Standard
batched_format = mk.FusedMoEActivationFormat.BatchedExperts
common_float_types: list[Union[torch.dtype, str]] = [
common_float_types: list[torch.dtype | str] = [
torch.float8_e4m3fn,
torch.bfloat16,
torch.float16,
@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn]
def register_prepare_and_finalize(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
backend: Optional[str],
backend: str | None,
force_multigpu: bool = False,
supports_apply_weight_on_input: bool = True,
):
@ -121,7 +120,7 @@ def register_prepare_and_finalize(
def register_experts(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
supports_chunking: bool,
supports_expert_map: bool,
@ -340,7 +339,7 @@ if cutlass_fp4_supported():
supports_expert_map=False,
)
MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
None,
# per-channel / per-column weights and per-tensor activations
TestMoEQuantConfig(
@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
def make_prepare_finalize(
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
backend: Optional[str],
backend: str | None,
moe: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:

View File

@ -3,11 +3,12 @@
import dataclasses
import os
import traceback
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Concatenate
import torch
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import init_distributed_environment, initialize_model_parallel
@ -58,9 +59,9 @@ def _worker_parallel_launch(
world_local_size: int,
node_rank: int,
init_method: str,
worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None],
vllm_config: Optional[VllmConfig],
env_dict: Optional[dict],
worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None],
vllm_config: VllmConfig | None,
env_dict: dict | None,
*args: P.args,
**kwargs: P.kwargs,
) -> None:

View File

@ -2,8 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from collections.abc import Callable
from itertools import product
from typing import Any, Callable
from typing import Any
import torch

View File

@ -7,12 +7,13 @@ DeepEP test utilities
import dataclasses
import os
import traceback
from typing import Callable, Optional
from collections.abc import Callable
from typing import Concatenate
import torch
from torch.distributed import ProcessGroup
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec
from vllm.utils import get_open_port, has_deep_ep
@ -126,8 +127,8 @@ def make_deepep_ht_a2a(
pgi: ProcessGroupInfo,
dp_size: int,
ht_args: DeepEPHTArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep
@ -153,8 +154,8 @@ def make_deepep_ll_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
deepep_ll_args: DeepEPLLArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep
@ -185,10 +186,10 @@ def make_deepep_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
dp_size: int,
deepep_ht_args: Optional[DeepEPHTArgs],
deepep_ll_args: Optional[DeepEPLLArgs],
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
deepep_ht_args: DeepEPHTArgs | None,
deepep_ll_args: DeepEPLLArgs | None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
if deepep_ht_args is not None:
assert deepep_ll_args is None

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Optional
import pytest
import torch
@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192
@dataclass
class BatchedMMConfig:
in_dtype: torch.dtype
quant_dtype: Optional[torch.dtype]
quant_dtype: torch.dtype | None
out_dtype: torch.dtype
num_experts: int
max_tokens_per_expert: int
@ -115,7 +114,7 @@ def test_batched_mm(
K: int,
N: int,
dtype: torch.dtype,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
per_act_token_quant: bool,
):
current_platform.seed_everything(7)
@ -242,7 +241,7 @@ def test_fused_moe_batched_experts(
topk: int,
dtype: torch.dtype,
per_act_token_quant: bool,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
input_scales: bool,
):
current_platform.seed_everything(7)

View File

@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels
"""
import dataclasses
from typing import Optional
import pytest
import torch
@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
@dataclasses.dataclass
class TestTensors:
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor] = None
expert_map: torch.Tensor | None = None
def to_device(self, device: str):
self.topk_ids = self.topk_ids.to(device=device)

View File

@ -3,7 +3,6 @@
import copy
import dataclasses
from math import prod
from typing import Optional
import pytest
import torch
@ -85,16 +84,16 @@ class MOETensors:
@dataclasses.dataclass
class MOETensors8Bit(MOETensors):
# quantized
a_q: Optional[torch.Tensor] = None # a -> a_q
w1_q: Optional[torch.Tensor] = None # w1 -> w1_q
w2_q: Optional[torch.Tensor] = None # w2 -> w2_q
a_scale: Optional[torch.Tensor] = None
w1_scale: Optional[torch.Tensor] = None
w2_scale: Optional[torch.Tensor] = None
a_q: torch.Tensor | None = None # a -> a_q
w1_q: torch.Tensor | None = None # w1 -> w1_q
w2_q: torch.Tensor | None = None # w2 -> w2_q
a_scale: torch.Tensor | None = None
w1_scale: torch.Tensor | None = None
w2_scale: torch.Tensor | None = None
# dequantized
a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d
w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d
w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d
a_d: torch.Tensor | None = None # a -> a_q -> a_d
w1_d: torch.Tensor | None = None # w1 -> w1_q -> w1_d
w2_d: torch.Tensor | None = None # w2 -> w2_q -> w2_d
@staticmethod
def make_moe_tensors_8bit(
@ -209,7 +208,7 @@ def run_8_bit(
topk_ids: torch.Tensor,
per_act_token: bool,
per_out_ch: bool,
num_local_experts: Optional[int] = None,
num_local_experts: int | None = None,
) -> torch.Tensor:
assert not any(
[
@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool,
per_out_ch: bool,
monkeypatch,
ep_size: Optional[int] = None,
ep_size: int | None = None,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")

View File

@ -7,7 +7,6 @@ fp8 block-quantized case.
"""
import dataclasses
from typing import Optional
import pytest
import torch.distributed
@ -92,13 +91,13 @@ class TestConfig:
block_size: list[int]
# configs for testing low-latency kernels
low_latency: bool
use_fp8_dispatch: Optional[bool] = False
use_fp8_dispatch: bool | None = False
@dataclasses.dataclass
class TestTensors:
rank_tokens: torch.Tensor # all ranks make this many tokens
rank_token_scales: Optional[torch.Tensor]
rank_token_scales: torch.Tensor | None
topk: torch.Tensor
topk_weights: torch.Tensor
config: TestConfig
@ -143,7 +142,7 @@ def make_ll_modular_kernel(
max_tokens_per_rank: int,
dp_size: int,
hidden_size: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:
@ -179,7 +178,7 @@ def make_ht_modular_kernel(
pgi: ProcessGroupInfo,
dp_size: int,
num_local_experts: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:
@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl(
test_tensors: TestTensors,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: Optional[torch.Tensor],
w2_scale: Optional[torch.Tensor],
w1_scale: torch.Tensor | None,
w2_scale: torch.Tensor | None,
) -> torch.Tensor:
test_config = test_tensors.config
num_experts = test_config.num_experts

Some files were not shown because too many files have changed in this diff.
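
A minimal sketch, assuming Python 3.10 or newer and not part of the commit itself, showing that the PEP 604 spellings used throughout this diff are interchangeable with the typing constructs they replace:

# Not from the commit: a small standalone check of PEP 604 union spellings.
from typing import Optional, Union, get_args

def describe(x: int | None = None) -> str | int:
    # "int | None" is the PEP 604 spelling of Optional[int].
    return "empty" if x is None else x

# The old and new spellings compare equal on Python 3.10+,
# and typing introspection works the same on both.
assert Optional[int] == (int | None)
assert Union[str, int] == (str | int)
assert get_args(int | None) == (int, type(None))
print(describe(), describe(3))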