Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00

Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
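The commit migrates type annotations to the PEP 604 union syntax available on Python 3.10+. As a minimal sketch (a hypothetical function, not one taken from the diff), the old spelling

    from typing import Optional, Union

    def lookup(key: str, default: Optional[int] = None) -> Union[int, str]:
        ...

becomes

    def lookup(key: str, default: int | None = None) -> int | str:
        ...

and the typing import is no longer needed for the union itself. The hunks below apply the same mechanical rewrite throughout the touched files.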
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
 prompt_len: int
 output_len: int
 model: str
-model_name: Optional[str] = None
-logprobs: Optional[int] = None
-extra_body: Optional[dict] = None
-multi_modal_content: Optional[dict | list[dict]] = None
+model_name: str | None = None
+logprobs: int | None = None
+extra_body: dict | None = None
+multi_modal_content: dict | list[dict] | None = None
 ignore_eos: bool = False
-language: Optional[str] = None
-request_id: Optional[str] = None
+language: str | None = None
+request_id: str | None = None


 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 # Lazy import without PlaceholderModule to avoid vllm dep.
 import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
 tokenizer_mode: str = "auto",
 trust_remote_code: bool = False,
 **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
 if pretrained_model_name_or_path is not None and not os.path.exists(
 pretrained_model_name_or_path
 ):

@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import PreTrainedTokenizerBase

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[Request]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 prefix_len: int,
 ) -> list[Request]:
 requests = []

@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -24,7 +23,7 @@ def sample_requests(
 dataset_path: str,
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")

@@ -32,7 +32,6 @@ import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional

 import datasets
 import numpy as np
@@ -316,7 +315,7 @@ def calculate_metrics(
 tokenizer: PreTrainedTokenizerBase,
 selected_percentile_metrics: list[str],
 selected_percentiles: list[float],
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
 actual_output_lens: list[int] = []
 total_input = 0
@@ -436,9 +435,9 @@ async def benchmark(
 selected_percentile_metrics: list[str],
 selected_percentiles: list[str],
 ignore_eos: bool,
-max_concurrency: Optional[int],
+max_concurrency: int | None,
 structured_output_ratio: float,
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]

@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any


 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
 def __init__(self, scale: int) -> None:
 self.cnt: int = 0
 self._sum: int = 0
-self._max: Optional[int] = None
+self._max: int | None = None
 self.scale = scale
 self.start_time: int = time.monotonic_ns()

@@ -104,13 +104,13 @@ class TimeCollector:
 else:
 self._max = max(self._max, v)

-def avg(self) -> Union[float, str]:
+def avg(self) -> float | str:
 return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-def max(self) -> Union[float, str]:
+def max(self) -> float | str:
 return self._max / self.scale if self._max else "N/A"

-def dump_avg_max(self) -> list[Union[float, str]]:
+def dump_avg_max(self) -> list[float | str]:
 return [self.avg(), self.max()]

 def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:

 def __exit__(
 self,
-exc_type: Optional[type[BaseException]],
-exc_value: Optional[BaseException],
-exc_traceback: Optional[TracebackType],
+exc_type: type[BaseException] | None,
+exc_value: BaseException | None,
+exc_traceback: TracebackType | None,
 ) -> None:
 self.collect(time.monotonic_ns() - self.start_time)

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -53,7 +52,7 @@ def bench_int8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark INT8-based kernels."""
 assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark FP8-based kernels."""
 assert dtype == torch.float8_e4m3fn
@@ -183,7 +182,7 @@ def bench(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 if dtype == torch.int8:
 return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
 dtype: torch.dtype,
 MKNs: Iterable[tuple[int, int, int]],
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 results = []
 for m, k, n in MKNs:

@@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
 rms_norm_layer: RMSNorm, # this stores the weights
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 out, _ = ops.rms_norm_dynamic_per_token_quant(

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch

 import pandas as pd

@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
 def benchmark_allreduce_single(
 self,
 sequence_length: int,
-allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
 should_use_fn: Callable[[torch.Tensor], bool],
 context,
 num_warmup: int,
 num_trials: int,
-) -> Optional[float]:
+) -> float | None:
 """Benchmark method with CUDA graph optimization."""
 try:
 # Create test tensor (2D: sequence_length x hidden_size)

@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -158,7 +159,7 @@ def ref_group_gemm(
 seq_lens_cpu: torch.Tensor,
 prompt_lora_mapping_cpu: torch.Tensor,
 scaling: float,
-add_inputs: Optional[bool],
+add_inputs: bool | None,
 ):
 """
 Torch group gemm reference implementation to test correctness of
@@ -316,8 +317,8 @@ class BenchmarkContext:
 lora_rank: int
 sort_by_lora_id: bool
 dtype: torch.dtype
-seq_length: Optional[int] = None
-num_slices: Optional[int] = None # num_slices for slice based ops
+seq_length: int | None = None
+num_slices: int | None = None # num_slices for slice based ops

 def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
 ctx = copy.copy(self)
@@ -561,7 +562,7 @@ class BenchmarkTensors:
 }

 def bench_fn_kwargs(
-self, op_type: OpType, add_inputs: Optional[bool] = None
+self, op_type: OpType, add_inputs: bool | None = None
 ) -> dict[str, Any]:
 if op_type.is_shrink_fn():
 assert add_inputs is None
@@ -575,7 +576,7 @@ class BenchmarkTensors:
 raise ValueError(f"Unrecognized optype {self}")

 def test_correctness(
-self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+self, op_type: OpType, expand_fn_add_inputs: bool | None
 ) -> bool:
 """
 Test correctness of op_type implementation against a grouped gemm
@@ -611,8 +612,8 @@ def bench_optype(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
-expand_fn_add_inputs: Optional[bool] = None,
+cuda_graph_nops: int | None = None,
+expand_fn_add_inputs: bool | None = None,
 test_correctness: bool = False,
 ) -> TMeasurement:
 assert arg_pool_size >= 1
@@ -679,7 +680,7 @@ def bench_torch_mm(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
 """
 Benchmark basic torch.mm as a roofline.
@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
 """


-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
 compare = TBenchmark.Compare(timers)
 compare.print()

@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import pandas as pd
 import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
 a: torch.Tensor

 w_q: torch.Tensor
-group_size: Optional[int]
+group_size: int | None
 wtype: ScalarType
 w_g_s: torch.Tensor
-w_g_zp: Optional[torch.Tensor]
-w_ch_s: Optional[torch.Tensor]
-w_tok_s: Optional[torch.Tensor]
+w_g_zp: torch.Tensor | None
+w_ch_s: torch.Tensor | None
+w_tok_s: torch.Tensor | None


 @dataclass
 class TypeConfig:
 act_type: torch.dtype
 weight_type: ScalarType
-output_type: Optional[torch.dtype]
-group_scale_type: Optional[torch.dtype]
-group_zero_type: Optional[torch.dtype]
-channel_scale_type: Optional[torch.dtype]
-token_scale_type: Optional[torch.dtype]
+output_type: torch.dtype | None
+group_scale_type: torch.dtype | None
+group_zero_type: torch.dtype | None
+channel_scale_type: torch.dtype | None
+token_scale_type: torch.dtype | None


 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
 atype: torch.dtype,
 w: torch.Tensor,
 wtype: ScalarType,
-stype: Optional[torch.dtype],
-group_size: Optional[int],
+stype: torch.dtype | None,
+group_size: int | None,
 zero_points: bool = False,
 ):
 assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(


 def create_bench_tensors(
-shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
 m, n, k = shape

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
 return res


-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


 def bench(

@@ -3,7 +3,6 @@

 import random
 import time
-from typing import Optional

 import torch

@@ -37,7 +36,7 @@ def main(
 seed: int,
 do_profile: bool,
 device: str = "cuda",
-kv_cache_dtype: Optional[str] = None,
+kv_cache_dtype: str | None = None,
 ) -> None:
 current_platform.seed_everything(seed)

@@ -3,8 +3,8 @@

 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch

 import torch

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
 def forward(
 self,
 x: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+residual: torch.Tensor | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
 orig_dtype = x.dtype
 x = x.to(torch.float32)
 if residual is not None:
@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
 seq_len: int,
 num_heads: int,
 head_size: int,
-rotary_dim: Optional[int],
+rotary_dim: int | None,
 dtype: torch.dtype,
 seed: int,
 device: str,

@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),

@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),

@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:

 def __init__(
 self,
-cuda_graph_params: Optional[CudaGraphBenchParams],
+cuda_graph_params: CudaGraphBenchParams | None,
 label: str,
 sub_label: str,
 description: str,

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np # type: ignore
 import pandas as pd # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
 def __init__(
 self,
-min_val: Union[int, float],
-max_val: Union[int, float],
+min_val: int | float,
+max_val: int | float,
 is_integer: bool = True,
 ) -> None:
 self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):


 class ConstantDistribution(Distribution):
-def __init__(self, value: Union[int, float]) -> None:
+def __init__(self, value: int | float) -> None:
 self.value = value
 self.max_val = value

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):


 class ZipfDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):


 class PoissonDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val

@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
 def __init__(
 self,
-mean: Optional[float] = None,
-sigma: Optional[float] = None,
-average: Optional[int] = None,
-median_ratio: Optional[float] = None,
-max_val: Optional[int] = None,
+mean: float | None = None,
+sigma: float | None = None,
+average: int | None = None,
+median_ratio: float | None = None,
+max_val: int | None = None,
 ) -> None:
 self.average = average
 self.median_ratio = median_ratio

@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Union
+from typing import NamedTuple

 import aiohttp # type: ignore
 import numpy as np # type: ignore
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
 def __init__(self, logger: logging.Logger, window_size: int) -> None:
 self.logger = logger
-self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+self.metrics: dict[str, MovingAverage | MetricStats] = {
 "moving_avg_ttft_ms": MovingAverage(window_size),
 "moving_avg_tpot_ms": MovingAverage(window_size),
 "ttft_ms": MetricStats(),
@@ -636,7 +636,7 @@ async def client_main(

 if args.verbose:
 curr_time_sec: float = time.perf_counter()
-time_since_last_turn: Union[str, float] = "N/A"
+time_since_last_turn: str | float = "N/A"
 if conv_id in time_of_last_turn:
 time_since_last_turn = round(
 curr_time_sec - time_of_last_turn[conv_id], 3
@@ -928,13 +928,13 @@ async def main_mp(
 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
 )

-rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+rps: str | float = round(len(client_metrics) / runtime_sec, 3)
 if len(client_metrics) < (5 * bench_args.num_clients):
 # Do not estimate the RPS if the number of samples is very low
 # (threshold can be tuned if needed)
 rps = "N/A"

-runtime_left_sec: Union[str, float] = round(
+runtime_left_sec: str | float = round(
 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
 )
 if percent < 0.05:

@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any

 import pandas as pd # type: ignore
 import tqdm # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:


 def content_is_valid(
-content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
 if min_content_len and len(content) < min_content_len:
 return False
@@ -37,7 +37,7 @@ def content_is_valid(


 def print_stats(
-conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
 # Collect statistics
 stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
 seed: int,
 input_file: str,
 output_file: str,
-max_items: Optional[int],
-min_content_len: Optional[int] = None,
-max_content_len: Optional[int] = None,
-min_turns: Optional[int] = None,
-max_turns: Optional[int] = None,
-model: Optional[str] = None,
+max_items: int | None,
+min_content_len: int | None = None,
+max_content_len: int | None = None,
+min_turns: int | None = None,
+max_turns: int | None = None,
+model: str | None = None,
 ) -> None:
 if min_turns and max_turns:
 assert min_turns <= max_turns

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import enum
-from typing import Union

 from cutlass_library import *

@@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
 TmaWarpSpecializedCooperative = enum_auto()


-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
 **DataTypeNames, # type: ignore
 **{
 VLLMDataType.u4b8: "u4b8",
@@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
 },
 }

-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
 **DataTypeTag, # type: ignore
 **{
 VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 },
 }

-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
 **DataTypeSize, # type: ignore
 **{
 VLLMDataType.u4b8: 4,
@@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
 },
 }

-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
 VLLMDataType.u4b8: "vllm::kU4B8",
 VLLMDataType.u8b128: "vllm::kU8B128",
 DataType.u4: "vllm::kU4",
@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.bf16: "vllm::kBfloat16",
 }

-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
 DataType.u8: "at::ScalarType::Byte",
 DataType.s8: "at::ScalarType::Char",
 DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: dict[
-Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
 **KernelScheduleTag, # type: ignore
 **{
 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union

 import jinja2
 from vllm_cutlass_library_extension import (
@@ -259,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
 a: DataType
-b: Union[DataType, VLLMDataType]
+b: DataType | VLLMDataType
 b_group_scale: DataType
 b_group_zeropoint: DataType
 b_channel_scale: DataType
@@ -280,7 +279,7 @@ class PrepackTypeConfig:
 class ImplConfig:
 types: TypeConfig
 schedules: list[ScheduleConfig]
-heuristic: list[tuple[Optional[str], ScheduleConfig]]
+heuristic: list[tuple[str | None, ScheduleConfig]]


 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

@@ -16,7 +16,7 @@ Declare supported languages and capabilities:

 ??? code "supported_languages and supports_transcription_only"
 ```python
-from typing import ClassVar, Mapping, Optional, Literal
+from typing import ClassVar, Mapping, Literal
 import numpy as np
 import torch
 from torch import nn
@@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 # Example with a free-form instruction prompt
 task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 if language is None:
 raise ValueError("Language must be specified")
@@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
 ??? code "validate_language()"
 ```python
 @classmethod
-def validate_language(cls, language: Optional[str]) -> Optional[str]:
+def validate_language(cls, language: str | None) -> str | None:
 if language is None:
 logger.warning(
 "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
 audio_duration_s: float,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-) -> Optional[int]:
+) -> int | None:
 # Return None if unknown; otherwise return an estimate.
 return int(audio_duration_s * stt_config.sample_rate // 320) # example
 ```

@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 import torch

@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 @abstractmethod
 def update_state(
 self,
-batch_update: Optional["BatchUpdate"],
+batch_update: "BatchUpdate" | None,
 ) -> None:
 """Called when there are new output tokens, prior
 to each forward pass.
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
 * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
 * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling

-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
 * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
 * Use the `BatchUpdate` members to update logits processor internal state
 * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.

@@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"

 ``` python
-from typing import Optional
 import torch
 from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
@@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
 """Never impacts greedy sampling"""
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 if not batch_update:
 return

@@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,11 +30,11 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
-prompt_token_ids: Optional[dict[str, list[int]]] = None
-multi_modal_data: Optional[dict[str, Any]] = None
-stop_token_ids: Optional[list[int]] = None
-lora_requests: Optional[list[LoRARequest]] = None
+prompt: str | None = None
+prompt_token_ids: dict[str, list[int]] | None = None
+multi_modal_data: dict[str, Any] | None = None
+stop_token_ids: list[int] | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):

 def get_finished(
 self, finished_req_ids: set[str]
-) -> tuple[Optional[set[str]], Optional[set[str]]]:
+) -> tuple[set[str] | None, set[str] | None]:
 if self._async_load:
 meta = self._get_connector_metadata()
 assert isinstance(meta, RogueSharedStorageConnectorMetadata)

@@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
 def is_argmax_invariant(self) -> bool:
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 process_dict_updates(
 self.req_info,
 batch_update,

@@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Any, Optional
+from typing import Any

 import torch

@@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 Returns:
 `Callable` request logits processor, or None
 """
-target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+target_token: Any | None = params.extra_args and params.extra_args.get(
 "target_token"
 )
 if target_token is None:

@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
|
||||
device, the first and third requests would not repeat the same token.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
|
||||
def new_req_logits_processor(
|
||||
self,
|
||||
params: SamplingParams,
|
||||
) -> Optional[RequestLogitsProcessor]:
|
||||
) -> RequestLogitsProcessor | None:
|
||||
"""This method returns a new request-level logits processor, customized
|
||||
to the `target_token` value associated with a particular request.
|
||||
|
||||
|
@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 return [
 # this is an example of using quantization without LoRA
 (
@@ -56,7 +55,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -78,7 +77,7 @@ def process_requests(


 def initialize_engine(
-model: str, quantization: str, lora_repo: Optional[str]
+model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
 """Initialize the LLMEngine."""

@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import Optional
-
 from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 """Create a list of test prompts with their sampling parameters.

 2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0

@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union

 import albumentations
 import numpy as np
@@ -160,7 +159,7 @@ def load_example(
 file_paths: list[str],
 mean: list[float] = None,
 std: list[float] = None,
-indices: Union[list[int], None] = None,
+indices: list[int] | None = None,
 ):
 """Build an input example by loading images in *file_paths*.

@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict

 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:


 def rebuild_ipc(
-handle: tuple[Callable, tuple], device_id: Optional[int] = None
+handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
 func, args = handle
 list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
 self._zmq_ctx = zmq.Context()
 socket = self._zmq_ctx.socket(zmq.REP)
 socket.connect(zmq_handles[self.report_device_id()])
-buffer: Optional[torch.Tensor] = None
+buffer: torch.Tensor | None = None
 while True:
 payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
 socket.recv_pyobj()

@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompts: list[str]
-stop_token_ids: Optional[list[int]] = None
-lora_requests: Optional[list[LoRARequest]] = None
+stop_token_ids: list[int] | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompt: str
 image_data: list[Image]
-stop_token_ids: Optional[list[int]] = None
-chat_template: Optional[str] = None
-lora_requests: Optional[list[LoRARequest]] = None
+stop_token_ids: list[int] | None = None
+chat_template: str | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -1251,7 +1251,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
 print("-" * 50)


-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 # Disable other modalities to save memory

@@ -11,7 +11,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
 from pathlib import Path
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args

 from PIL.Image import Image

@@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):


 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery


 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
-image: Optional[Image] = None
-query: Optional[str] = None
-documents: Optional[ScoreMultiModalParam] = None
+prompt: str | None = None
+image: Image | None = None
+query: str | None = None
+documents: ScoreMultiModalParam | None = None


 def run_clip(query: Query) -> ModelRequestData:
@@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
 raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
 print("-" * 50)


-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable

 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
 decode_instances: list[str],
 model: str,
 scheduling_policy: SchedulingPolicy,
-custom_create_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
-custom_create_chat_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
+custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+custom_create_chat_completion: Callable[[Request], StreamingResponse]
+| None = None,
 ):
 self.prefill_instances = prefill_instances
 self.decode_instances = decode_instances
@@ -348,9 +345,9 @@ class ProxyServer:
 def __init__(
 self,
 args: argparse.Namespace,
-scheduling_policy: Optional[SchedulingPolicy] = None,
-create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
-create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+scheduling_policy: SchedulingPolicy | None = None,
+create_completion: Callable[[Request], StreamingResponse] | None = None,
+create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
 ):
 self.validate_parsed_serve_args(args)
 self.port = args.port

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any

 import msgspec
 import zmq
@@ -25,16 +25,16 @@ class KVCacheEvent(

 class BlockStored(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-parent_block_hash: Optional[ExternalBlockHash]
+parent_block_hash: ExternalBlockHash | None
 token_ids: list[int]
 block_size: int
-lora_id: Optional[int]
-medium: Optional[str]
+lora_id: int | None
+medium: str | None


 class BlockRemoved(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-medium: Optional[str]
+medium: str | None


 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):


 class KVEventBatch(EventBatch):
-events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+events: list[BlockStored | BlockRemoved | AllBlocksCleared]


 def process_event(event_batch):

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -43,7 +42,7 @@ async def main():
 )

 prompt = "Who won the 2004 World Series?"
-final_output: Optional[RequestOutput] = None
+final_output: RequestOutput | None = None
 async for output in engine_client.generate(
 prompt=prompt,
 sampling_params=sampling_params,

@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """

-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2

@@ -25,7 +23,7 @@ documents = [


 def cohere_rerank(
-client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
 return client.rerank(model=model, query=query, documents=documents)

@@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
 import argparse
 import base64
 import io
-from typing import Literal, Union
+from typing import Literal

 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
@@ -29,7 +29,7 @@ def create_chat_embeddings(
 *,
 messages: list[ChatCompletionMessageParam],
 model: str,
-encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
 ) -> CreateEmbeddingResponse:
 """
 Convenience function for accessing vLLM's Chat Embeddings API,

@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal

 import openai
 import pydantic

-if TYPE_CHECKING:
-from openai.types.chat import ChatCompletionChunk
-
+from openai.types.chat import ChatCompletionChunk

 ConstraintsFormat = Literal[
 "choice",

@@ -84,12 +84,6 @@ ignore = [
 "B007",
 # f-string format
 "UP032",
-# Can remove once 3.10+ is the minimum Python version
-"UP007",
-"UP027",
-"UP035",
-"UP038",
-"UP045",
 ]

 [tool.ruff.format]
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast

 import numpy as np
 import pytest
@@ -185,8 +185,8 @@ def _collect_mm_samples(
 output_len: int = 5,
 base_items_per_request: int = 2,
 num_mm_items_range_ratio: float = 0.0,
-limit_mm_per_prompt: Optional[dict[str, int]] = None,
-bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+limit_mm_per_prompt: dict[str, int] | None = None,
+bucket_config: dict[tuple[int, int, int], float] | None = None,
 enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
 if limit_mm_per_prompt is None:

@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
"""

import os
from typing import TYPE_CHECKING, Any, Callable, Optional
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
VLLM_CI_NO_SKIP: bool = False
VLLM_CI_DTYPE: Optional[str] = None
VLLM_CI_HEAD_DTYPE: Optional[str] = None
VLLM_CI_HF_DTYPE: Optional[str] = None
VLLM_CI_DTYPE: str | None = None
VLLM_CI_HEAD_DTYPE: str | None = None
VLLM_CI_HF_DTYPE: str | None = None

environment_variables: dict[str, Callable[[], Any]] = {
# A model family has many models with the same architecture.
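Editor's note (not part of the patch): the file above appears to follow the lazy environment-variable pattern, where names are declared for type checkers under `if TYPE_CHECKING:` and computed on first access at runtime via a module-level `__getattr__` (PEP 562). A minimal, self-contained sketch of that pattern with hypothetical `DEMO_*` names:

import os
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Type checkers see ordinary typed module attributes.
    DEMO_CI_DTYPE: str | None = None
    DEMO_CI_NO_SKIP: bool = False

# At runtime each attribute is computed lazily from the environment.
environment_variables: dict[str, Callable[[], Any]] = {
    "DEMO_CI_DTYPE": lambda: os.getenv("DEMO_CI_DTYPE"),
    "DEMO_CI_NO_SKIP": lambda: os.getenv("DEMO_CI_NO_SKIP", "0") == "1",
}

def __getattr__(name: str) -> Any:
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")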
@ -2,9 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
from collections.abc import Sequence
|
||||
from collections.abc import Callable, Sequence
|
||||
from copy import deepcopy
|
||||
from typing import Callable, Union
|
||||
|
||||
from torch import fx
|
||||
from torch._ops import OpOverload
|
||||
@ -44,7 +43,7 @@ class TestBackend:
|
||||
Inductor config is default-initialized from VllmConfig.CompilationConfig.
|
||||
"""
|
||||
|
||||
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
|
||||
def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
|
||||
self.custom_passes = list(passes)
|
||||
compile_config = get_current_vllm_config().compilation_config
|
||||
self.inductor_config = compile_config.inductor_compile_config
|
||||
|
@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
For tractable computation:
|
||||
@ -217,7 +217,7 @@ class LlamaModel(nn.Module):
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor],
|
||||
input_ids: torch.Tensor | None,
|
||||
positions: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.embedding_tokens(input_ids)
|
||||
|
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
|
||||
import pytest
|
||||
|
@ -1,11 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Any, Union
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
|
||||
|
||||
def run_model(
|
||||
compile_config: Union[int, CompilationConfig],
|
||||
compile_config: int | CompilationConfig,
|
||||
model: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch._dynamo
|
||||
@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
|
||||
FP4_DTYPE = torch.uint8
|
||||
|
||||
# globals needed for string-import custom Dynamo backend field
|
||||
backend: Optional[TestBackend] = None
|
||||
backend_unfused: Optional[TestBackend] = None
|
||||
backend: TestBackend | None = None
|
||||
backend_unfused: TestBackend | None = None
|
||||
|
||||
|
||||
class AttentionQuantPatternModel(torch.nn.Module):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@ -10,7 +9,7 @@ from vllm.config import CompilationLevel
|
||||
|
||||
|
||||
class MyMod(torch.nn.Module):
|
||||
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
if cache is not None:
|
||||
return x + cache
|
||||
return x * 2
|
||||
@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# this is the function to be compiled
|
||||
return self.model(x, cache)
|
||||
|
||||
def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# let torch.compile compile twice
|
||||
if len(self.compiled_codes) == 2:
|
||||
dispatch_id = 0 if cache is None else 1
|
||||
|
@ -21,7 +21,7 @@ import threading
|
||||
from collections.abc import Generator
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
from typing import Any, Callable, TypedDict, TypeVar, cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
|
||||
|
||||
_M = TypeVar("_M")
|
||||
|
||||
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
|
||||
_PromptMultiModalInput = list[_M] | list[list[_M]]
|
||||
|
||||
PromptImageInput = _PromptMultiModalInput[Image.Image]
|
||||
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
|
||||
@ -267,7 +267,7 @@ class HfRunner:
|
||||
|
||||
return "cpu" if current_platform.is_cpu() else current_platform.device_type
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
def wrap_device(self, x: _T, device: str | None = None) -> _T:
|
||||
if x is None or isinstance(x, (bool,)):
|
||||
return x
|
||||
|
||||
@ -287,14 +287,14 @@ class HfRunner:
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
model_kwargs: dict[str, Any] | None = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
default_torch_num_threads: int | None = None,
|
||||
) -> None:
|
||||
init_ctx = (
|
||||
nullcontext()
|
||||
@ -319,7 +319,7 @@ class HfRunner:
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
model_kwargs: dict[str, Any] | None = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
@ -406,11 +406,11 @@ class HfRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
|
||||
prompts: list[str] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
|
||||
if images is not None:
|
||||
assert len(prompts) == len(images)
|
||||
|
||||
@ -420,9 +420,7 @@ class HfRunner:
|
||||
if audios is not None:
|
||||
assert len(prompts) == len(audios)
|
||||
|
||||
all_inputs: list[
|
||||
Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
|
||||
] = []
|
||||
all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
|
||||
for i, prompt in enumerate(prompts):
|
||||
if isinstance(prompt, str):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
@ -494,10 +492,10 @@ class HfRunner:
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
prompts: list[str] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -522,11 +520,11 @@ class HfRunner:
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
prompts: list[str] | list[list[int]],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[int], str]]:
|
||||
outputs = self.generate(
|
||||
@ -546,9 +544,9 @@ class HfRunner:
|
||||
prompts: list[str],
|
||||
beam_width: int,
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
outputs = self.generate(
|
||||
prompts,
|
||||
@ -574,9 +572,9 @@ class HfRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[list[torch.Tensor]]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -624,7 +622,7 @@ class HfRunner:
|
||||
def _hidden_states_to_logprobs(
|
||||
self,
|
||||
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
|
||||
num_logprobs: Optional[int],
|
||||
num_logprobs: int | None,
|
||||
) -> tuple[list[dict[int, float]], int]:
|
||||
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
|
||||
output_len = len(hidden_states)
|
||||
@ -652,10 +650,10 @@ class HfRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
num_logprobs: int | None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -734,20 +732,20 @@ class VllmRunner:
|
||||
model_name: str,
|
||||
runner: RunnerOption = "auto",
|
||||
convert: ConvertOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
tokenizer_name: str | None = None,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = True,
|
||||
seed: Optional[int] = 0,
|
||||
max_model_len: Optional[int] = 1024,
|
||||
seed: int | None = 0,
|
||||
max_model_len: int | None = 1024,
|
||||
dtype: str = "auto",
|
||||
disable_log_stats: bool = True,
|
||||
tensor_parallel_size: int = 1,
|
||||
block_size: int = 16 if not torch.xpu.is_available() else 64,
|
||||
enable_chunked_prefill: Optional[bool] = False,
|
||||
enable_chunked_prefill: bool | None = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: Optional[bool] = False,
|
||||
enforce_eager: bool | None = False,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
default_torch_num_threads: int | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_ctx = (
|
||||
@ -785,10 +783,10 @@ class VllmRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
if any(
|
||||
x is not None and len(x) != len(prompts) for x in [images, videos, audios]
|
||||
@ -824,11 +822,11 @@ class VllmRunner:
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
sampling_params: SamplingParams,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
@ -871,11 +869,11 @@ class VllmRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
sampling_params: SamplingParams,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
|
||||
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
|
||||
req_outputs = self.llm.generate(
|
||||
@ -894,11 +892,11 @@ class VllmRunner:
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[int], str]]:
|
||||
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
||||
@ -916,15 +914,15 @@ class VllmRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
stop: Optional[list[str]] = None,
|
||||
num_logprobs: int | None,
|
||||
num_prompt_logprobs: int | None = None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
stop: list[str] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
|
||||
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
|
||||
greedy_logprobs_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=max_tokens,
|
||||
@ -957,7 +955,7 @@ class VllmRunner:
|
||||
perplexities = []
|
||||
for output in outputs:
|
||||
output = cast(TokensTextLogprobsPromptLogprobs, output)
|
||||
token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
|
||||
token_datas = cast(list[dict[int, Logprob] | None], output[3])
|
||||
assert token_datas[0] is None
|
||||
token_log_probs = []
|
||||
for token_data in token_datas[1:]:
|
||||
@ -976,10 +974,10 @@ class VllmRunner:
|
||||
prompts: list[str],
|
||||
beam_width: int,
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
concurrency_limit: Optional[int] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
concurrency_limit: int | None = None,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
|
||||
@ -1002,9 +1000,9 @@ class VllmRunner:
|
||||
def embed(
|
||||
self,
|
||||
prompts: list[str],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> list[list[float]]:
|
||||
@ -1023,8 +1021,8 @@ class VllmRunner:
|
||||
|
||||
def score(
|
||||
self,
|
||||
text_1: Union[str, list[str]],
|
||||
text_2: Union[str, list[str]],
|
||||
text_1: list[str] | str,
|
||||
text_2: list[str] | str,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> list[float]:
|
||||
@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
|
||||
class LocalAssetServer:
|
||||
address: str
|
||||
port: int
|
||||
server: Optional[http.server.ThreadingHTTPServer]
|
||||
thread: Optional[threading.Thread]
|
||||
server: http.server.ThreadingHTTPServer | None
|
||||
thread: threading.Thread | None
|
||||
|
||||
def __init__(self, address: str = "127.0.0.1") -> None:
|
||||
self.address = address
|
||||
|
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
@ -15,8 +15,8 @@ def _test_stopping(
|
||||
llm: LLM,
|
||||
expected_output: str,
|
||||
expected_reason: Any,
|
||||
stop: Optional[list[str]] = None,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
stop: list[str] | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
include_in_output: bool = False,
|
||||
) -> None:
|
||||
output = llm.generate(
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
from typing import Optional, Union
|
||||
|
||||
import msgspec
|
||||
import msgspec.msgpack
|
||||
@ -78,8 +77,8 @@ class MockSubscriber:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pub_endpoints: Union[str, list[str]],
|
||||
replay_endpoints: Optional[Union[str, list[str]]] = None,
|
||||
pub_endpoints: str | list[str],
|
||||
replay_endpoints: str | list[str] | None = None,
|
||||
topic: str = "",
|
||||
decode_type=SampleBatch,
|
||||
):
|
||||
@ -111,7 +110,7 @@ class MockSubscriber:
|
||||
self.last_seq = -1
|
||||
self.decoder = msgspec.msgpack.Decoder(type=decode_type)
|
||||
|
||||
def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
|
||||
def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
|
||||
"""Receive a single message with timeout"""
|
||||
if not self.sub.poll(timeout):
|
||||
return None
|
||||
|
@ -5,9 +5,8 @@
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class CPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -54,7 +54,7 @@ class CPTestSettings:
|
||||
dcp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False]:
|
||||
|
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class EPTestOptions(NamedTuple):
|
||||
trust_remote_code: bool
|
||||
tokenizer_mode: Optional[str]
|
||||
load_format: Optional[str] = None
|
||||
hf_overrides: Optional[str] = None
|
||||
tokenizer_mode: str | None
|
||||
load_format: str | None = None
|
||||
hf_overrides: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -40,9 +40,9 @@ class EPTestSettings:
|
||||
tp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
tokenizer_mode: str | None = None,
|
||||
load_format: str | None = None,
|
||||
hf_overrides: str | None = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
@ -72,9 +72,9 @@ class EPTestSettings:
|
||||
tp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
tokenizer_mode: str | None = None,
|
||||
load_format: str | None = None,
|
||||
hf_overrides: str | None = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class PPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -52,7 +52,7 @@ class PPTestSettings:
|
||||
pp_base: int = 2,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
@ -76,7 +76,7 @@ class PPTestSettings:
|
||||
pp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
|
@ -1,16 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"PP_SIZE, MODEL_NAME",
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class SPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -53,7 +53,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False, True]:
|
||||
@ -84,7 +84,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False, True]:
|
||||
@ -115,7 +115,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for fusion_val in [False, True]:
|
||||
|
@ -5,7 +5,7 @@ import json
|
||||
from argparse import ArgumentError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
from typing import Annotated, Literal
|
||||
|
||||
import pytest
|
||||
|
||||
@ -115,9 +115,9 @@ class NestedConfig:
|
||||
class DummyConfig:
|
||||
regular_bool: bool = True
|
||||
"""Regular bool with default True"""
|
||||
optional_bool: Optional[bool] = None
|
||||
optional_bool: bool | None = None
|
||||
"""Optional bool with default None"""
|
||||
optional_literal: Optional[Literal["x", "y"]] = None
|
||||
optional_literal: Literal["x", "y"] | None = None
|
||||
"""Optional literal with default None"""
|
||||
tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
|
||||
"""Tuple with variable length"""
|
||||
@ -127,7 +127,7 @@ class DummyConfig:
|
||||
"""List with variable length"""
|
||||
list_literal: list[Literal[1, 2]] = field(default_factory=list)
|
||||
"""List with literal choices"""
|
||||
list_union: list[Union[str, type[object]]] = field(default_factory=list)
|
||||
list_union: list[str | type[object]] = field(default_factory=list)
|
||||
"""List with union type"""
|
||||
literal_literal: Literal[Literal[1], Literal[2]] = 1
|
||||
"""Literal of literals with default 1"""
|
||||
@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
("type_hint", "expected"),
[
(Annotated[int, "annotation"], {int}),
(Optional[int], {int, type(None)}),
(Annotated[Optional[int], "annotation"], {int, type(None)}),
(Optional[Annotated[int, "annotation"]], {int, type(None)}),
(int | None, {int, type(None)}),
(Annotated[int | None, "annotation"], {int, type(None)}),
(Annotated[int, "annotation"] | None, {int, type(None)}),
],
ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
)
def test_get_type_hints(type_hint, expected):
assert get_type_hints(type_hint) == expected
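Editor's note (not part of the patch): the parametrize ids above move from "Optional" to "or_None" because the hints are now written with `|`. On Python 3.10+ a PEP 604 union is a `types.UnionType` rather than a `typing.Union`, but it decomposes to the same arguments, which is consistent with the expected sets in the test staying unchanged. A quick check of that assumption:

import types
from typing import Annotated, Optional, get_args, get_origin

assert get_origin(int | None) is types.UnionType
assert get_args(int | None) == (int, type(None)) == get_args(Optional[int])
assert get_args(Annotated[int, "annotation"]) == (int, "annotation")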
@ -3,7 +3,7 @@
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from typing import Callable
|
||||
from collections.abc import Callable
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
|
||||
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
|
||||
)
|
||||
async def test_prompt_logprobs_chat(
|
||||
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
|
||||
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
|
||||
):
|
||||
params: dict = {
|
||||
"messages": [
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
from typing import Union
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
@ -166,7 +165,7 @@ async def test_function_tool_use(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
stream: bool,
|
||||
tool_choice: Union[str, dict],
|
||||
tool_choice: str | dict,
|
||||
enable_thinking: bool,
|
||||
):
|
||||
if not stream:
|
||||
|
@ -4,7 +4,6 @@
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from typing import Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
@ -38,13 +37,13 @@ class MockModelConfig:
|
||||
trust_remote_code: bool = False
|
||||
tokenizer_mode: str = "auto"
|
||||
max_model_len: int = 100
|
||||
tokenizer_revision: Optional[str] = None
|
||||
tokenizer_revision: str | None = None
|
||||
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
|
||||
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
|
||||
logits_processor_pattern: Optional[str] = None
|
||||
diff_sampling_param: Optional[dict] = None
|
||||
logits_processor_pattern: str | None = None
|
||||
diff_sampling_param: dict | None = None
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: Optional[list[str]] = None
|
||||
allowed_media_domains: list[str] | None = None
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
@ -56,7 +55,7 @@ class MockModelConfig:
|
||||
class MockLoRAResolver(LoRAResolver):
|
||||
async def resolve_lora(
|
||||
self, base_model_name: str, lora_name: str
|
||||
) -> Optional[LoRARequest]:
|
||||
) -> LoRARequest | None:
|
||||
if lora_name == "test-lora":
|
||||
return LoRARequest(
|
||||
lora_name="test-lora",
|
||||
|
@ -1,16 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from openai import OpenAI
|
||||
|
||||
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Iterable
|
||||
from typing import Union
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
@ -84,10 +83,10 @@ class StreamingToolReconstructor:
|
||||
def run_tool_extraction(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
streaming: bool = False,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> tuple[Union[str, None], list[ToolCall]]:
|
||||
) -> tuple[str | None, list[ToolCall]]:
|
||||
if streaming:
|
||||
reconstructor = run_tool_extraction_streaming(
|
||||
tool_parser,
|
||||
@ -105,7 +104,7 @@ def run_tool_extraction(
|
||||
def run_tool_extraction_nonstreaming(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
) -> ExtractedToolCallInformation:
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
return tool_parser.extract_tool_calls(model_output, request)
|
||||
@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
|
||||
def run_tool_extraction_streaming(
|
||||
tool_parser: ToolParser,
|
||||
model_deltas: Iterable[str],
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> StreamingToolReconstructor:
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
|
@ -4,8 +4,6 @@
|
||||
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
@ -103,14 +101,14 @@ async def test_matryoshka(
|
||||
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
|
||||
|
||||
if model_info.is_matryoshka:
|
||||
valid_dimensions: list[Optional[int]] = [None]
|
||||
valid_dimensions: list[int | None] = [None]
|
||||
if model_info.matryoshka_dimensions is not None:
|
||||
valid_dimensions += model_info.matryoshka_dimensions[:2]
|
||||
|
||||
for dimensions in valid_dimensions:
|
||||
await make_request_and_correctness_test(dimensions)
|
||||
|
||||
invalid_dimensions: list[Optional[int]] = [-1]
|
||||
invalid_dimensions: list[int | None] = [-1]
|
||||
if model_info.matryoshka_dimensions is not None:
|
||||
assert 5 not in model_info.matryoshka_dimensions
|
||||
invalid_dimensions.append(5)
|
||||
|
@ -5,7 +5,6 @@ import multiprocessing
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
|
||||
assert len(manager.processes) == 3
|
||||
|
||||
# Create a result capture for the thread
|
||||
result: dict[str, Optional[Exception]] = {"exception": None}
|
||||
result: dict[str, Exception | None] = {"exception": None}
|
||||
|
||||
def run_with_exception_capture():
|
||||
try:
|
||||
@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
|
||||
assert len(manager.processes) == 3
|
||||
|
||||
# Create a result capture for the thread
|
||||
result: dict[str, Optional[Exception]] = {"exception": None}
|
||||
result: dict[str, Exception | None] = {"exception": None}
|
||||
|
||||
def run_with_exception_capture():
|
||||
try:
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Mapping
|
||||
from typing import Literal, Optional
|
||||
from typing import Literal
|
||||
|
||||
import pytest
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
||||
@ -152,9 +152,9 @@ def audio_url():
|
||||
|
||||
|
||||
def _assert_mm_data_is_image_input(
|
||||
mm_data: Optional[MultiModalDataDict],
|
||||
mm_data: MultiModalDataDict | None,
|
||||
image_count: int,
|
||||
skipped_image_indices: Optional[list] = None,
|
||||
skipped_image_indices: list | None = None,
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(mm_data.keys()) == {"image"}
|
||||
@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: Optional[MultiModalUUIDDict],
|
||||
mm_uuids: MultiModalUUIDDict | None,
|
||||
media_count: int,
|
||||
expected_uuids: list[Optional[str]],
|
||||
expected_uuids: list[str | None],
|
||||
modality: str = "image",
|
||||
) -> None:
|
||||
if len(expected_uuids) > 0:
|
||||
@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]
|
||||
|
||||
|
||||
def _assert_mm_data_inputs(
|
||||
mm_data: Optional[MultiModalDataDict],
|
||||
mm_data: MultiModalDataDict | None,
|
||||
data_count: MultiModalDataCounts,
|
||||
skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int]
|
||||
skipped_media_indices: dict[str, list] | None = None, # modality -> list[int]
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(data_count.keys()) == (set(mm_data.keys()))
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pybase64
|
||||
@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
max_model_len: int = 100
|
||||
encoder_config: Optional[dict] = None
|
||||
encoder_config: dict | None = None
|
||||
|
||||
|
||||
class MockTokenizerResult:
|
||||
|
@ -12,7 +12,6 @@ import json
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from typing import Optional, Union
|
||||
|
||||
import aiohttp
|
||||
import numpy as np
|
||||
@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
|
||||
INVALID = -9999999
|
||||
|
||||
|
||||
def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
|
||||
def download_and_cache_file(url: str, filename: str | None = None) -> str:
|
||||
"""Download and cache a file from a URL."""
|
||||
if filename is None:
|
||||
filename = os.path.join("/tmp", url.split("/")[-1])
|
||||
@ -81,9 +80,9 @@ async def call_vllm_api(
|
||||
prompt: str,
|
||||
temperature: float,
|
||||
max_tokens: int,
|
||||
stop: Optional[list[str]] = None,
|
||||
url: Optional[str] = None,
|
||||
seed: Optional[int] = None,
|
||||
stop: list[str] | None = None,
|
||||
url: str | None = None,
|
||||
seed: int | None = None,
|
||||
) -> str:
|
||||
"""Call vLLM's OpenAI-compatible completions endpoint."""
|
||||
data = {
|
||||
@ -112,8 +111,8 @@ def evaluate_gsm8k(
|
||||
host: str = "http://127.0.0.1",
|
||||
port: int = 8000,
|
||||
temperature: float = 0.0,
|
||||
seed: Optional[int] = 42,
|
||||
) -> dict[str, Union[float, int]]:
|
||||
seed: int | None = 42,
|
||||
) -> dict[str, float | int]:
|
||||
"""
|
||||
Evaluate GSM8K accuracy using vLLM serve endpoint.
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -27,8 +26,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -50,7 +49,7 @@ def ref_masked_attention(
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
attn_mask: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
|
||||
if attn_mask is not None:
|
||||
@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
|
||||
block_tables: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_slopes: Optional[torch.Tensor],
|
||||
alibi_slopes: torch.Tensor | None,
|
||||
) -> None:
|
||||
num_query_heads = query.shape[1]
|
||||
num_kv_heads = value_cache.shape[1]
|
||||
@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_bias: Optional[list[torch.Tensor]],
|
||||
alibi_bias: list[torch.Tensor] | None,
|
||||
dtype: torch.dtype,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(cu_seq_lens) - 1
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -85,7 +84,7 @@ def test_cascade(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
fa_version: int,
|
||||
) -> None:
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -17,7 +16,7 @@ def cal_diff(
|
||||
y: torch.Tensor,
|
||||
name: str,
|
||||
use_fp8: bool = False,
|
||||
diff_threshold: Optional[float] = None,
|
||||
diff_threshold: float | None = None,
|
||||
) -> None:
|
||||
x, y = x.double(), y.double()
|
||||
cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -34,8 +33,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import pytest
|
||||
@ -26,8 +25,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
soft_cap: float | None,
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
soft_cap: float | None,
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
) -> None:
|
||||
pytest.skip("TODO: fix the accuracy issue")
|
||||
torch.set_default_device("cuda")
|
||||
@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
) -> None:
|
||||
# test doesn't work for num_heads = (16,16)
|
||||
torch.set_default_device("cuda")
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import pytest
|
||||
@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_trtllm_decode_with_baseline(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_lens: tuple[int, int],
|
||||
num_heads: tuple[int, int],
|
||||
@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
kv_layout: str,
|
||||
block_size: int,
|
||||
window_left: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_trtllm_prefill_with_baseline(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_lens: tuple[int, int],
|
||||
num_heads: tuple[int, int],
|
||||
@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
|
||||
kv_layout: str,
|
||||
block_size: int,
|
||||
window_left: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -20,7 +19,7 @@ def merge_attn_states_torch(
|
||||
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
|
||||
output_lse: torch.Tensor | None = None, # [NUM_HEADS, NUM_TOKENS]
|
||||
):
|
||||
p_lse = prefix_lse
|
||||
s_lse = suffix_lse
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -32,8 +31,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -98,12 +97,12 @@ def test_triton_unified_attn(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -31,13 +30,13 @@ EPS = 1e-6
|
||||
## Helpers
|
||||
|
||||
|
||||
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
|
||||
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
|
||||
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
||||
|
||||
|
||||
def ref_rms_norm(
|
||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
if residual is not None:
|
||||
residual = residual.clone()
|
||||
out, residual = rms_norm_layer.forward_native(x, residual)
|
||||
@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
if scale_ub is not None:
|
||||
assert quant_dtype == torch.float8_e4m3fn
|
||||
|
||||
@ -76,9 +75,9 @@ def ref_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
return ref_dynamic_per_token_quant(
|
||||
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
||||
)
|
||||
@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
|
||||
weight: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
if residual is not None:
|
||||
residual = residual.clone()
|
||||
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
||||
@ -103,9 +102,9 @@ def ops_impl(
|
||||
weight: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
||||
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -68,7 +68,7 @@ def test_rotary_embedding(
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
rotary_dim: int | None,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
|
@ -4,8 +4,6 @@
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
|
||||
rot,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: Optional[torch.Tensor] = None,
|
||||
key: torch.Tensor | None = None,
|
||||
):
|
||||
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -19,11 +18,11 @@ from vllm.platforms import current_platform
|
||||
def causal_conv1d_ref(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
initial_states: Optional[torch.Tensor] = None,
|
||||
bias: torch.Tensor | None = None,
|
||||
initial_states: torch.Tensor | None = None,
|
||||
return_final_states: bool = False,
|
||||
final_states_out: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
final_states_out: torch.Tensor | None = None,
|
||||
activation: str | None = "silu",
|
||||
):
|
||||
"""
|
||||
x: (batch, dim, seqlen)
|
||||
@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
|
||||
def causal_conv1d_opcheck_fn(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
cu_seq_len: Optional[torch.Tensor] = None,
|
||||
cache_indices: Optional[torch.Tensor] = None,
|
||||
has_initial_state: Optional[torch.Tensor] = None,
|
||||
conv_states: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
bias: torch.Tensor | None = None,
|
||||
cu_seq_len: torch.Tensor | None = None,
|
||||
cache_indices: torch.Tensor | None = None,
|
||||
has_initial_state: torch.Tensor | None = None,
|
||||
conv_states: torch.Tensor | None = None,
|
||||
activation: str | None = "silu",
|
||||
pad_slot_id: int = PAD_SLOT_ID,
|
||||
):
|
||||
"""
|
||||
|
@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Any, Optional, Union
from typing import Any

import torch

@ -35,7 +35,7 @@ from .mk_objects import (
from .parallel_utils import ProcessGroupInfo


def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
if t is None:
return f"{name} : None"
else:

@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:

@dataclass
class Config:
Ms: Union[list[int], int]
Ms: list[int] | int
K: int
N: int
E: int
topks: Union[list[int], int]
topks: list[int] | int
dtype: torch.dtype
quant_config: Optional[TestMoEQuantConfig]
quant_config: TestMoEQuantConfig | None

prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute

fused_moe_chunk_size: Optional[int]
fused_moe_chunk_size: int | None
world_size: int

torch_trace_dir_path: Optional[str] = None
torch_trace_dir_path: str | None = None

def __post_init__(self):
if self.quant_config is None:

@ -93,7 +93,7 @@ class Config:
return self.Ms

@property
def quant_dtype(self) -> Union[torch.dtype, str, None]:
def quant_dtype(self) -> torch.dtype | str | None:
assert self.quant_config is not None
return self.quant_config.quant_dtype

@ -112,7 +112,7 @@ class Config:
return self.quant_config.per_out_ch_quant

@property
def quant_block_shape(self) -> Optional[list[int]]:
def quant_block_shape(self) -> list[int] | None:
assert self.quant_config is not None
return self.quant_config.block_shape

@ -209,7 +209,7 @@ class Config:
info = prepare_finalize_info(self.prepare_finalize_type)
return info.backend

def is_valid(self) -> tuple[bool, Optional[str]]:
def is_valid(self) -> tuple[bool, str | None]:
# Check prepare-finalize and fused-experts compatibility
if self.is_batched_prepare_finalize():
if not self.is_batched_fused_experts():

@ -280,10 +280,10 @@ class Config:
class WeightTensors:
w1: torch.Tensor
w2: torch.Tensor
w1_scale: Optional[torch.Tensor]
w2_scale: Optional[torch.Tensor]
w1_gs: Optional[torch.Tensor] = None
w2_gs: Optional[torch.Tensor] = None
w1_scale: torch.Tensor | None
w2_scale: torch.Tensor | None
w1_gs: torch.Tensor | None = None
w2_gs: torch.Tensor | None = None

def describe(self):
s = ""

@ -351,11 +351,11 @@ class WeightTensors:
@dataclass
class RankTensors:
hidden_states: torch.Tensor
hidden_states_scale: Optional[torch.Tensor]
hidden_states_scale: torch.Tensor | None

topk_weights: torch.Tensor
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor]
expert_map: torch.Tensor | None

def describe(self):
s = ""

@ -370,7 +370,7 @@ class RankTensors:
@staticmethod
def make_hidden_states(
config: Config,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""
Return hidden_states
"""
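The dataclass hunks above apply the same rewrite to field annotations. A minimal sketch with a hypothetical ToyConfig (not a class from this diff), showing that switching to `X | None` changes only the annotation; a field still needs an explicit `= None` default to be omittable at construction time, as with torch_trace_dir_path above.

from dataclasses import dataclass

import torch


@dataclass
class ToyConfig:
    Ms: list[int] | int                      # list of sizes or a single size
    dtype: torch.dtype
    quant_block_shape: list[int] | None      # may be None, but must still be passed
    torch_trace_dir_path: str | None = None  # optional with an explicit default


cfg = ToyConfig(Ms=[16, 32], dtype=torch.bfloat16, quant_block_shape=None)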
@ -4,7 +4,6 @@
import copy
from enum import Enum
from itertools import product
from typing import Optional

import torch
from tqdm import tqdm

@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str):
import pandas as pd

def add_to_results(
config: Config, success: Result, results_df: Optional[pd.DataFrame] = None
config: Config, success: Result, results_df: pd.DataFrame | None = None
):
config_dict = asdict(config)
config_dict["prepare_finalize_type"] = config_dict[

@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str):
product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
)

results_df: Optional[pd.DataFrame] = None
results_df: pd.DataFrame | None = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations
):
@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Optional, Union

import torch

@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe

@dataclass
class TestMoEQuantConfig:
quant_dtype: Union[torch.dtype, str, None]
quant_dtype: torch.dtype | str | None
per_out_ch_quant: bool
per_act_token_quant: bool
block_shape: Optional[list[int]]
block_shape: list[int] | None


@dataclass
class PrepareFinalizeInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
backend: Optional[str]
backend: str | None
supports_apply_weight_on_input: bool = True


@dataclass
class ExpertInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
supports_chunking: bool
supports_expert_map: bool

@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []

standard_format = mk.FusedMoEActivationFormat.Standard
batched_format = mk.FusedMoEActivationFormat.BatchedExperts
common_float_types: list[Union[torch.dtype, str]] = [
common_float_types: list[torch.dtype | str] = [
torch.float8_e4m3fn,
torch.bfloat16,
torch.float16,

@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn]
def register_prepare_and_finalize(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
backend: Optional[str],
backend: str | None,
force_multigpu: bool = False,
supports_apply_weight_on_input: bool = True,
):

@ -121,7 +120,7 @@ def register_prepare_and_finalize(
def register_experts(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
supports_chunking: bool,
supports_expert_map: bool,

@ -340,7 +339,7 @@ if cutlass_fp4_supported():
supports_expert_map=False,
)

MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
None,
# per-channel / per-column weights and per-tensor activations
TestMoEQuantConfig(

@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():

def make_prepare_finalize(
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
backend: Optional[str],
backend: str | None,
moe: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
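In the hunks above, the new union syntax also nests directly inside built-in generics, e.g. list[torch.dtype | str] and list[TestMoEQuantConfig | None]. A small sketch under that assumption; SUPPORTED_TYPES, describe, and the "fp4" entry are illustrative names, not part of this diff. On Python 3.10+ the same `|` unions are also accepted by isinstance().

import torch

# Union members inside a built-in generic container.
SUPPORTED_TYPES: list[torch.dtype | str] = [torch.float8_e4m3fn, torch.bfloat16, "fp4"]


def describe(entry: torch.dtype | str | None) -> str:
    # On 3.10+, torch.dtype | str evaluates to a types.UnionType,
    # which isinstance() accepts directly.
    if isinstance(entry, torch.dtype | str):
        return str(entry)
    return "unquantized"


print(describe(torch.bfloat16))  # torch.bfloat16
print(describe(None))            # unquantized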
@ -3,11 +3,12 @@
import dataclasses
import os
import traceback
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Concatenate

import torch
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import init_distributed_environment, initialize_model_parallel

@ -58,9 +59,9 @@ def _worker_parallel_launch(
world_local_size: int,
node_rank: int,
init_method: str,
worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None],
vllm_config: Optional[VllmConfig],
env_dict: Optional[dict],
worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None],
vllm_config: VllmConfig | None,
env_dict: dict | None,
*args: P.args,
**kwargs: P.kwargs,
) -> None:
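The hunk above also swaps typing.Callable for collections.abc.Callable and imports Concatenate from typing (available there since Python 3.10), while leaving ParamSpec in typing_extensions. A minimal sketch of the same Callable[Concatenate[...], None] pattern; run_worker and toy_worker are hypothetical, and ParamSpec is taken from typing here (3.10+) to keep the example self-contained.

from collections.abc import Callable
from typing import Concatenate, ParamSpec

P = ParamSpec("P")


def run_worker(
    worker: Callable[Concatenate[int, P], None],
    rank: int,
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    # The worker always receives the rank first, then whatever the caller passed through.
    worker(rank, *args, **kwargs)


def toy_worker(rank: int, message: str) -> None:
    print(f"[rank {rank}] {message}")


run_worker(toy_worker, 0, message="hello")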
@ -2,8 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import copy
from collections.abc import Callable
from itertools import product
from typing import Any, Callable
from typing import Any

import torch
@ -7,12 +7,13 @@ DeepEP test utilities
import dataclasses
import os
import traceback
from typing import Callable, Optional
from collections.abc import Callable
from typing import Concatenate

import torch
from torch.distributed import ProcessGroup
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec

from vllm.utils import get_open_port, has_deep_ep

@ -126,8 +127,8 @@ def make_deepep_ht_a2a(
pgi: ProcessGroupInfo,
dp_size: int,
ht_args: DeepEPHTArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep

@ -153,8 +154,8 @@ def make_deepep_ll_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
deepep_ll_args: DeepEPLLArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep

@ -185,10 +186,10 @@ def make_deepep_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
dp_size: int,
deepep_ht_args: Optional[DeepEPHTArgs],
deepep_ll_args: Optional[DeepEPLLArgs],
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
deepep_ht_args: DeepEPHTArgs | None,
deepep_ll_args: DeepEPLLArgs | None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
if deepep_ht_args is not None:
assert deepep_ll_args is None
@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from typing import Optional

import pytest
import torch

@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192
@dataclass
class BatchedMMConfig:
in_dtype: torch.dtype
quant_dtype: Optional[torch.dtype]
quant_dtype: torch.dtype | None
out_dtype: torch.dtype
num_experts: int
max_tokens_per_expert: int

@ -115,7 +114,7 @@ def test_batched_mm(
K: int,
N: int,
dtype: torch.dtype,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
per_act_token_quant: bool,
):
current_platform.seed_everything(7)

@ -242,7 +241,7 @@ def test_fused_moe_batched_experts(
topk: int,
dtype: torch.dtype,
per_act_token_quant: bool,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
input_scales: bool,
):
current_platform.seed_everything(7)
@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels
"""

import dataclasses
from typing import Optional

import pytest
import torch

@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
@dataclasses.dataclass
class TestTensors:
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor] = None
expert_map: torch.Tensor | None = None

def to_device(self, device: str):
self.topk_ids = self.topk_ids.to(device=device)
@ -3,7 +3,6 @@
import copy
import dataclasses
from math import prod
from typing import Optional

import pytest
import torch

@ -85,16 +84,16 @@ class MOETensors:
@dataclasses.dataclass
class MOETensors8Bit(MOETensors):
# quantized
a_q: Optional[torch.Tensor] = None # a -> a_q
w1_q: Optional[torch.Tensor] = None # w1 -> w1_q
w2_q: Optional[torch.Tensor] = None # w2 -> w2_q
a_scale: Optional[torch.Tensor] = None
w1_scale: Optional[torch.Tensor] = None
w2_scale: Optional[torch.Tensor] = None
a_q: torch.Tensor | None = None # a -> a_q
w1_q: torch.Tensor | None = None # w1 -> w1_q
w2_q: torch.Tensor | None = None # w2 -> w2_q
a_scale: torch.Tensor | None = None
w1_scale: torch.Tensor | None = None
w2_scale: torch.Tensor | None = None
# dequantized
a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d
w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d
w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d
a_d: torch.Tensor | None = None # a -> a_q -> a_d
w1_d: torch.Tensor | None = None # w1 -> w1_q -> w1_d
w2_d: torch.Tensor | None = None # w2 -> w2_q -> w2_d

@staticmethod
def make_moe_tensors_8bit(

@ -209,7 +208,7 @@ def run_8_bit(
topk_ids: torch.Tensor,
per_act_token: bool,
per_out_ch: bool,
num_local_experts: Optional[int] = None,
num_local_experts: int | None = None,
) -> torch.Tensor:
assert not any(
[

@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool,
per_out_ch: bool,
monkeypatch,
ep_size: Optional[int] = None,
ep_size: int | None = None,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
@ -7,7 +7,6 @@ fp8 block-quantized case.
"""

import dataclasses
from typing import Optional

import pytest
import torch.distributed

@ -92,13 +91,13 @@ class TestConfig:
block_size: list[int]
# configs for testing low-latency kernels
low_latency: bool
use_fp8_dispatch: Optional[bool] = False
use_fp8_dispatch: bool | None = False


@dataclasses.dataclass
class TestTensors:
rank_tokens: torch.Tensor # all ranks make this many tokens
rank_token_scales: Optional[torch.Tensor]
rank_token_scales: torch.Tensor | None
topk: torch.Tensor
topk_weights: torch.Tensor
config: TestConfig

@ -143,7 +142,7 @@ def make_ll_modular_kernel(
max_tokens_per_rank: int,
dp_size: int,
hidden_size: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:

@ -179,7 +178,7 @@ def make_ht_modular_kernel(
pgi: ProcessGroupInfo,
dp_size: int,
num_local_experts: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:

@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl(
test_tensors: TestTensors,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: Optional[torch.Tensor],
w2_scale: Optional[torch.Tensor],
w1_scale: torch.Tensor | None,
w2_scale: torch.Tensor | None,
) -> torch.Tensor:
test_config = test_tensors.config
num_experts = test_config.num_experts
Some files were not shown because too many files have changed in this diff.