Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00

Update Optional[x] -> x | None and Union[x, y] to x | y (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
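The commit migrates type annotations to the PEP 604 union syntax available on Python 3.10+. As a minimal sketch (a hypothetical function, not one taken from the diff), the old spelling

    from typing import Optional, Union

    def lookup(key: str, default: Optional[int] = None) -> Union[int, str]:
        ...

becomes

    def lookup(key: str, default: int | None = None) -> int | str:
        ...

and the typing import is no longer needed for the union itself. The hunks below apply the same mechanical rewrite throughout the touched files.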
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union

 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
 prompt_len: int
 output_len: int
 model: str
-model_name: Optional[str] = None
-logprobs: Optional[int] = None
-extra_body: Optional[dict] = None
-multi_modal_content: Optional[dict | list[dict]] = None
+model_name: str | None = None
+logprobs: int | None = None
+extra_body: dict | None = None
+multi_modal_content: dict | list[dict] | None = None
 ignore_eos: bool = False
-language: Optional[str] = None
-request_id: Optional[str] = None
+language: str | None = None
+request_id: str | None = None


 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:

 async def async_request_tgi(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(

 async def async_request_trt_llm(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(

 async def async_request_deepspeed_mii(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

 async def async_request_openai_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(

 async def async_request_openai_chat_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

 async def async_request_openai_audio(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 # Lazy import without PlaceholderModule to avoid vllm dep.
 import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
 tokenizer_mode: str = "auto",
 trust_remote_code: bool = False,
 **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
 if pretrained_model_name_or_path is not None and not os.path.exists(
 pretrained_model_name_or_path
 ):

@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import PreTrainedTokenizerBase

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[Request]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 prefix_len: int,
 ) -> list[Request]:
 requests = []

@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -24,7 +23,7 @@ def sample_requests(
 dataset_path: str,
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")

@@ -32,7 +32,6 @@ import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional

 import datasets
 import numpy as np
@@ -316,7 +315,7 @@ def calculate_metrics(
 tokenizer: PreTrainedTokenizerBase,
 selected_percentile_metrics: list[str],
 selected_percentiles: list[float],
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
 actual_output_lens: list[int] = []
 total_input = 0
@@ -436,9 +435,9 @@ async def benchmark(
 selected_percentile_metrics: list[str],
 selected_percentiles: list[str],
 ignore_eos: bool,
-max_concurrency: Optional[int],
+max_concurrency: int | None,
 structured_output_ratio: float,
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]

@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any


 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
 def __init__(self, scale: int) -> None:
 self.cnt: int = 0
 self._sum: int = 0
-self._max: Optional[int] = None
+self._max: int | None = None
 self.scale = scale
 self.start_time: int = time.monotonic_ns()

@@ -104,13 +104,13 @@ class TimeCollector:
 else:
 self._max = max(self._max, v)

-def avg(self) -> Union[float, str]:
+def avg(self) -> float | str:
 return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

-def max(self) -> Union[float, str]:
+def max(self) -> float | str:
 return self._max / self.scale if self._max else "N/A"

-def dump_avg_max(self) -> list[Union[float, str]]:
+def dump_avg_max(self) -> list[float | str]:
 return [self.avg(), self.max()]

 def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:

 def __exit__(
 self,
-exc_type: Optional[type[BaseException]],
-exc_value: Optional[BaseException],
-exc_traceback: Optional[TracebackType],
+exc_type: type[BaseException] | None,
+exc_value: BaseException | None,
+exc_traceback: TracebackType | None,
 ) -> None:
 self.collect(time.monotonic_ns() - self.start_time)

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark

@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -53,7 +52,7 @@ def bench_int8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark INT8-based kernels."""
 assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark FP8-based kernels."""
 assert dtype == torch.float8_e4m3fn
@@ -183,7 +182,7 @@ def bench(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 if dtype == torch.int8:
 return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
 dtype: torch.dtype,
 MKNs: Iterable[tuple[int, int, int]],
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 results = []
 for m, k, n in MKNs:

@@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
 rms_norm_layer: RMSNorm, # this stores the weights
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 out, _ = ops.rms_norm_dynamic_per_token_quant(

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch

 import pandas as pd

@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
 def benchmark_allreduce_single(
 self,
 sequence_length: int,
-allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
 should_use_fn: Callable[[torch.Tensor], bool],
 context,
 num_warmup: int,
 num_trials: int,
-) -> Optional[float]:
+) -> float | None:
 """Benchmark method with CUDA graph optimization."""
 try:
 # Create test tensor (2D: sequence_length x hidden_size)

@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -158,7 +159,7 @@ def ref_group_gemm(
 seq_lens_cpu: torch.Tensor,
 prompt_lora_mapping_cpu: torch.Tensor,
 scaling: float,
-add_inputs: Optional[bool],
+add_inputs: bool | None,
 ):
 """
 Torch group gemm reference implementation to test correctness of
@@ -316,8 +317,8 @@ class BenchmarkContext:
 lora_rank: int
 sort_by_lora_id: bool
 dtype: torch.dtype
-seq_length: Optional[int] = None
-num_slices: Optional[int] = None # num_slices for slice based ops
+seq_length: int | None = None
+num_slices: int | None = None # num_slices for slice based ops

 def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
 ctx = copy.copy(self)
@@ -561,7 +562,7 @@ class BenchmarkTensors:
 }

 def bench_fn_kwargs(
-self, op_type: OpType, add_inputs: Optional[bool] = None
+self, op_type: OpType, add_inputs: bool | None = None
 ) -> dict[str, Any]:
 if op_type.is_shrink_fn():
 assert add_inputs is None
@@ -575,7 +576,7 @@ class BenchmarkTensors:
 raise ValueError(f"Unrecognized optype {self}")

 def test_correctness(
-self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+self, op_type: OpType, expand_fn_add_inputs: bool | None
 ) -> bool:
 """
 Test correctness of op_type implementation against a grouped gemm
@@ -611,8 +612,8 @@ def bench_optype(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
-expand_fn_add_inputs: Optional[bool] = None,
+cuda_graph_nops: int | None = None,
+expand_fn_add_inputs: bool | None = None,
 test_correctness: bool = False,
 ) -> TMeasurement:
 assert arg_pool_size >= 1
@@ -679,7 +680,7 @@ def bench_torch_mm(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
 """
 Benchmark basic torch.mm as a roofline.
@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
 """


-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
 compare = TBenchmark.Compare(timers)
 compare.print()

@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import pandas as pd
 import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
 a: torch.Tensor

 w_q: torch.Tensor
-group_size: Optional[int]
+group_size: int | None
 wtype: ScalarType
 w_g_s: torch.Tensor
-w_g_zp: Optional[torch.Tensor]
-w_ch_s: Optional[torch.Tensor]
-w_tok_s: Optional[torch.Tensor]
+w_g_zp: torch.Tensor | None
+w_ch_s: torch.Tensor | None
+w_tok_s: torch.Tensor | None


 @dataclass
 class TypeConfig:
 act_type: torch.dtype
 weight_type: ScalarType
-output_type: Optional[torch.dtype]
-group_scale_type: Optional[torch.dtype]
-group_zero_type: Optional[torch.dtype]
-channel_scale_type: Optional[torch.dtype]
-token_scale_type: Optional[torch.dtype]
+output_type: torch.dtype | None
+group_scale_type: torch.dtype | None
+group_zero_type: torch.dtype | None
+channel_scale_type: torch.dtype | None
+token_scale_type: torch.dtype | None


 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
 atype: torch.dtype,
 w: torch.Tensor,
 wtype: ScalarType,
-stype: Optional[torch.dtype],
-group_size: Optional[int],
+stype: torch.dtype | None,
+group_size: int | None,
 zero_points: bool = False,
 ):
 assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(


 def create_bench_tensors(
-shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
 m, n, k = shape

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
 return res


-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


 def bench(

@@ -3,7 +3,6 @@

 import random
 import time
-from typing import Optional

 import torch

@@ -37,7 +36,7 @@ def main(
 seed: int,
 do_profile: bool,
 device: str = "cuda",
-kv_cache_dtype: Optional[str] = None,
+kv_cache_dtype: str | None = None,
 ) -> None:
 current_platform.seed_everything(seed)

@@ -3,8 +3,8 @@

 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch

 import torch

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
 def forward(
 self,
 x: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+residual: torch.Tensor | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
 orig_dtype = x.dtype
 x = x.to(torch.float32)
 if residual is not None:
@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
 seq_len: int,
 num_heads: int,
 head_size: int,
-rotary_dim: Optional[int],
+rotary_dim: int | None,
 dtype: torch.dtype,
 seed: int,
 device: str,

@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),

@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),

@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:

 def __init__(
 self,
-cuda_graph_params: Optional[CudaGraphBenchParams],
+cuda_graph_params: CudaGraphBenchParams | None,
 label: str,
 sub_label: str,
 description: str,

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple

 import numpy as np # type: ignore
 import pandas as pd # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
 def __init__(
 self,
-min_val: Union[int, float],
-max_val: Union[int, float],
+min_val: int | float,
+max_val: int | float,
 is_integer: bool = True,
 ) -> None:
 self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):


 class ConstantDistribution(Distribution):
-def __init__(self, value: Union[int, float]) -> None:
+def __init__(self, value: int | float) -> None:
 self.value = value
 self.max_val = value

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):


 class ZipfDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):


 class PoissonDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val

@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
 def __init__(
 self,
-mean: Optional[float] = None,
-sigma: Optional[float] = None,
-average: Optional[int] = None,
-median_ratio: Optional[float] = None,
-max_val: Optional[int] = None,
+mean: float | None = None,
+sigma: float | None = None,
+average: int | None = None,
+median_ratio: float | None = None,
+max_val: int | None = None,
 ) -> None:
 self.average = average
 self.median_ratio = median_ratio

@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Union
+from typing import NamedTuple

 import aiohttp # type: ignore
 import numpy as np # type: ignore
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
 def __init__(self, logger: logging.Logger, window_size: int) -> None:
 self.logger = logger
-self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+self.metrics: dict[str, MovingAverage | MetricStats] = {
 "moving_avg_ttft_ms": MovingAverage(window_size),
 "moving_avg_tpot_ms": MovingAverage(window_size),
 "ttft_ms": MetricStats(),
@@ -636,7 +636,7 @@ async def client_main(

 if args.verbose:
 curr_time_sec: float = time.perf_counter()
-time_since_last_turn: Union[str, float] = "N/A"
+time_since_last_turn: str | float = "N/A"
 if conv_id in time_of_last_turn:
 time_since_last_turn = round(
 curr_time_sec - time_of_last_turn[conv_id], 3
@@ -928,13 +928,13 @@ async def main_mp(
 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
 )

-rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+rps: str | float = round(len(client_metrics) / runtime_sec, 3)
 if len(client_metrics) < (5 * bench_args.num_clients):
 # Do not estimate the RPS if the number of samples is very low
 # (threshold can be tuned if needed)
 rps = "N/A"

-runtime_left_sec: Union[str, float] = round(
+runtime_left_sec: str | float = round(
 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
 )
 if percent < 0.05:

@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any

 import pandas as pd # type: ignore
 import tqdm # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:


 def content_is_valid(
-content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
 if min_content_len and len(content) < min_content_len:
 return False
@@ -37,7 +37,7 @@ def content_is_valid(


 def print_stats(
-conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
 # Collect statistics
 stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
 seed: int,
 input_file: str,
 output_file: str,
-max_items: Optional[int],
-min_content_len: Optional[int] = None,
-max_content_len: Optional[int] = None,
-min_turns: Optional[int] = None,
-max_turns: Optional[int] = None,
-model: Optional[str] = None,
+max_items: int | None,
+min_content_len: int | None = None,
+max_content_len: int | None = None,
+min_turns: int | None = None,
+max_turns: int | None = None,
+model: str | None = None,
 ) -> None:
 if min_turns and max_turns:
 assert min_turns <= max_turns

@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import enum
-from typing import Union

 from cutlass_library import *

@@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
 TmaWarpSpecializedCooperative = enum_auto()


-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
 **DataTypeNames, # type: ignore
 **{
 VLLMDataType.u4b8: "u4b8",
@@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
 },
 }

-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
 **DataTypeTag, # type: ignore
 **{
 VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 },
 }

-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
 **DataTypeSize, # type: ignore
 **{
 VLLMDataType.u4b8: 4,
@@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
 },
 }

-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
 VLLMDataType.u4b8: "vllm::kU4B8",
 VLLMDataType.u8b128: "vllm::kU8B128",
 DataType.u4: "vllm::kU4",
@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.bf16: "vllm::kBfloat16",
 }

-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
 DataType.u8: "at::ScalarType::Byte",
 DataType.s8: "at::ScalarType::Char",
 DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.f32: "at::ScalarType::Float",
 }

-VLLMKernelScheduleTag: dict[
-Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
 **KernelScheduleTag, # type: ignore
 **{
 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union

 import jinja2
 from vllm_cutlass_library_extension import (
@@ -259,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
 a: DataType
-b: Union[DataType, VLLMDataType]
+b: DataType | VLLMDataType
 b_group_scale: DataType
 b_group_zeropoint: DataType
 b_channel_scale: DataType
@@ -280,7 +279,7 @@ class PrepackTypeConfig:
 class ImplConfig:
 types: TypeConfig
 schedules: list[ScheduleConfig]
-heuristic: list[tuple[Optional[str], ScheduleConfig]]
+heuristic: list[tuple[str | None, ScheduleConfig]]


 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

@@ -16,7 +16,7 @@ Declare supported languages and capabilities:

 ??? code "supported_languages and supports_transcription_only"
 ```python
-from typing import ClassVar, Mapping, Optional, Literal
+from typing import ClassVar, Mapping, Literal
 import numpy as np
 import torch
 from torch import nn
@@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 # Example with a free-form instruction prompt
 task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 if language is None:
 raise ValueError("Language must be specified")
@@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
 ??? code "validate_language()"
 ```python
 @classmethod
-def validate_language(cls, language: Optional[str]) -> Optional[str]:
+def validate_language(cls, language: str | None) -> str | None:
 if language is None:
 logger.warning(
 "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
 audio_duration_s: float,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-) -> Optional[int]:
+) -> int | None:
 # Return None if unknown; otherwise return an estimate.
 return int(audio_duration_s * stt_config.sample_rate // 320) # example
 ```

@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 import torch

@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 @abstractmethod
 def update_state(
 self,
-batch_update: Optional["BatchUpdate"],
+batch_update: "BatchUpdate" | None,
 ) -> None:
 """Called when there are new output tokens, prior
 to each forward pass.
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
 * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
 * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling

-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
 * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
 * Use the `BatchUpdate` members to update logits processor internal state
 * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.

@@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"

 ``` python
-from typing import Optional
 import torch
 from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
@@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
 """Never impacts greedy sampling"""
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 if not batch_update:
 return

@@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,11 +30,11 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
-prompt_token_ids: Optional[dict[str, list[int]]] = None
-multi_modal_data: Optional[dict[str, Any]] = None
-stop_token_ids: Optional[list[int]] = None
-lora_requests: Optional[list[LoRARequest]] = None
+prompt: str | None = None
+prompt_token_ids: dict[str, list[int]] | None = None
+multi_modal_data: dict[str, Any] | None = None
+stop_token_ids: list[int] | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):

 def get_finished(
 self, finished_req_ids: set[str]
-) -> tuple[Optional[set[str]], Optional[set[str]]]:
+) -> tuple[set[str] | None, set[str] | None]:
 if self._async_load:
 meta = self._get_connector_metadata()
 assert isinstance(meta, RogueSharedStorageConnectorMetadata)

@@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
 def is_argmax_invariant(self) -> bool:
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 process_dict_updates(
 self.req_info,
 batch_update,

@@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Any, Optional
+from typing import Any

 import torch

@@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 Returns:
 `Callable` request logits processor, or None
 """
-target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+target_token: Any | None = params.extra_args and params.extra_args.get(
 "target_token"
 )
 if target_token is None:

@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
|
||||
device, the first and third requests would not repeat the same token.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
|
||||
def new_req_logits_processor(
|
||||
self,
|
||||
params: SamplingParams,
|
||||
) -> Optional[RequestLogitsProcessor]:
|
||||
) -> RequestLogitsProcessor | None:
|
||||
"""This method returns a new request-level logits processor, customized
|
||||
to the `target_token` value associated with a particular request.
|
||||
|
||||
|
@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 return [
 # this is an example of using quantization without LoRA
 (
@@ -56,7 +55,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -78,7 +77,7 @@ def process_requests(


 def initialize_engine(
-model: str, quantization: str, lora_repo: Optional[str]
+model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
 """Initialize the LLMEngine."""

@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import Optional
-
 from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 """Create a list of test prompts with their sampling parameters.

 2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0

@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union

 import albumentations
 import numpy as np
@@ -160,7 +159,7 @@ def load_example(
 file_paths: list[str],
 mean: list[float] = None,
 std: list[float] = None,
-indices: Union[list[int], None] = None,
+indices: list[int] | None = None,
 ):
 """Build an input example by loading images in *file_paths*.

@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict

 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:


 def rebuild_ipc(
-handle: tuple[Callable, tuple], device_id: Optional[int] = None
+handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
 func, args = handle
 list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
 self._zmq_ctx = zmq.Context()
 socket = self._zmq_ctx.socket(zmq.REP)
 socket.connect(zmq_handles[self.report_device_id()])
-buffer: Optional[torch.Tensor] = None
+buffer: torch.Tensor | None = None
 while True:
 payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
 socket.recv_pyobj()

@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompts: list[str]
-stop_token_ids: Optional[list[int]] = None
-lora_requests: Optional[list[LoRARequest]] = None
+stop_token_ids: list[int] | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on

@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompt: str
 image_data: list[Image]
-stop_token_ids: Optional[list[int]] = None
-chat_template: Optional[str] = None
-lora_requests: Optional[list[LoRARequest]] = None
+stop_token_ids: list[int] | None = None
+chat_template: str | None = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -1251,7 +1251,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
 print("-" * 50)


-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 # Disable other modalities to save memory

@@ -11,7 +11,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
 from pathlib import Path
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args

 from PIL.Image import Image

@@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):


 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery


 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
-image: Optional[Image] = None
-query: Optional[str] = None
-documents: Optional[ScoreMultiModalParam] = None
+prompt: str | None = None
+image: Image | None = None
+query: str | None = None
+documents: ScoreMultiModalParam | None = None


 def run_clip(query: Query) -> ModelRequestData:
@@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
 raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
 print("-" * 50)


-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable

 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
 decode_instances: list[str],
 model: str,
 scheduling_policy: SchedulingPolicy,
-custom_create_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
-custom_create_chat_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
+custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+custom_create_chat_completion: Callable[[Request], StreamingResponse]
+| None = None,
 ):
 self.prefill_instances = prefill_instances
 self.decode_instances = decode_instances
@@ -348,9 +345,9 @@ class ProxyServer:
 def __init__(
 self,
 args: argparse.Namespace,
-scheduling_policy: Optional[SchedulingPolicy] = None,
-create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
-create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+scheduling_policy: SchedulingPolicy | None = None,
+create_completion: Callable[[Request], StreamingResponse] | None = None,
+create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
 ):
 self.validate_parsed_serve_args(args)
 self.port = args.port

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any

 import msgspec
 import zmq
@@ -25,16 +25,16 @@ class KVCacheEvent(

 class BlockStored(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-parent_block_hash: Optional[ExternalBlockHash]
+parent_block_hash: ExternalBlockHash | None
 token_ids: list[int]
 block_size: int
-lora_id: Optional[int]
-medium: Optional[str]
+lora_id: int | None
+medium: str | None


 class BlockRemoved(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-medium: Optional[str]
+medium: str | None


 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):


 class KVEventBatch(EventBatch):
-events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+events: list[BlockStored | BlockRemoved | AllBlocksCleared]


 def process_event(event_batch):

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -43,7 +42,7 @@ async def main():
 )

 prompt = "Who won the 2004 World Series?"
-final_output: Optional[RequestOutput] = None
+final_output: RequestOutput | None = None
 async for output in engine_client.generate(
 prompt=prompt,
 sampling_params=sampling_params,

@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """

-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2

@@ -25,7 +23,7 @@ documents = [


 def cohere_rerank(
-client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
 return client.rerank(model=model, query=query, documents=documents)

@@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
 import argparse
 import base64
 import io
-from typing import Literal, Union
+from typing import Literal

 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
@@ -29,7 +29,7 @@ def create_chat_embeddings(
 *,
 messages: list[ChatCompletionMessageParam],
 model: str,
-encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
 ) -> CreateEmbeddingResponse:
 """
 Convenience function for accessing vLLM's Chat Embeddings API,

@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal

 import openai
 import pydantic

-if TYPE_CHECKING:
-from openai.types.chat import ChatCompletionChunk
-
+from openai.types.chat import ChatCompletionChunk

 ConstraintsFormat = Literal[
 "choice",

@@ -84,12 +84,6 @@ ignore = [
 "B007",
 # f-string format
 "UP032",
-# Can remove once 3.10+ is the minimum Python version
-"UP007",
-"UP027",
-"UP035",
-"UP038",
-"UP045",
 ]

 [tool.ruff.format]
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast

 import numpy as np
 import pytest
@@ -185,8 +185,8 @@ def _collect_mm_samples(
 output_len: int = 5,
 base_items_per_request: int = 2,
 num_mm_items_range_ratio: float = 0.0,
-limit_mm_per_prompt: Optional[dict[str, int]] = None,
-bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+limit_mm_per_prompt: dict[str, int] | None = None,
+bucket_config: dict[tuple[int, int, int], float] | None = None,
 enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
 if limit_mm_per_prompt is None:

@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
"""

import os
from typing import TYPE_CHECKING, Any, Callable, Optional
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
VLLM_CI_NO_SKIP: bool = False
VLLM_CI_DTYPE: Optional[str] = None
VLLM_CI_HEAD_DTYPE: Optional[str] = None
VLLM_CI_HF_DTYPE: Optional[str] = None
VLLM_CI_DTYPE: str | None = None
VLLM_CI_HEAD_DTYPE: str | None = None
VLLM_CI_HF_DTYPE: str | None = None

environment_variables: dict[str, Callable[[], Any]] = {
# A model family has many models with the same architecture.
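Editor's note (not part of the patch): the file above appears to follow the lazy environment-variable pattern, where names are declared for type checkers under `if TYPE_CHECKING:` and computed on first access at runtime via a module-level `__getattr__` (PEP 562). A minimal, self-contained sketch of that pattern with hypothetical `DEMO_*` names:

import os
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Type checkers see ordinary typed module attributes.
    DEMO_CI_DTYPE: str | None = None
    DEMO_CI_NO_SKIP: bool = False

# At runtime each attribute is computed lazily from the environment.
environment_variables: dict[str, Callable[[], Any]] = {
    "DEMO_CI_DTYPE": lambda: os.getenv("DEMO_CI_DTYPE"),
    "DEMO_CI_NO_SKIP": lambda: os.getenv("DEMO_CI_NO_SKIP", "0") == "1",
}

def __getattr__(name: str) -> Any:
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")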
@ -2,9 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
from collections.abc import Sequence
|
||||
from collections.abc import Callable, Sequence
|
||||
from copy import deepcopy
|
||||
from typing import Callable, Union
|
||||
|
||||
from torch import fx
|
||||
from torch._ops import OpOverload
|
||||
@ -44,7 +43,7 @@ class TestBackend:
|
||||
Inductor config is default-initialized from VllmConfig.CompilationConfig.
|
||||
"""
|
||||
|
||||
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
|
||||
def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
|
||||
self.custom_passes = list(passes)
|
||||
compile_config = get_current_vllm_config().compilation_config
|
||||
self.inductor_config = compile_config.inductor_compile_config
|
||||
|
@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
For tractable computation:
|
||||
@ -217,7 +217,7 @@ class LlamaModel(nn.Module):
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor],
|
||||
input_ids: torch.Tensor | None,
|
||||
positions: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.embedding_tokens(input_ids)
|
||||
|
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
|
||||
import pytest
|
||||
|
@ -1,11 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Any, Union
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
|
||||
|
||||
def run_model(
|
||||
compile_config: Union[int, CompilationConfig],
|
||||
compile_config: int | CompilationConfig,
|
||||
model: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch._dynamo
|
||||
@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
|
||||
FP4_DTYPE = torch.uint8
|
||||
|
||||
# globals needed for string-import custom Dynamo backend field
|
||||
backend: Optional[TestBackend] = None
|
||||
backend_unfused: Optional[TestBackend] = None
|
||||
backend: TestBackend | None = None
|
||||
backend_unfused: TestBackend | None = None
|
||||
|
||||
|
||||
class AttentionQuantPatternModel(torch.nn.Module):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@ -10,7 +9,7 @@ from vllm.config import CompilationLevel
|
||||
|
||||
|
||||
class MyMod(torch.nn.Module):
|
||||
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
if cache is not None:
|
||||
return x + cache
|
||||
return x * 2
|
||||
@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# this is the function to be compiled
|
||||
return self.model(x, cache)
|
||||
|
||||
def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
|
||||
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# let torch.compile compile twice
|
||||
if len(self.compiled_codes) == 2:
|
||||
dispatch_id = 0 if cache is None else 1
|
||||
|
@ -21,7 +21,7 @@ import threading
|
||||
from collections.abc import Generator
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
from typing import Any, Callable, TypedDict, TypeVar, cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
|
||||
|
||||
_M = TypeVar("_M")
|
||||
|
||||
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
|
||||
_PromptMultiModalInput = list[_M] | list[list[_M]]
|
||||
|
||||
PromptImageInput = _PromptMultiModalInput[Image.Image]
|
||||
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
|
||||
@ -267,7 +267,7 @@ class HfRunner:
|
||||
|
||||
return "cpu" if current_platform.is_cpu() else current_platform.device_type
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
def wrap_device(self, x: _T, device: str | None = None) -> _T:
|
||||
if x is None or isinstance(x, (bool,)):
|
||||
return x
|
||||
|
||||
@ -287,14 +287,14 @@ class HfRunner:
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
model_kwargs: dict[str, Any] | None = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
default_torch_num_threads: int | None = None,
|
||||
) -> None:
|
||||
init_ctx = (
|
||||
nullcontext()
|
||||
@ -319,7 +319,7 @@ class HfRunner:
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
model_kwargs: dict[str, Any] | None = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
@ -406,11 +406,11 @@ class HfRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
|
||||
prompts: list[str] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
|
||||
if images is not None:
|
||||
assert len(prompts) == len(images)
|
||||
|
||||
@ -420,9 +420,7 @@ class HfRunner:
|
||||
if audios is not None:
|
||||
assert len(prompts) == len(audios)
|
||||
|
||||
all_inputs: list[
|
||||
Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
|
||||
] = []
|
||||
all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
|
||||
for i, prompt in enumerate(prompts):
|
||||
if isinstance(prompt, str):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
@ -494,10 +492,10 @@ class HfRunner:
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
prompts: list[str] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -522,11 +520,11 @@ class HfRunner:
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[list[int]]],
|
||||
prompts: list[str] | list[list[int]],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[int], str]]:
|
||||
outputs = self.generate(
|
||||
@ -546,9 +544,9 @@ class HfRunner:
|
||||
prompts: list[str],
|
||||
beam_width: int,
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
outputs = self.generate(
|
||||
prompts,
|
||||
@ -574,9 +572,9 @@ class HfRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[list[torch.Tensor]]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -624,7 +622,7 @@ class HfRunner:
|
||||
def _hidden_states_to_logprobs(
|
||||
self,
|
||||
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
|
||||
num_logprobs: Optional[int],
|
||||
num_logprobs: int | None,
|
||||
) -> tuple[list[dict[int, float]], int]:
|
||||
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
|
||||
output_len = len(hidden_states)
|
||||
@ -652,10 +650,10 @@ class HfRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
num_logprobs: int | None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
all_inputs = self.get_inputs(
|
||||
@ -734,20 +732,20 @@ class VllmRunner:
|
||||
model_name: str,
|
||||
runner: RunnerOption = "auto",
|
||||
convert: ConvertOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
tokenizer_name: str | None = None,
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = True,
|
||||
seed: Optional[int] = 0,
|
||||
max_model_len: Optional[int] = 1024,
|
||||
seed: int | None = 0,
|
||||
max_model_len: int | None = 1024,
|
||||
dtype: str = "auto",
|
||||
disable_log_stats: bool = True,
|
||||
tensor_parallel_size: int = 1,
|
||||
block_size: int = 16 if not torch.xpu.is_available() else 64,
|
||||
enable_chunked_prefill: Optional[bool] = False,
|
||||
enable_chunked_prefill: bool | None = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: Optional[bool] = False,
|
||||
enforce_eager: bool | None = False,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
default_torch_num_threads: int | None = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_ctx = (
|
||||
@ -785,10 +783,10 @@ class VllmRunner:
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
if any(
|
||||
x is not None and len(x) != len(prompts) for x in [images, videos, audios]
|
||||
@ -824,11 +822,11 @@ class VllmRunner:
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
sampling_params: SamplingParams,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
@ -871,11 +869,11 @@ class VllmRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
sampling_params: SamplingParams,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
|
||||
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
|
||||
req_outputs = self.llm.generate(
|
||||
@ -894,11 +892,11 @@ class VllmRunner:
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
|
||||
prompts: list[str] | list[torch.Tensor] | list[list[int]],
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[tuple[list[int], str]]:
|
||||
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
||||
@ -916,15 +914,15 @@ class VllmRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
stop: Optional[list[str]] = None,
|
||||
num_logprobs: int | None,
|
||||
num_prompt_logprobs: int | None = None,
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
stop: list[str] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
|
||||
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
|
||||
greedy_logprobs_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=max_tokens,
|
||||
@ -957,7 +955,7 @@ class VllmRunner:
|
||||
perplexities = []
|
||||
for output in outputs:
|
||||
output = cast(TokensTextLogprobsPromptLogprobs, output)
|
||||
token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
|
||||
token_datas = cast(list[dict[int, Logprob] | None], output[3])
|
||||
assert token_datas[0] is None
|
||||
token_log_probs = []
|
||||
for token_data in token_datas[1:]:
|
||||
@ -976,10 +974,10 @@ class VllmRunner:
|
||||
prompts: list[str],
|
||||
beam_width: int,
|
||||
max_tokens: int,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
concurrency_limit: Optional[int] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
concurrency_limit: int | None = None,
|
||||
) -> list[tuple[list[list[int]], list[str]]]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
|
||||
@ -1002,9 +1000,9 @@ class VllmRunner:
|
||||
def embed(
|
||||
self,
|
||||
prompts: list[str],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
images: PromptImageInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> list[list[float]]:
|
||||
@ -1023,8 +1021,8 @@ class VllmRunner:
|
||||
|
||||
def score(
|
||||
self,
|
||||
text_1: Union[str, list[str]],
|
||||
text_2: Union[str, list[str]],
|
||||
text_1: list[str] | str,
|
||||
text_2: list[str] | str,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> list[float]:
|
||||
@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
|
||||
class LocalAssetServer:
|
||||
address: str
|
||||
port: int
|
||||
server: Optional[http.server.ThreadingHTTPServer]
|
||||
thread: Optional[threading.Thread]
|
||||
server: http.server.ThreadingHTTPServer | None
|
||||
thread: threading.Thread | None
|
||||
|
||||
def __init__(self, address: str = "127.0.0.1") -> None:
|
||||
self.address = address
|
||||
|
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
@ -15,8 +15,8 @@ def _test_stopping(
|
||||
llm: LLM,
|
||||
expected_output: str,
|
||||
expected_reason: Any,
|
||||
stop: Optional[list[str]] = None,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
stop: list[str] | None = None,
|
||||
stop_token_ids: list[int] | None = None,
|
||||
include_in_output: bool = False,
|
||||
) -> None:
|
||||
output = llm.generate(
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
from typing import Optional, Union
|
||||
|
||||
import msgspec
|
||||
import msgspec.msgpack
|
||||
@ -78,8 +77,8 @@ class MockSubscriber:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pub_endpoints: Union[str, list[str]],
|
||||
replay_endpoints: Optional[Union[str, list[str]]] = None,
|
||||
pub_endpoints: str | list[str],
|
||||
replay_endpoints: str | list[str] | None = None,
|
||||
topic: str = "",
|
||||
decode_type=SampleBatch,
|
||||
):
|
||||
@ -111,7 +110,7 @@ class MockSubscriber:
|
||||
self.last_seq = -1
|
||||
self.decoder = msgspec.msgpack.Decoder(type=decode_type)
|
||||
|
||||
def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
|
||||
def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
|
||||
"""Receive a single message with timeout"""
|
||||
if not self.sub.poll(timeout):
|
||||
return None
|
||||
|
@ -5,9 +5,8 @@
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class CPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -54,7 +54,7 @@ class CPTestSettings:
|
||||
dcp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False]:
|
||||
|
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class EPTestOptions(NamedTuple):
|
||||
trust_remote_code: bool
|
||||
tokenizer_mode: Optional[str]
|
||||
load_format: Optional[str] = None
|
||||
hf_overrides: Optional[str] = None
|
||||
tokenizer_mode: str | None
|
||||
load_format: str | None = None
|
||||
hf_overrides: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -40,9 +40,9 @@ class EPTestSettings:
|
||||
tp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
tokenizer_mode: str | None = None,
|
||||
load_format: str | None = None,
|
||||
hf_overrides: str | None = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
@ -72,9 +72,9 @@ class EPTestSettings:
|
||||
tp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
tokenizer_mode: str | None = None,
|
||||
load_format: str | None = None,
|
||||
hf_overrides: str | None = None,
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class PPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -52,7 +52,7 @@ class PPTestSettings:
|
||||
pp_base: int = 2,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
@ -76,7 +76,7 @@ class PPTestSettings:
|
||||
pp_base: int = 2,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
|
@ -1,16 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"PP_SIZE, MODEL_NAME",
|
||||
|
@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
|
||||
@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class SPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
load_format: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -53,7 +53,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False, True]:
|
||||
@ -84,7 +84,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False, True]:
|
||||
@ -115,7 +115,7 @@ class SPTestSettings:
|
||||
pp_base: int = 1,
|
||||
runner: RunnerOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
load_format: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for fusion_val in [False, True]:
|
||||
|
@ -5,7 +5,7 @@ import json
|
||||
from argparse import ArgumentError
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
from typing import Annotated, Literal
|
||||
|
||||
import pytest
|
||||
|
||||
@ -115,9 +115,9 @@ class NestedConfig:
|
||||
class DummyConfig:
|
||||
regular_bool: bool = True
|
||||
"""Regular bool with default True"""
|
||||
optional_bool: Optional[bool] = None
|
||||
optional_bool: bool | None = None
|
||||
"""Optional bool with default None"""
|
||||
optional_literal: Optional[Literal["x", "y"]] = None
|
||||
optional_literal: Literal["x", "y"] | None = None
|
||||
"""Optional literal with default None"""
|
||||
tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
|
||||
"""Tuple with variable length"""
|
||||
@ -127,7 +127,7 @@ class DummyConfig:
|
||||
"""List with variable length"""
|
||||
list_literal: list[Literal[1, 2]] = field(default_factory=list)
|
||||
"""List with literal choices"""
|
||||
list_union: list[Union[str, type[object]]] = field(default_factory=list)
|
||||
list_union: list[str | type[object]] = field(default_factory=list)
|
||||
"""List with union type"""
|
||||
literal_literal: Literal[Literal[1], Literal[2]] = 1
|
||||
"""Literal of literals with default 1"""
|
||||
@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
("type_hint", "expected"),
[
(Annotated[int, "annotation"], {int}),
(Optional[int], {int, type(None)}),
(Annotated[Optional[int], "annotation"], {int, type(None)}),
(Optional[Annotated[int, "annotation"]], {int, type(None)}),
(int | None, {int, type(None)}),
(Annotated[int | None, "annotation"], {int, type(None)}),
(Annotated[int, "annotation"] | None, {int, type(None)}),
],
ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
)
def test_get_type_hints(type_hint, expected):
assert get_type_hints(type_hint) == expected
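Editor's note (not part of the patch): the parametrize ids above move from "Optional" to "or_None" because the hints are now written with `|`. On Python 3.10+ a PEP 604 union is a `types.UnionType` rather than a `typing.Union`, but it decomposes to the same arguments, which is consistent with the expected sets in the test staying unchanged. A quick check of that assumption:

import types
from typing import Annotated, Optional, get_args, get_origin

assert get_origin(int | None) is types.UnionType
assert get_args(int | None) == (int, type(None)) == get_args(Optional[int])
assert get_args(Annotated[int, "annotation"]) == (int, "annotation")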
@ -3,7 +3,7 @@
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from typing import Callable
|
||||
from collections.abc import Callable
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
|
||||
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
|
||||
)
|
||||
async def test_prompt_logprobs_chat(
|
||||
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
|
||||
client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
|
||||
):
|
||||
params: dict = {
|
||||
"messages": [
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
from typing import Union
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
@ -166,7 +165,7 @@ async def test_function_tool_use(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
stream: bool,
|
||||
tool_choice: Union[str, dict],
|
||||
tool_choice: str | dict,
|
||||
enable_thinking: bool,
|
||||
):
|
||||
if not stream:
|
||||
|
@ -4,7 +4,6 @@
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from typing import Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
@ -38,13 +37,13 @@ class MockModelConfig:
|
||||
trust_remote_code: bool = False
|
||||
tokenizer_mode: str = "auto"
|
||||
max_model_len: int = 100
|
||||
tokenizer_revision: Optional[str] = None
|
||||
tokenizer_revision: str | None = None
|
||||
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
|
||||
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
|
||||
logits_processor_pattern: Optional[str] = None
|
||||
diff_sampling_param: Optional[dict] = None
|
||||
logits_processor_pattern: str | None = None
|
||||
diff_sampling_param: dict | None = None
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: Optional[list[str]] = None
|
||||
allowed_media_domains: list[str] | None = None
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
@ -56,7 +55,7 @@ class MockModelConfig:
|
||||
class MockLoRAResolver(LoRAResolver):
|
||||
async def resolve_lora(
|
||||
self, base_model_name: str, lora_name: str
|
||||
) -> Optional[LoRARequest]:
|
||||
) -> LoRARequest | None:
|
||||
if lora_name == "test-lora":
|
||||
return LoRARequest(
|
||||
lora_name="test-lora",
|
||||
|
@ -1,16 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from typing import Any
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from openai import OpenAI
|
||||
|
||||
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Iterable
|
||||
from typing import Union
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
@ -84,10 +83,10 @@ class StreamingToolReconstructor:
|
||||
def run_tool_extraction(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
streaming: bool = False,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> tuple[Union[str, None], list[ToolCall]]:
|
||||
) -> tuple[str | None, list[ToolCall]]:
|
||||
if streaming:
|
||||
reconstructor = run_tool_extraction_streaming(
|
||||
tool_parser,
|
||||
@ -105,7 +104,7 @@ def run_tool_extraction(
|
||||
def run_tool_extraction_nonstreaming(
|
||||
tool_parser: ToolParser,
|
||||
model_output: str,
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
) -> ExtractedToolCallInformation:
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
return tool_parser.extract_tool_calls(model_output, request)
|
||||
@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
|
||||
def run_tool_extraction_streaming(
|
||||
tool_parser: ToolParser,
|
||||
model_deltas: Iterable[str],
|
||||
request: Union[ChatCompletionRequest, None] = None,
|
||||
request: ChatCompletionRequest | None = None,
|
||||
assert_one_tool_per_delta: bool = True,
|
||||
) -> StreamingToolReconstructor:
|
||||
request = request or ChatCompletionRequest(messages=[], model="test-model")
|
||||
|
@ -4,8 +4,6 @@
|
||||
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
@ -103,14 +101,14 @@ async def test_matryoshka(
|
||||
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
|
||||
|
||||
if model_info.is_matryoshka:
|
||||
valid_dimensions: list[Optional[int]] = [None]
|
||||
valid_dimensions: list[int | None] = [None]
|
||||
if model_info.matryoshka_dimensions is not None:
|
||||
valid_dimensions += model_info.matryoshka_dimensions[:2]
|
||||
|
||||
for dimensions in valid_dimensions:
|
||||
await make_request_and_correctness_test(dimensions)
|
||||
|
||||
invalid_dimensions: list[Optional[int]] = [-1]
|
||||
invalid_dimensions: list[int | None] = [-1]
|
||||
if model_info.matryoshka_dimensions is not None:
|
||||
assert 5 not in model_info.matryoshka_dimensions
|
||||
invalid_dimensions.append(5)
|
||||
|
@ -5,7 +5,6 @@ import multiprocessing
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
|
||||
assert len(manager.processes) == 3
|
||||
|
||||
# Create a result capture for the thread
|
||||
result: dict[str, Optional[Exception]] = {"exception": None}
|
||||
result: dict[str, Exception | None] = {"exception": None}
|
||||
|
||||
def run_with_exception_capture():
|
||||
try:
|
||||
@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
|
||||
assert len(manager.processes) == 3
|
||||
|
||||
# Create a result capture for the thread
|
||||
result: dict[str, Optional[Exception]] = {"exception": None}
|
||||
result: dict[str, Exception | None] = {"exception": None}
|
||||
|
||||
def run_with_exception_capture():
|
||||
try:
|
||||
|
@ -3,7 +3,7 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Mapping
|
||||
from typing import Literal, Optional
|
||||
from typing import Literal
|
||||
|
||||
import pytest
|
||||
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
||||
@ -152,9 +152,9 @@ def audio_url():
|
||||
|
||||
|
||||
def _assert_mm_data_is_image_input(
|
||||
mm_data: Optional[MultiModalDataDict],
|
||||
mm_data: MultiModalDataDict | None,
|
||||
image_count: int,
|
||||
skipped_image_indices: Optional[list] = None,
|
||||
skipped_image_indices: list | None = None,
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(mm_data.keys()) == {"image"}
|
||||
@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: Optional[MultiModalUUIDDict],
|
||||
mm_uuids: MultiModalUUIDDict | None,
|
||||
media_count: int,
|
||||
expected_uuids: list[Optional[str]],
|
||||
expected_uuids: list[str | None],
|
||||
modality: str = "image",
|
||||
) -> None:
|
||||
if len(expected_uuids) > 0:
|
||||
@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]
|
||||
|
||||
|
||||
def _assert_mm_data_inputs(
|
||||
mm_data: Optional[MultiModalDataDict],
|
||||
mm_data: MultiModalDataDict | None,
|
||||
data_count: MultiModalDataCounts,
|
||||
skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int]
|
||||
skipped_media_indices: dict[str, list] | None = None, # modality -> list[int]
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(data_count.keys()) == (set(mm_data.keys()))
|
||||
|
@ -3,7 +3,6 @@
|
||||
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pybase64
|
||||
@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
max_model_len: int = 100
|
||||
encoder_config: Optional[dict] = None
|
||||
encoder_config: dict | None = None
|
||||
|
||||
|
||||
class MockTokenizerResult:
|
||||
|
@ -12,7 +12,6 @@ import json
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from typing import Optional, Union
|
||||
|
||||
import aiohttp
|
||||
import numpy as np
|
||||
@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
|
||||
INVALID = -9999999
|
||||
|
||||
|
||||
def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
|
||||
def download_and_cache_file(url: str, filename: str | None = None) -> str:
|
||||
"""Download and cache a file from a URL."""
|
||||
if filename is None:
|
||||
filename = os.path.join("/tmp", url.split("/")[-1])
|
||||
@ -81,9 +80,9 @@ async def call_vllm_api(
|
||||
prompt: str,
|
||||
temperature: float,
|
||||
max_tokens: int,
|
||||
stop: Optional[list[str]] = None,
|
||||
url: Optional[str] = None,
|
||||
seed: Optional[int] = None,
|
||||
stop: list[str] | None = None,
|
||||
url: str | None = None,
|
||||
seed: int | None = None,
|
||||
) -> str:
|
||||
"""Call vLLM's OpenAI-compatible completions endpoint."""
|
||||
data = {
|
||||
@ -112,8 +111,8 @@ def evaluate_gsm8k(
|
||||
host: str = "http://127.0.0.1",
|
||||
port: int = 8000,
|
||||
temperature: float = 0.0,
|
||||
seed: Optional[int] = 42,
|
||||
) -> dict[str, Union[float, int]]:
|
||||
seed: int | None = 42,
|
||||
) -> dict[str, float | int]:
|
||||
"""
|
||||
Evaluate GSM8K accuracy using vLLM serve endpoint.
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -27,8 +26,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -50,7 +49,7 @@ def ref_masked_attention(
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
attn_mask: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
|
||||
if attn_mask is not None:
|
||||
@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
|
||||
block_tables: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_slopes: Optional[torch.Tensor],
|
||||
alibi_slopes: torch.Tensor | None,
|
||||
) -> None:
|
||||
num_query_heads = query.shape[1]
|
||||
num_kv_heads = value_cache.shape[1]
|
||||
@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_bias: Optional[list[torch.Tensor]],
|
||||
alibi_bias: list[torch.Tensor] | None,
|
||||
dtype: torch.dtype,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(cu_seq_lens) - 1
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -85,7 +84,7 @@ def test_cascade(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
fa_version: int,
|
||||
) -> None:
|
||||
|
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -17,7 +16,7 @@ def cal_diff(
|
||||
y: torch.Tensor,
|
||||
name: str,
|
||||
use_fp8: bool = False,
|
||||
diff_threshold: Optional[float] = None,
|
||||
diff_threshold: float | None = None,
|
||||
) -> None:
|
||||
x, y = x.double(), y.double()
|
||||
cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -34,8 +33,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import pytest
|
||||
@ -26,8 +25,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
soft_cap: float | None,
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
sliding_window: Optional[int],
|
||||
soft_cap: float | None,
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
) -> None:
|
||||
pytest.skip("TODO: fix the accuracy issue")
|
||||
torch.set_default_device("cuda")
|
||||
@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
) -> None:
|
||||
# test doesn't work for num_heads = (16,16)
|
||||
torch.set_default_device("cuda")
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import pytest
|
||||
@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_trtllm_decode_with_baseline(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_lens: tuple[int, int],
|
||||
num_heads: tuple[int, int],
|
||||
@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
kv_layout: str,
|
||||
block_size: int,
|
||||
window_left: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_trtllm_prefill_with_baseline(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_lens: tuple[int, int],
|
||||
num_heads: tuple[int, int],
|
||||
@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
|
||||
kv_layout: str,
|
||||
block_size: int,
|
||||
window_left: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -20,7 +19,7 @@ def merge_attn_states_torch(
|
||||
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
|
||||
output_lse: torch.Tensor | None = None, # [NUM_HEADS, NUM_TOKENS]
|
||||
):
|
||||
p_lse = prefix_lse
|
||||
s_lse = suffix_lse
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -32,8 +31,8 @@ def ref_paged_attn(
|
||||
kv_lens: list[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
sliding_window: int | None = None,
|
||||
soft_cap: float | None = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
@ -98,12 +97,12 @@ def test_triton_unified_attn(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
q_dtype: torch.dtype | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -31,13 +30,13 @@ EPS = 1e-6
|
||||
## Helpers
|
||||
|
||||
|
||||
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
|
||||
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
|
||||
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
||||
|
||||
|
||||
def ref_rms_norm(
|
||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
if residual is not None:
|
||||
residual = residual.clone()
|
||||
out, residual = rms_norm_layer.forward_native(x, residual)
|
||||
@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
if scale_ub is not None:
|
||||
assert quant_dtype == torch.float8_e4m3fn
|
||||
|
||||
@ -76,9 +75,9 @@ def ref_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
return ref_dynamic_per_token_quant(
|
||||
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
||||
)
|
||||
@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
|
||||
weight: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
if residual is not None:
|
||||
residual = residual.clone()
|
||||
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
||||
@ -103,9 +102,9 @@ def ops_impl(
|
||||
weight: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
residual: Optional[torch.Tensor],
|
||||
scale_ub: Optional[torch.Tensor],
|
||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
residual: torch.Tensor | None,
|
||||
scale_ub: torch.Tensor | None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
||||
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -68,7 +68,7 @@ def test_rotary_embedding(
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
rotary_dim: int | None,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
|
@ -4,8 +4,6 @@
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
|
||||
rot,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: Optional[torch.Tensor] = None,
|
||||
key: torch.Tensor | None = None,
|
||||
):
|
||||
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -19,11 +18,11 @@ from vllm.platforms import current_platform
|
||||
def causal_conv1d_ref(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
initial_states: Optional[torch.Tensor] = None,
|
||||
bias: torch.Tensor | None = None,
|
||||
initial_states: torch.Tensor | None = None,
|
||||
return_final_states: bool = False,
|
||||
final_states_out: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
final_states_out: torch.Tensor | None = None,
|
||||
activation: str | None = "silu",
|
||||
):
|
||||
"""
|
||||
x: (batch, dim, seqlen)
|
||||
@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
|
||||
def causal_conv1d_opcheck_fn(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
cu_seq_len: Optional[torch.Tensor] = None,
|
||||
cache_indices: Optional[torch.Tensor] = None,
|
||||
has_initial_state: Optional[torch.Tensor] = None,
|
||||
conv_states: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
bias: torch.Tensor | None = None,
|
||||
cu_seq_len: torch.Tensor | None = None,
|
||||
cache_indices: torch.Tensor | None = None,
|
||||
has_initial_state: torch.Tensor | None = None,
|
||||
conv_states: torch.Tensor | None = None,
|
||||
activation: str | None = "silu",
|
||||
pad_slot_id: int = PAD_SLOT_ID,
|
||||
):
|
||||
"""
|
||||
|
@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Any, Optional, Union
from typing import Any

import torch

@ -35,7 +35,7 @@ from .mk_objects import (
from .parallel_utils import ProcessGroupInfo


def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
if t is None:
return f"{name} : None"
else:

@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:

@dataclass
class Config:
Ms: Union[list[int], int]
Ms: list[int] | int
K: int
N: int
E: int
topks: Union[list[int], int]
topks: list[int] | int
dtype: torch.dtype
quant_config: Optional[TestMoEQuantConfig]
quant_config: TestMoEQuantConfig | None

prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute

fused_moe_chunk_size: Optional[int]
fused_moe_chunk_size: int | None
world_size: int

torch_trace_dir_path: Optional[str] = None
torch_trace_dir_path: str | None = None

def __post_init__(self):
if self.quant_config is None:

@ -93,7 +93,7 @@ class Config:
return self.Ms

@property
def quant_dtype(self) -> Union[torch.dtype, str, None]:
def quant_dtype(self) -> torch.dtype | str | None:
assert self.quant_config is not None
return self.quant_config.quant_dtype

@ -112,7 +112,7 @@ class Config:
return self.quant_config.per_out_ch_quant

@property
def quant_block_shape(self) -> Optional[list[int]]:
def quant_block_shape(self) -> list[int] | None:
assert self.quant_config is not None
return self.quant_config.block_shape

@ -209,7 +209,7 @@ class Config:
info = prepare_finalize_info(self.prepare_finalize_type)
return info.backend

def is_valid(self) -> tuple[bool, Optional[str]]:
def is_valid(self) -> tuple[bool, str | None]:
# Check prepare-finalize and fused-experts compatibility
if self.is_batched_prepare_finalize():
if not self.is_batched_fused_experts():

@ -280,10 +280,10 @@ class Config:
class WeightTensors:
w1: torch.Tensor
w2: torch.Tensor
w1_scale: Optional[torch.Tensor]
w2_scale: Optional[torch.Tensor]
w1_gs: Optional[torch.Tensor] = None
w2_gs: Optional[torch.Tensor] = None
w1_scale: torch.Tensor | None
w2_scale: torch.Tensor | None
w1_gs: torch.Tensor | None = None
w2_gs: torch.Tensor | None = None

def describe(self):
s = ""

@ -351,11 +351,11 @@ class WeightTensors:
@dataclass
class RankTensors:
hidden_states: torch.Tensor
hidden_states_scale: Optional[torch.Tensor]
hidden_states_scale: torch.Tensor | None

topk_weights: torch.Tensor
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor]
expert_map: torch.Tensor | None

def describe(self):
s = ""

@ -370,7 +370,7 @@ class RankTensors:
@staticmethod
def make_hidden_states(
config: Config,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""
Return hidden_states
"""
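The dataclass hunks above apply the same rewrite to field annotations. A minimal sketch with a hypothetical ToyConfig (not a class from this diff), showing that switching to `X | None` changes only the annotation; a field still needs an explicit `= None` default to be omittable at construction time, as with torch_trace_dir_path above.

from dataclasses import dataclass

import torch


@dataclass
class ToyConfig:
    Ms: list[int] | int                      # list of sizes or a single size
    dtype: torch.dtype
    quant_block_shape: list[int] | None      # may be None, but must still be passed
    torch_trace_dir_path: str | None = None  # optional with an explicit default


cfg = ToyConfig(Ms=[16, 32], dtype=torch.bfloat16, quant_block_shape=None)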
@ -4,7 +4,6 @@
import copy
from enum import Enum
from itertools import product
from typing import Optional

import torch
from tqdm import tqdm

@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str):
import pandas as pd

def add_to_results(
config: Config, success: Result, results_df: Optional[pd.DataFrame] = None
config: Config, success: Result, results_df: pd.DataFrame | None = None
):
config_dict = asdict(config)
config_dict["prepare_finalize_type"] = config_dict[

@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str):
product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
)

results_df: Optional[pd.DataFrame] = None
results_df: pd.DataFrame | None = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations
):
@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import Optional, Union

import torch

@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe

@dataclass
class TestMoEQuantConfig:
quant_dtype: Union[torch.dtype, str, None]
quant_dtype: torch.dtype | str | None
per_out_ch_quant: bool
per_act_token_quant: bool
block_shape: Optional[list[int]]
block_shape: list[int] | None


@dataclass
class PrepareFinalizeInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
backend: Optional[str]
backend: str | None
supports_apply_weight_on_input: bool = True


@dataclass
class ExpertInfo:
activation_format: mk.FusedMoEActivationFormat
supported_dtypes: list[Union[torch.dtype, str]]
supported_dtypes: list[torch.dtype | str]
blocked_quantization_support: bool
supports_chunking: bool
supports_expert_map: bool

@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []

standard_format = mk.FusedMoEActivationFormat.Standard
batched_format = mk.FusedMoEActivationFormat.BatchedExperts
common_float_types: list[Union[torch.dtype, str]] = [
common_float_types: list[torch.dtype | str] = [
torch.float8_e4m3fn,
torch.bfloat16,
torch.float16,

@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn]
def register_prepare_and_finalize(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
backend: Optional[str],
backend: str | None,
force_multigpu: bool = False,
supports_apply_weight_on_input: bool = True,
):

@ -121,7 +120,7 @@ def register_prepare_and_finalize(
def register_experts(
kind,
activation_format: mk.FusedMoEActivationFormat,
supported_dtypes: list[Union[torch.dtype, str]],
supported_dtypes: list[torch.dtype | str],
blocked_quantization_support: bool,
supports_chunking: bool,
supports_expert_map: bool,

@ -340,7 +339,7 @@ if cutlass_fp4_supported():
supports_expert_map=False,
)

MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
None,
# per-channel / per-column weights and per-tensor activations
TestMoEQuantConfig(

@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():

def make_prepare_finalize(
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
backend: Optional[str],
backend: str | None,
moe: FusedMoEConfig,
quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
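In the hunks above, the new union syntax also nests directly inside built-in generics, e.g. list[torch.dtype | str] and list[TestMoEQuantConfig | None]. A small sketch under that assumption; SUPPORTED_TYPES, describe, and the "fp4" entry are illustrative names, not part of this diff. On Python 3.10+ the same `|` unions are also accepted by isinstance().

import torch

# Union members inside a built-in generic container.
SUPPORTED_TYPES: list[torch.dtype | str] = [torch.float8_e4m3fn, torch.bfloat16, "fp4"]


def describe(entry: torch.dtype | str | None) -> str:
    # On 3.10+, torch.dtype | str evaluates to a types.UnionType,
    # which isinstance() accepts directly.
    if isinstance(entry, torch.dtype | str):
        return str(entry)
    return "unquantized"


print(describe(torch.bfloat16))  # torch.bfloat16
print(describe(None))            # unquantized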
@ -3,11 +3,12 @@
import dataclasses
import os
import traceback
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Concatenate

import torch
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import init_distributed_environment, initialize_model_parallel

@ -58,9 +59,9 @@ def _worker_parallel_launch(
world_local_size: int,
node_rank: int,
init_method: str,
worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None],
vllm_config: Optional[VllmConfig],
env_dict: Optional[dict],
worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None],
vllm_config: VllmConfig | None,
env_dict: dict | None,
*args: P.args,
**kwargs: P.kwargs,
) -> None:
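The hunk above also swaps typing.Callable for collections.abc.Callable and imports Concatenate from typing (available there since Python 3.10), while leaving ParamSpec in typing_extensions. A minimal sketch of the same Callable[Concatenate[...], None] pattern; run_worker and toy_worker are hypothetical, and ParamSpec is taken from typing here (3.10+) to keep the example self-contained.

from collections.abc import Callable
from typing import Concatenate, ParamSpec

P = ParamSpec("P")


def run_worker(
    worker: Callable[Concatenate[int, P], None],
    rank: int,
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    # The worker always receives the rank first, then whatever the caller passed through.
    worker(rank, *args, **kwargs)


def toy_worker(rank: int, message: str) -> None:
    print(f"[rank {rank}] {message}")


run_worker(toy_worker, 0, message="hello")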
@ -2,8 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import copy
from collections.abc import Callable
from itertools import product
from typing import Any, Callable
from typing import Any

import torch
@ -7,12 +7,13 @@ DeepEP test utilities
import dataclasses
import os
import traceback
from typing import Callable, Optional
from collections.abc import Callable
from typing import Concatenate

import torch
from torch.distributed import ProcessGroup
from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from typing_extensions import ParamSpec

from vllm.utils import get_open_port, has_deep_ep

@ -126,8 +127,8 @@ def make_deepep_ht_a2a(
pgi: ProcessGroupInfo,
dp_size: int,
ht_args: DeepEPHTArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep

@ -153,8 +154,8 @@ def make_deepep_ll_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
deepep_ll_args: DeepEPLLArgs,
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
import deep_ep

@ -185,10 +186,10 @@ def make_deepep_a2a(
pg: ProcessGroup,
pgi: ProcessGroupInfo,
dp_size: int,
deepep_ht_args: Optional[DeepEPHTArgs],
deepep_ll_args: Optional[DeepEPLLArgs],
q_dtype: Optional[torch.dtype] = None,
block_shape: Optional[list[int]] = None,
deepep_ht_args: DeepEPHTArgs | None,
deepep_ll_args: DeepEPLLArgs | None,
q_dtype: torch.dtype | None = None,
block_shape: list[int] | None = None,
):
if deepep_ht_args is not None:
assert deepep_ll_args is None
@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
from typing import Optional

import pytest
import torch

@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192
@dataclass
class BatchedMMConfig:
in_dtype: torch.dtype
quant_dtype: Optional[torch.dtype]
quant_dtype: torch.dtype | None
out_dtype: torch.dtype
num_experts: int
max_tokens_per_expert: int

@ -115,7 +114,7 @@ def test_batched_mm(
K: int,
N: int,
dtype: torch.dtype,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
per_act_token_quant: bool,
):
current_platform.seed_everything(7)

@ -242,7 +241,7 @@ def test_fused_moe_batched_experts(
topk: int,
dtype: torch.dtype,
per_act_token_quant: bool,
block_shape: Optional[list[int]],
block_shape: list[int] | None,
input_scales: bool,
):
current_platform.seed_everything(7)
@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels
"""

import dataclasses
from typing import Optional

import pytest
import torch

@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
@dataclasses.dataclass
class TestTensors:
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor] = None
expert_map: torch.Tensor | None = None

def to_device(self, device: str):
self.topk_ids = self.topk_ids.to(device=device)
@ -3,7 +3,6 @@
import copy
import dataclasses
from math import prod
from typing import Optional

import pytest
import torch

@ -85,16 +84,16 @@ class MOETensors:
@dataclasses.dataclass
class MOETensors8Bit(MOETensors):
# quantized
a_q: Optional[torch.Tensor] = None # a -> a_q
w1_q: Optional[torch.Tensor] = None # w1 -> w1_q
w2_q: Optional[torch.Tensor] = None # w2 -> w2_q
a_scale: Optional[torch.Tensor] = None
w1_scale: Optional[torch.Tensor] = None
w2_scale: Optional[torch.Tensor] = None
a_q: torch.Tensor | None = None # a -> a_q
w1_q: torch.Tensor | None = None # w1 -> w1_q
w2_q: torch.Tensor | None = None # w2 -> w2_q
a_scale: torch.Tensor | None = None
w1_scale: torch.Tensor | None = None
w2_scale: torch.Tensor | None = None
# dequantized
a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d
w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d
w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d
a_d: torch.Tensor | None = None # a -> a_q -> a_d
w1_d: torch.Tensor | None = None # w1 -> w1_q -> w1_d
w2_d: torch.Tensor | None = None # w2 -> w2_q -> w2_d

@staticmethod
def make_moe_tensors_8bit(

@ -209,7 +208,7 @@ def run_8_bit(
topk_ids: torch.Tensor,
per_act_token: bool,
per_out_ch: bool,
num_local_experts: Optional[int] = None,
num_local_experts: int | None = None,
) -> torch.Tensor:
assert not any(
[

@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool,
per_out_ch: bool,
monkeypatch,
ep_size: Optional[int] = None,
ep_size: int | None = None,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
@ -7,7 +7,6 @@ fp8 block-quantized case.
"""

import dataclasses
from typing import Optional

import pytest
import torch.distributed

@ -92,13 +91,13 @@ class TestConfig:
block_size: list[int]
# configs for testing low-latency kernels
low_latency: bool
use_fp8_dispatch: Optional[bool] = False
use_fp8_dispatch: bool | None = False


@dataclasses.dataclass
class TestTensors:
rank_tokens: torch.Tensor # all ranks make this many tokens
rank_token_scales: Optional[torch.Tensor]
rank_token_scales: torch.Tensor | None
topk: torch.Tensor
topk_weights: torch.Tensor
config: TestConfig

@ -143,7 +142,7 @@ def make_ll_modular_kernel(
max_tokens_per_rank: int,
dp_size: int,
hidden_size: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:

@ -179,7 +178,7 @@ def make_ht_modular_kernel(
pgi: ProcessGroupInfo,
dp_size: int,
num_local_experts: int,
q_dtype: Optional[torch.dtype],
q_dtype: torch.dtype | None,
test_config: TestConfig,
quant_config: FusedMoEQuantConfig,
) -> FusedMoEModularKernel:

@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl(
test_tensors: TestTensors,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: Optional[torch.Tensor],
w2_scale: Optional[torch.Tensor],
w1_scale: torch.Tensor | None,
w2_scale: torch.Tensor | None,
) -> torch.Tensor:
test_config = test_tensors.config
num_experts = test_config.num_experts
Some files were not shown because too many files have changed in this diff.