[Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

Authored by Cyrus Leung on 2025-10-19 20:20:55 +08:00; committed by GitHub
commit d31f7844f8 (parent 7a6c8c3fa1)
52 changed files with 246 additions and 237 deletions
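
The rename pattern behind all of the import changes below: each vllm.utils submodule whose name shadowed a Python standard-library module (collections, asyncio, functools) gains a *_utils suffix. A minimal before/after sketch, using only import paths that appear in the hunks of this diff:

# old module (clashes with stdlib name)  ->  new module
# vllm.utils.collections                 ->  vllm.utils.collection_utils
# vllm.utils.asyncio                     ->  vllm.utils.async_utils
# vllm.utils.functools                   ->  vllm.utils.func_utils
from vllm.utils.collection_utils import is_list_of
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.func_utils import supports_kw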

@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads
logger = init_logger(__name__)

@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64

@ -17,7 +17,7 @@ from transformers import (
)
from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity
from ....conftest import (
IMAGE_ASSETS,

@ -25,7 +25,7 @@ from transformers import (
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput

@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS

@ -5,7 +5,7 @@ from collections.abc import AsyncIterator
import pytest
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
async def _mock_async_iterator(idx: int):

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
@pytest.mark.parametrize(

@ -4,7 +4,7 @@
import pytest
-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw
from ..utils import error_on_warning

@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import pickle
import pytest
from vllm.utils.hashing import sha256
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
assert isinstance(digest, bytes)
assert digest != b""
input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
assert digest == hashlib.sha256(input_bytes).digest()
# hashing again, returns the same value
assert digest == sha256(input)
# hashing different input, returns different value
assert digest != sha256(input + (1,))
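
The assertions above fully pin down the hashing behaviour being tested: the object is pickled with the highest protocol and the resulting bytes are hashed with hashlib. A minimal sketch of a helper that satisfies this test (the actual implementation lives in vllm/utils/hashing.py and may differ in details such as the accepted hash factory):

import hashlib
import pickle

def sha256(obj) -> bytes:
    # Serialize the object deterministically, then hash the raw bytes.
    serialized = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(serialized).digest()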

@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm_test_utils.monitor import monitor
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from ..utils import create_new_process_for_each_test
@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
baseline_snapshot = MemorySnapshot()
# load weights
weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
current_used = total - free
current_torch = torch.cuda.memory_reserved()
current_non_torch = current_used - current_torch
return current_non_torch
with (
memory_profiling(
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
) as result,
monitor(measure_current_non_torch) as monitored_values,
):
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
del spike
# Add some extra non-torch memory 256 MiB (simulate NCCL)
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
# this is an analytic value, it is exact,
# we only have 256 MiB non-torch memory increase
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024
# Check that the memory usage is within 5% of the expected values
# 5% tolerance is caused by cuda runtime.
# we cannot control cuda runtime in the granularity of bytes,
# which causes a small error (<10 MiB in practice)
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
lib.cudaFree(handle1)
lib.cudaFree(handle2)
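
For reference, the byte counts asserted above follow directly from the tensor shapes used in the test; a quick sanity check of that arithmetic (illustrative only):

FLOAT32_BYTES = 4
weights_bytes = 128 * 1024 * 1024 * FLOAT32_BYTES  # the float32 weights tensor
spike_bytes = 256 * 1024 * 1024 * FLOAT32_BYTES    # the float32 spike tensor
non_torch_bytes = 256 * 1024 * 1024                # cudaMalloc made outside of torch
assert weights_bytes == 512 * 1024 * 1024          # 512 MiB, passed as weights_memory
assert spike_bytes == 1024 * 1024 * 1024           # 1 GiB, the expected torch_peak_increase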

@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.utils.torch_utils import (
common_broadcastable_dtype,
current_stream,
is_lossless_cast,
)
@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
# Different precision_levels
(torch.bool, torch.int8, True),
(torch.bool, torch.float16, True),
(torch.bool, torch.complex32, True),
(torch.int64, torch.bool, False),
(torch.int64, torch.float16, True),
(torch.int64, torch.complex32, True),
(torch.float64, torch.bool, False),
(torch.float64, torch.int8, False),
(torch.float64, torch.complex32, True),
(torch.complex128, torch.bool, False),
(torch.complex128, torch.int8, False),
(torch.complex128, torch.float16, False),
# precision_level=0
(torch.bool, torch.bool, True),
# precision_level=1
(torch.int8, torch.int16, True),
(torch.int16, torch.int8, False),
(torch.uint8, torch.int8, False),
(torch.int8, torch.uint8, False),
# precision_level=2
(torch.float16, torch.float32, True),
(torch.float32, torch.float16, False),
(torch.bfloat16, torch.float32, True),
(torch.float32, torch.bfloat16, False),
# precision_level=3
(torch.complex32, torch.complex64, True),
(torch.complex64, torch.complex32, False),
],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
([torch.bool], torch.bool),
([torch.bool, torch.int8], torch.int8),
([torch.bool, torch.int8, torch.float16], torch.float16),
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result
def test_current_stream_multithread():
import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()
thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()
def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)
child_thread = threading.Thread(target=child_thread_func)
child_thread.start()
try:
assert thread_stream_ready.wait(timeout=5), (
"Child thread failed to enter stream context in time"
)
main_current_stream = current_stream()
assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)
# Notify child thread it can exit
thread_can_exit.set()
finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")
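
A short usage sketch of the two dtype helpers exercised above, restricted to behaviour the parametrised cases already establish (not an exhaustive description of the API):

import torch

from vllm.utils.torch_utils import common_broadcastable_dtype, is_lossless_cast

assert is_lossless_cast(torch.int8, torch.int16)           # widening int cast is lossless
assert not is_lossless_cast(torch.float32, torch.float16)  # narrowing float cast is not
# A dtype that every input dtype can be cast to without loss (per the cases above):
assert common_broadcastable_dtype([torch.bool, torch.int8, torch.float16]) == torch.float16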

@ -2,10 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import hashlib
import json
import os
import pickle
import tempfile
from pathlib import Path
from unittest.mock import patch
@ -14,7 +12,6 @@ import pytest
import torch
import yaml
from transformers import AutoTokenizer
from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
@ -24,13 +21,6 @@ from vllm.utils import (
bind_kv_cache,
unique_filepath,
)
from vllm.utils.hashing import sha256
from vllm.utils.torch_utils import (
common_broadcastable_dtype,
current_stream,
is_lossless_cast,
)
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from ..utils import create_new_process_for_each_test, flat_product
@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
assert "-O.mode" in caplog_vllm.text
@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
baseline_snapshot = MemorySnapshot()
# load weights
weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
current_used = total - free
current_torch = torch.cuda.memory_reserved()
current_non_torch = current_used - current_torch
return current_non_torch
with (
memory_profiling(
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
) as result,
monitor(measure_current_non_torch) as monitored_values,
):
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
del spike
# Add some extra non-torch memory 256 MiB (simulate NCCL)
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
# this is an analytic value, it is exact,
# we only have 256 MiB non-torch memory increase
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024
# Check that the memory usage is within 5% of the expected values
# 5% tolerance is caused by cuda runtime.
# we cannot control cuda runtime in the granularity of bytes,
# which causes a small error (<10 MiB in practice)
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
lib.cudaFree(handle1)
lib.cudaFree(handle2)
def test_bind_kv_cache():
from vllm.attention import Attention
@ -403,56 +338,6 @@ def test_bind_kv_cache_pp():
assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
# Different precision_levels
(torch.bool, torch.int8, True),
(torch.bool, torch.float16, True),
(torch.bool, torch.complex32, True),
(torch.int64, torch.bool, False),
(torch.int64, torch.float16, True),
(torch.int64, torch.complex32, True),
(torch.float64, torch.bool, False),
(torch.float64, torch.int8, False),
(torch.float64, torch.complex32, True),
(torch.complex128, torch.bool, False),
(torch.complex128, torch.int8, False),
(torch.complex128, torch.float16, False),
# precision_level=0
(torch.bool, torch.bool, True),
# precision_level=1
(torch.int8, torch.int16, True),
(torch.int16, torch.int8, False),
(torch.uint8, torch.int8, False),
(torch.int8, torch.uint8, False),
# precision_level=2
(torch.float16, torch.float32, True),
(torch.float32, torch.float16, False),
(torch.bfloat16, torch.float32, True),
(torch.float32, torch.bfloat16, False),
# precision_level=3
(torch.complex32, torch.complex64, True),
(torch.complex64, torch.complex32, False),
],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
([torch.bool], torch.bool),
([torch.bool, torch.int8], torch.int8),
([torch.bool, torch.int8, torch.float16], torch.float16),
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result
def test_model_specification(
parser_with_config, cli_config_file, cli_config_file_with_model
):
@ -535,23 +420,6 @@ def test_model_specification(
assert args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
assert isinstance(digest, bytes)
assert digest != b""
input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
assert digest == hashlib.sha256(input_bytes).digest()
# hashing again, returns the same value
assert digest == sha256(input)
# hashing different input, returns different value
assert digest != sha256(input + (1,))
def test_convert_ids_list_to_tokens():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!")
@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens():
assert tokens == ["Hello", ",", " world", "!"]
def test_current_stream_multithread():
import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()
thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()
def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)
child_thread = threading.Thread(target=child_thread_func)
child_thread.start()
try:
assert thread_stream_ready.wait(timeout=5), (
"Child thread failed to enter stream context in time"
)
main_current_stream = current_stream()
assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)
# Notify child thread it can exit
thread_can_exit.set()
finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")
def test_load_config_file(tmp_path):
# Define the configuration data
config_data = {

@ -23,13 +23,14 @@ ALLOWED_FILES = {
"vllm/transformers_utils/config.py",
"vllm/model_executor/models/registry.py",
"vllm/compilation/caching.py",
"tests/utils_/test_utils.py",
"tests/tokenization/test_cached_tokenizer.py",
"vllm/distributed/utils.py",
"vllm/distributed/parallel_state.py",
"vllm/distributed/device_communicators/all_reduce_utils.py",
"vllm/distributed/device_communicators/shm_broadcast.py",
"vllm/distributed/device_communicators/shm_object_storage.py",
"vllm/utils/hashing.py",
"tests/utils_/test_hashing.py",
"tests/tokenization/test_cached_tokenizer.py",
"benchmarks/kernels/graph_machete_bench.py",
"benchmarks/kernels/benchmark_lora.py",
"benchmarks/kernels/benchmark_machete.py",
@ -40,10 +41,8 @@ ALLOWED_FILES = {
"vllm/executor/mp_distributed_executor.py",
"vllm/executor/ray_distributed_executor.py",
"vllm/entrypoints/llm.py",
"tests/utils.py",
# pickle and cloudpickle
"vllm/utils/__init__.py",
"vllm/utils/hashing.py",
"tests/utils.py",
}
PICKLE_RE = re.compile(

@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
def run_vllm(

@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
logger = init_logger(__name__)

@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import (
)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device
-from vllm.utils.collections import as_iter, is_list_of
+from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor

@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import (
truncate_tool_call_ids,
validate_request_params,
)
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list
logger = init_logger(__name__)

@ -34,8 +34,8 @@ from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import as_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import as_list
logger = init_logger(__name__)

@ -39,8 +39,8 @@ from vllm.outputs import (
RequestOutput,
)
from vllm.pooling_params import PoolingParams
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import chunk_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import chunk_list
logger = init_logger(__name__)

@ -91,13 +91,13 @@ from vllm.tracing import (
)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.asyncio import (
+from vllm.utils.async_utils import (
AsyncMicrobatchTokenizer,
collect_from_async_generator,
make_async,
merge_async_iterators,
)
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.v1.engine import EngineCoreRequest
logger = init_logger(__name__)

@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
from vllm.logger import init_logger
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
logger = init_logger(__name__)

@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils.asyncio import make_async, merge_async_iterators
+from vllm.utils.async_utils import make_async, merge_async_iterators
logger = init_logger(__name__)

@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path
logger = init_logger(__name__)

@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer
@dataclass(frozen=True)

@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.v1.outputs import SamplerOutput
from vllm.v1.worker.worker_base import WorkerBase

@ -19,7 +19,7 @@ from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.ray.ray_env import get_env_vars_to_copy
from vllm.sequence import ExecuteModelRequest
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.utils.network_utils import (
get_distributed_init_method,
get_ip,

@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas
from typing_extensions import TypeIs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from .data import (
EmbedsPrompt,

@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
-from vllm.utils.collections import LazyDict
+from vllm.utils.collection_utils import LazyDict
logger = init_logger(__name__)

@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import (
get_mk_alignment_for_contiguous_layout,
m_grouped_fp8_gemm_nt_contiguous,
)
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once
logger = init_logger(__name__)

@ -28,7 +28,7 @@ from vllm.model_executor.parameter import (
RowvLLMParameter,
)
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
if TYPE_CHECKING:
from vllm.model_executor.layers.quantization import QuantizationMethods

@ -57,7 +57,7 @@ from vllm.model_executor.parameter import (
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
logger = init_logger(__name__)

@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (
)
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
from .interfaces_base import VllmModel, is_pooling_model

@ -15,7 +15,7 @@ import torch.nn as nn
from typing_extensions import TypeIs, TypeVar
from vllm.logger import init_logger
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw
if TYPE_CHECKING:
from vllm.config import VllmConfig

@ -33,7 +33,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP

@ -85,7 +85,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import flatten_2d_lists
+from vllm.utils.collection_utils import flatten_2d_lists
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@ -79,7 +79,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from .interfaces import (
MultiModalEmbeddings,

@ -22,7 +22,7 @@ from typing import (
import numpy as np
from typing_extensions import NotRequired, TypeVar, deprecated
-from vllm.utils.collections import full_groupby, is_list_of
+from vllm.utils.collection_utils import full_groupby, is_list_of
from vllm.utils.import_utils import LazyLoader
from vllm.utils.jsontree import json_map_leaves

@ -19,7 +19,7 @@ import numpy as np
import torch
from typing_extensions import assert_never
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader
from .audio import AudioResampler

@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
-from vllm.utils.collections import flatten_2d_lists, full_groupby
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
from vllm.utils.jsontree import JSONTree, json_map_leaves
from .hasher import MultiModalHasher
@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
"""
Convenience function to apply
-[`full_groupby`][vllm.utils.collections.full_groupby]
+[`full_groupby`][vllm.utils.collection_utils.full_groupby]
based on modality.
"""
return full_groupby(values, key=lambda x: x.modality)

@ -9,7 +9,7 @@ import torch.nn as nn
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
-from vllm.utils.collections import ClassRegistry
+from vllm.utils.collection_utils import ClassRegistry
from .cache import BaseMultiModalProcessorCache
from .processing import (

@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any
from vllm.entrypoints.tool_server import ToolServer
from vllm.logger import init_logger
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path
if TYPE_CHECKING:

@ -5,7 +5,7 @@ import os
from collections.abc import Mapping
from vllm.logger import init_logger
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once
TRACE_HEADERS = ["traceparent", "tracestate"]

@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
from vllm.config import ModelConfig

@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
return cls
## moved to vllm.utils.profiling (imported at module top)
# Only relevant for models using ALiBi (e.g, MPT)
def check_use_alibi(model_config: ModelConfig) -> bool:
cfg = model_config.hf_text_config
@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
)
## moved to vllm.utils.hashing
@cache
def _has_module(module_name: str) -> bool:
"""Return True if *module_name* can be found in the current environment.

@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, cdiv
-from vllm.utils.asyncio import cancel_task_threadsafe
-from vllm.utils.collections import as_list
-from vllm.utils.functools import deprecate_kwargs
+from vllm.utils.async_utils import cancel_task_threadsafe
+from vllm.utils.collection_utils import as_list
+from vllm.utils.func_utils import deprecate_kwargs
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError

@ -23,7 +23,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import in_loop
+from vllm.utils.async_utils import in_loop
from vllm.utils.network_utils import (
close_sockets,
get_open_port,

@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import (

@ -10,7 +10,7 @@ import torch
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.worker.block_table import MultiGroupBlockTable
from vllm.v1.worker.gpu_input_batch import CachedRequestState