From d31f7844f8d4e312f0521a69f7aa27c94103b2db Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 19 Oct 2025 20:20:55 +0800 Subject: [PATCH] [Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169) Signed-off-by: DarkLight1337 --- tests/conftest.py | 2 +- tests/lora/test_add_lora.py | 2 +- .../multimodal/generation/test_common.py | 2 +- .../generation/vlm_utils/model_utils.py | 2 +- .../processing/test_tensor_schema.py | 2 +- tests/utils_/test_async_utils.py | 2 +- ...ollections.py => test_collection_utils.py} | 2 +- tests/utils_/test_func_utils.py | 2 +- tests/utils_/test_hashing.py | 25 +++ tests/utils_/test_mem_utils.py | 63 +++++++ tests/utils_/test_torch_utils.py | 104 +++++++++++ tests/utils_/test_utils.py | 176 ------------------ tools/pre_commit/check_pickle_imports.py | 9 +- vllm/benchmarks/throughput.py | 2 +- vllm/entrypoints/chat_utils.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 4 +- vllm/entrypoints/openai/serving_embedding.py | 4 +- vllm/entrypoints/openai/serving_engine.py | 4 +- vllm/entrypoints/openai/serving_pooling.py | 2 +- vllm/entrypoints/openai/serving_score.py | 2 +- .../tool_parsers/abstract_tool_parser.py | 2 +- vllm/entrypoints/renderer.py | 2 +- vllm/executor/executor_base.py | 2 +- vllm/executor/ray_distributed_executor.py | 2 +- vllm/inputs/parse.py | 2 +- vllm/model_executor/layers/activation.py | 2 +- .../layers/fused_moe/deep_gemm_moe.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- vllm/model_executor/models/deepseek_vl2.py | 2 +- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/interfaces_base.py | 2 +- .../model_executor/models/llava_next_video.py | 2 +- vllm/model_executor/models/minicpmv.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/multimodal/inputs.py | 2 +- vllm/multimodal/parse.py | 2 +- vllm/multimodal/processing.py | 6 +- vllm/multimodal/registry.py | 2 +- vllm/reasoning/abs_reasoning_parsers.py | 2 +- vllm/tracing.py | 2 +- vllm/transformers_utils/processor.py | 2 +- vllm/utils/__init__.py | 6 - vllm/utils/{asyncio.py => async_utils.py} | 0 .../{collections.py => collection_utils.py} | 0 vllm/utils/{functools.py => func_utils.py} | 0 vllm/v1/engine/async_llm.py | 6 +- vllm/v1/engine/core_client.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/tpu_input_batch.py | 2 +- 52 files changed, 246 insertions(+), 237 deletions(-) rename tests/utils_/{test_collections.py => test_collection_utils.py} (93%) create mode 100644 tests/utils_/test_hashing.py create mode 100644 tests/utils_/test_mem_utils.py create mode 100644 tests/utils_/test_torch_utils.py rename vllm/utils/{asyncio.py => async_utils.py} (100%) rename vllm/utils/{collections.py => collection_utils.py} (100%) rename vllm/utils/{functools.py => func_utils.py} (100%) diff --git a/tests/conftest.py b/tests/conftest.py index 5e94a8322e..ec0179b9cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_num_threads logger = init_logger(__name__) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 
d3d5e2c1cf..9a82ab99ea 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import ( from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams -from vllm.utils.asyncio import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators MODEL_PATH = "zai-org/chatglm3-6b" LORA_RANK = 64 diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d94a3d5cf3..44bbc4479c 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -17,7 +17,7 @@ from transformers import ( ) from vllm.platforms import current_platform -from vllm.utils.functools import identity +from vllm.utils.func_utils import identity from ....conftest import ( IMAGE_ASSETS, diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 8329542584..8f0caed4dd 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -25,7 +25,7 @@ from transformers import ( from transformers.video_utils import VideoMetadata from vllm.logprobs import SampleLogprobs -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from .....conftest import HfRunner, ImageAsset, ImageTestAssets from .types import RunnerOutput diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 093898dd4b..c0436e1179 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS diff --git a/tests/utils_/test_async_utils.py b/tests/utils_/test_async_utils.py index 070cfb38fb..03d116bdfd 100644 --- a/tests/utils_/test_async_utils.py +++ b/tests/utils_/test_async_utils.py @@ -5,7 +5,7 @@ from collections.abc import AsyncIterator import pytest -from vllm.utils.asyncio import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators async def _mock_async_iterator(idx: int): diff --git a/tests/utils_/test_collections.py b/tests/utils_/test_collection_utils.py similarity index 93% rename from tests/utils_/test_collections.py rename to tests/utils_/test_collection_utils.py index cb96bf2b0d..19f4a3d1c9 100644 --- a/tests/utils_/test_collections.py +++ b/tests/utils_/test_collection_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -from vllm.utils.collections import swap_dict_values +from vllm.utils.collection_utils import swap_dict_values @pytest.mark.parametrize( diff --git a/tests/utils_/test_func_utils.py b/tests/utils_/test_func_utils.py index e2b5003fd0..9ce1ada095 100644 --- a/tests/utils_/test_func_utils.py +++ 
b/tests/utils_/test_func_utils.py @@ -4,7 +4,7 @@ import pytest -from vllm.utils.functools import deprecate_kwargs, supports_kw +from vllm.utils.func_utils import deprecate_kwargs, supports_kw from ..utils import error_on_warning diff --git a/tests/utils_/test_hashing.py b/tests/utils_/test_hashing.py new file mode 100644 index 0000000000..484627a547 --- /dev/null +++ b/tests/utils_/test_hashing.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import hashlib +import pickle + +import pytest + +from vllm.utils.hashing import sha256 + + +@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])]) +def test_sha256(input: tuple): + digest = sha256(input) + assert digest is not None + assert isinstance(digest, bytes) + assert digest != b"" + + input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) + assert digest == hashlib.sha256(input_bytes).digest() + + # hashing again, returns the same value + assert digest == sha256(input) + + # hashing different input, returns different value + assert digest != sha256(input + (1,)) diff --git a/tests/utils_/test_mem_utils.py b/tests/utils_/test_mem_utils.py new file mode 100644 index 0000000000..4b1058be41 --- /dev/null +++ b/tests/utils_/test_mem_utils.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from vllm_test_utils.monitor import monitor + +from vllm.utils.mem_utils import MemorySnapshot, memory_profiling + +from ..utils import create_new_process_for_each_test + + +@create_new_process_for_each_test() +def test_memory_profiling(): + # Fake out some model loading + inference memory usage to test profiling + # Memory used by other processes will show up as cuda usage outside of torch + from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary + + lib = CudaRTLibrary() + # 512 MiB allocation outside of this instance + handle1 = lib.cudaMalloc(512 * 1024 * 1024) + + baseline_snapshot = MemorySnapshot() + + # load weights + + weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32) + + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB + + def measure_current_non_torch(): + free, total = torch.cuda.mem_get_info() + current_used = total - free + current_torch = torch.cuda.memory_reserved() + current_non_torch = current_used - current_torch + return current_non_torch + + with ( + memory_profiling( + baseline_snapshot=baseline_snapshot, weights_memory=weights_memory + ) as result, + monitor(measure_current_non_torch) as monitored_values, + ): + # make a memory spike, 1 GiB + spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32) + del spike + + # Add some extra non-torch memory 256 MiB (simulate NCCL) + handle2 = lib.cudaMalloc(256 * 1024 * 1024) + + # this is an analytic value, it is exact, + # we only have 256 MiB non-torch memory increase + measured_diff = monitored_values.values[-1] - monitored_values.values[0] + assert measured_diff == 256 * 1024 * 1024 + + # Check that the memory usage is within 5% of the expected values + # 5% tolerance is caused by cuda runtime. 
+ # we cannot control cuda runtime in the granularity of bytes, + # which causes a small error (<10 MiB in practice) + non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa + assert abs(non_torch_ratio - 1) <= 0.05 + assert result.torch_peak_increase == 1024 * 1024 * 1024 + del weights + lib.cudaFree(handle1) + lib.cudaFree(handle2) diff --git a/tests/utils_/test_torch_utils.py b/tests/utils_/test_torch_utils.py new file mode 100644 index 0000000000..4a96627666 --- /dev/null +++ b/tests/utils_/test_torch_utils.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from vllm.utils.torch_utils import ( + common_broadcastable_dtype, + current_stream, + is_lossless_cast, +) + + +@pytest.mark.parametrize( + ("src_dtype", "tgt_dtype", "expected_result"), + [ + # Different precision_levels + (torch.bool, torch.int8, True), + (torch.bool, torch.float16, True), + (torch.bool, torch.complex32, True), + (torch.int64, torch.bool, False), + (torch.int64, torch.float16, True), + (torch.int64, torch.complex32, True), + (torch.float64, torch.bool, False), + (torch.float64, torch.int8, False), + (torch.float64, torch.complex32, True), + (torch.complex128, torch.bool, False), + (torch.complex128, torch.int8, False), + (torch.complex128, torch.float16, False), + # precision_level=0 + (torch.bool, torch.bool, True), + # precision_level=1 + (torch.int8, torch.int16, True), + (torch.int16, torch.int8, False), + (torch.uint8, torch.int8, False), + (torch.int8, torch.uint8, False), + # precision_level=2 + (torch.float16, torch.float32, True), + (torch.float32, torch.float16, False), + (torch.bfloat16, torch.float32, True), + (torch.float32, torch.bfloat16, False), + # precision_level=3 + (torch.complex32, torch.complex64, True), + (torch.complex64, torch.complex32, False), + ], +) +def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result): + assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result + + +@pytest.mark.parametrize( + ("dtypes", "expected_result"), + [ + ([torch.bool], torch.bool), + ([torch.bool, torch.int8], torch.int8), + ([torch.bool, torch.int8, torch.float16], torch.float16), + ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501 + ], +) +def test_common_broadcastable_dtype(dtypes, expected_result): + assert common_broadcastable_dtype(dtypes) == expected_result + + +def test_current_stream_multithread(): + import threading + + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + main_default_stream = torch.cuda.current_stream() + child_stream = torch.cuda.Stream() + + thread_stream_ready = threading.Event() + thread_can_exit = threading.Event() + + def child_thread_func(): + with torch.cuda.stream(child_stream): + thread_stream_ready.set() + thread_can_exit.wait(timeout=10) + + child_thread = threading.Thread(target=child_thread_func) + child_thread.start() + + try: + assert thread_stream_ready.wait(timeout=5), ( + "Child thread failed to enter stream context in time" + ) + + main_current_stream = current_stream() + + assert main_current_stream != child_stream, ( + "Main thread's current_stream was contaminated by child thread" + ) + assert main_current_stream == main_default_stream, ( + "Main thread's current_stream is not the default stream" + ) + + # Notify child thread it can exit + thread_can_exit.set() + + finally: + # Ensure child thread exits properly + child_thread.join(timeout=5) + if 
child_thread.is_alive(): + pytest.fail("Child thread failed to exit properly") diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 72771ed139..9028c925b5 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -2,10 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa -import hashlib import json import os -import pickle import tempfile from pathlib import Path from unittest.mock import patch @@ -14,7 +12,6 @@ import pytest import torch import yaml from transformers import AutoTokenizer -from vllm_test_utils.monitor import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens @@ -24,13 +21,6 @@ from vllm.utils import ( bind_kv_cache, unique_filepath, ) -from vllm.utils.hashing import sha256 -from vllm.utils.torch_utils import ( - common_broadcastable_dtype, - current_stream, - is_lossless_cast, -) -from vllm.utils.mem_utils import MemorySnapshot, memory_profiling from ..utils import create_new_process_for_each_test, flat_product @@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser): assert "-O.mode" in caplog_vllm.text -@create_new_process_for_each_test() -def test_memory_profiling(): - # Fake out some model loading + inference memory usage to test profiling - # Memory used by other processes will show up as cuda usage outside of torch - from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary - - lib = CudaRTLibrary() - # 512 MiB allocation outside of this instance - handle1 = lib.cudaMalloc(512 * 1024 * 1024) - - baseline_snapshot = MemorySnapshot() - - # load weights - - weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32) - - weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB - - def measure_current_non_torch(): - free, total = torch.cuda.mem_get_info() - current_used = total - free - current_torch = torch.cuda.memory_reserved() - current_non_torch = current_used - current_torch - return current_non_torch - - with ( - memory_profiling( - baseline_snapshot=baseline_snapshot, weights_memory=weights_memory - ) as result, - monitor(measure_current_non_torch) as monitored_values, - ): - # make a memory spike, 1 GiB - spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32) - del spike - - # Add some extra non-torch memory 256 MiB (simulate NCCL) - handle2 = lib.cudaMalloc(256 * 1024 * 1024) - - # this is an analytic value, it is exact, - # we only have 256 MiB non-torch memory increase - measured_diff = monitored_values.values[-1] - monitored_values.values[0] - assert measured_diff == 256 * 1024 * 1024 - - # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by cuda runtime. 
- # we cannot control cuda runtime in the granularity of bytes, - # which causes a small error (<10 MiB in practice) - non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa - assert abs(non_torch_ratio - 1) <= 0.05 - assert result.torch_peak_increase == 1024 * 1024 * 1024 - del weights - lib.cudaFree(handle1) - lib.cudaFree(handle2) - - def test_bind_kv_cache(): from vllm.attention import Attention @@ -403,56 +338,6 @@ def test_bind_kv_cache_pp(): assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0] -@pytest.mark.parametrize( - ("src_dtype", "tgt_dtype", "expected_result"), - [ - # Different precision_levels - (torch.bool, torch.int8, True), - (torch.bool, torch.float16, True), - (torch.bool, torch.complex32, True), - (torch.int64, torch.bool, False), - (torch.int64, torch.float16, True), - (torch.int64, torch.complex32, True), - (torch.float64, torch.bool, False), - (torch.float64, torch.int8, False), - (torch.float64, torch.complex32, True), - (torch.complex128, torch.bool, False), - (torch.complex128, torch.int8, False), - (torch.complex128, torch.float16, False), - # precision_level=0 - (torch.bool, torch.bool, True), - # precision_level=1 - (torch.int8, torch.int16, True), - (torch.int16, torch.int8, False), - (torch.uint8, torch.int8, False), - (torch.int8, torch.uint8, False), - # precision_level=2 - (torch.float16, torch.float32, True), - (torch.float32, torch.float16, False), - (torch.bfloat16, torch.float32, True), - (torch.float32, torch.bfloat16, False), - # precision_level=3 - (torch.complex32, torch.complex64, True), - (torch.complex64, torch.complex32, False), - ], -) -def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result): - assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result - - -@pytest.mark.parametrize( - ("dtypes", "expected_result"), - [ - ([torch.bool], torch.bool), - ([torch.bool, torch.int8], torch.int8), - ([torch.bool, torch.int8, torch.float16], torch.float16), - ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501 - ], -) -def test_common_broadcastable_dtype(dtypes, expected_result): - assert common_broadcastable_dtype(dtypes) == expected_result - - def test_model_specification( parser_with_config, cli_config_file, cli_config_file_with_model ): @@ -535,23 +420,6 @@ def test_model_specification( assert args.port == 12312 -@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])]) -def test_sha256(input: tuple): - digest = sha256(input) - assert digest is not None - assert isinstance(digest, bytes) - assert digest != b"" - - input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) - assert digest == hashlib.sha256(input_bytes).digest() - - # hashing again, returns the same value - assert digest == sha256(input) - - # hashing different input, returns different value - assert digest != sha256(input + (1,)) - - def test_convert_ids_list_to_tokens(): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") token_ids = tokenizer.encode("Hello, world!") @@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens(): assert tokens == ["Hello", ",", " world", "!"] -def test_current_stream_multithread(): - import threading - - if not torch.cuda.is_available(): - pytest.skip("CUDA not available") - - main_default_stream = torch.cuda.current_stream() - child_stream = torch.cuda.Stream() - - thread_stream_ready = threading.Event() - thread_can_exit = threading.Event() - - def child_thread_func(): - with torch.cuda.stream(child_stream): - 
thread_stream_ready.set() - thread_can_exit.wait(timeout=10) - - child_thread = threading.Thread(target=child_thread_func) - child_thread.start() - - try: - assert thread_stream_ready.wait(timeout=5), ( - "Child thread failed to enter stream context in time" - ) - - main_current_stream = current_stream() - - assert main_current_stream != child_stream, ( - "Main thread's current_stream was contaminated by child thread" - ) - assert main_current_stream == main_default_stream, ( - "Main thread's current_stream is not the default stream" - ) - - # Notify child thread it can exit - thread_can_exit.set() - - finally: - # Ensure child thread exits properly - child_thread.join(timeout=5) - if child_thread.is_alive(): - pytest.fail("Child thread failed to exit properly") - - def test_load_config_file(tmp_path): # Define the configuration data config_data = { diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py index 6a5ac40d64..211abb463e 100644 --- a/tools/pre_commit/check_pickle_imports.py +++ b/tools/pre_commit/check_pickle_imports.py @@ -23,13 +23,14 @@ ALLOWED_FILES = { "vllm/transformers_utils/config.py", "vllm/model_executor/models/registry.py", "vllm/compilation/caching.py", - "tests/utils_/test_utils.py", - "tests/tokenization/test_cached_tokenizer.py", "vllm/distributed/utils.py", "vllm/distributed/parallel_state.py", "vllm/distributed/device_communicators/all_reduce_utils.py", "vllm/distributed/device_communicators/shm_broadcast.py", "vllm/distributed/device_communicators/shm_object_storage.py", + "vllm/utils/hashing.py", + "tests/utils_/test_hashing.py", + "tests/tokenization/test_cached_tokenizer.py", "benchmarks/kernels/graph_machete_bench.py", "benchmarks/kernels/benchmark_lora.py", "benchmarks/kernels/benchmark_machete.py", @@ -40,10 +41,8 @@ ALLOWED_FILES = { "vllm/executor/mp_distributed_executor.py", "vllm/executor/ray_distributed_executor.py", "vllm/entrypoints/llm.py", - "tests/utils.py", - # pickle and cloudpickle "vllm/utils/__init__.py", - "vllm/utils/hashing.py", + "tests/utils.py", } PICKLE_RE = re.compile( diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 7610b95034..866365ac18 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams -from vllm.utils.asyncio import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators def run_vllm( diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 10472b18d8..881447cb20 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import random_uuid -from vllm.utils.functools import supports_kw +from vllm.utils.func_utils import supports_kw logger = init_logger(__name__) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 30bcb59437..e82db693c9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import ( ) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device -from vllm.utils.collections import 
as_iter, is_list_of +from vllm.utils.collection_utils import as_iter, is_list_of from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.llm_engine import LLMEngine from vllm.v1.sample.logits_processor import LogitsProcessor diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b5b314e15a..32e6b1d96c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import ( truncate_tool_call_ids, validate_request_params, ) -from vllm.utils.collections import as_list +from vllm.utils.collection_utils import as_list logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 5c41990205..44211201d4 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -34,8 +34,8 @@ from vllm.logprobs import Logprob from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils.asyncio import merge_async_iterators -from vllm.utils.collections import as_list +from vllm.utils.async_utils import merge_async_iterators +from vllm.utils.collection_utils import as_list logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 2e3129cbeb..55f58e7757 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -39,8 +39,8 @@ from vllm.outputs import ( RequestOutput, ) from vllm.pooling_params import PoolingParams -from vllm.utils.asyncio import merge_async_iterators -from vllm.utils.collections import chunk_list +from vllm.utils.async_utils import merge_async_iterators +from vllm.utils.collection_utils import chunk_list logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index ffcde8c302..af5a423134 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -91,13 +91,13 @@ from vllm.tracing import ( ) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import random_uuid -from vllm.utils.asyncio import ( +from vllm.utils.async_utils import ( AsyncMicrobatchTokenizer, collect_from_async_generator, make_async, merge_async_iterators, ) -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.v1.engine import EngineCoreRequest logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 102a29fe35..7a27348da3 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.tasks import SupportedTask -from vllm.utils.asyncio import merge_async_iterators +from vllm.utils.async_utils import merge_async_iterators logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index de3d7fbb2c..9cbfc97918 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -37,7 
+37,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils.asyncio import make_async, merge_async_iterators +from vllm.utils.async_utils import make_async, merge_async_iterators logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 34ec9f8110..4733288644 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import import_from_path logger = init_logger(__name__) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 8fbc17e96f..a845528200 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.parse import get_prompt_components, parse_raw_prompts from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils.asyncio import AsyncMicrobatchTokenizer +from vllm.utils.async_utils import AsyncMicrobatchTokenizer @dataclass(frozen=True) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index cfb7950bcc..9de2249f6c 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -17,7 +17,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest from vllm.tasks import SupportedTask -from vllm.utils.asyncio import make_async +from vllm.utils.async_utils import make_async from vllm.v1.outputs import SamplerOutput from vllm.v1.worker.worker_base import WorkerBase diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index bf5947acb3..8e8901807f 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.ray.ray_env import get_env_vars_to_copy from vllm.sequence import ExecuteModelRequest -from vllm.utils.asyncio import make_async +from vllm.utils.async_utils import make_async from vllm.utils.network_utils import ( get_distributed_init_method, get_ip, diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index c84fc098f0..211551be8e 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas from typing_extensions import TypeIs -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from .data import ( EmbedsPrompt, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index f48fad559e..fb11227089 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -17,7 +17,7 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from 
vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils.collections import LazyDict +from vllm.utils.collection_utils import LazyDict logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 71776c654b..69a815a4e3 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import ( get_mk_alignment_for_contiguous_layout, m_grouped_fp8_gemm_nt_contiguous, ) -from vllm.utils.functools import run_once +from vllm.utils.func_utils import run_once logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 0a1800590b..a3cd68948b 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -28,7 +28,7 @@ from vllm.model_executor.parameter import ( RowvLLMParameter, ) from vllm.transformers_utils.config import get_safetensors_params_metadata -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of if TYPE_CHECKING: from vllm.model_executor.layers.quantization import QuantizationMethods diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 191c7a6388..0d5439357f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -57,7 +57,7 @@ from vllm.model_executor.parameter import ( from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.transformers_utils.config import get_safetensors_params_metadata -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of logger = init_logger(__name__) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 56bbaf0da1..3fc8187278 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import ( ) from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 0eef5ab7db..6e046c16b7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.utils.functools import supports_kw +from vllm.utils.func_utils import supports_kw from .interfaces_base import VllmModel, is_pooling_model diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 0c3c262da4..d87a65a470 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -15,7 +15,7 @@ import torch.nn as nn from 
typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger -from vllm.utils.functools import supports_kw +from vllm.utils.func_utils import supports_kw if TYPE_CHECKING: from vllm.config import VllmConfig diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 56acb3ddf1..77c331b018 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -33,7 +33,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index b4a558ad69..147661babc 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -85,7 +85,7 @@ from vllm.multimodal.processing import ( from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils.collections import flatten_2d_lists +from vllm.utils.collection_utils import flatten_2d_lists from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 1718cb2603..6955fc80af 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -79,7 +79,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from .interfaces import ( MultiModalEmbeddings, diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index e3a0fa562e..a05f54191f 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -22,7 +22,7 @@ from typing import ( import numpy as np from typing_extensions import NotRequired, TypeVar, deprecated -from vllm.utils.collections import full_groupby, is_list_of +from vllm.utils.collection_utils import full_groupby, is_list_of from vllm.utils.import_utils import LazyLoader from vllm.utils.jsontree import json_map_leaves diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 8cb402e951..1ae2c7408a 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -19,7 +19,7 @@ import numpy as np import torch from typing_extensions import assert_never -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import LazyLoader from .audio import AudioResampler diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index d4477d8c85..94122c1d4c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens -from vllm.utils.collections import flatten_2d_lists, full_groupby -from 
vllm.utils.functools import get_allowed_kwarg_only_overrides +from vllm.utils.collection_utils import flatten_2d_lists, full_groupby +from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves from .hasher import MultiModalHasher @@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp) def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: """ Convenience function to apply - [`full_groupby`][vllm.utils.collections.full_groupby] + [`full_groupby`][vllm.utils.collection_utils.full_groupby] based on modality. """ return full_groupby(values, key=lambda x: x.modality) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 0ac10bd08b..2e4031bd51 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -9,7 +9,7 @@ import torch.nn as nn from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config -from vllm.utils.collections import ClassRegistry +from vllm.utils.collection_utils import ClassRegistry from .cache import BaseMultiModalProcessorCache from .processing import ( diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index ee890e662e..ebd660ca5a 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any from vllm.entrypoints.tool_server import ToolServer from vllm.logger import init_logger -from vllm.utils.collections import is_list_of +from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import import_from_path if TYPE_CHECKING: diff --git a/vllm/tracing.py b/vllm/tracing.py index 4e55c746c8..01bbebf35c 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -5,7 +5,7 @@ import os from collections.abc import Mapping from vllm.logger import init_logger -from vllm.utils.functools import run_once +from vllm.utils.func_utils import run_once TRACE_HEADERS = ["traceparent", "tracestate"] diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 7f9d554733..98eb9cf335 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar -from vllm.utils.functools import get_allowed_kwarg_only_overrides +from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: from vllm.config import ModelConfig diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index e1504d49a4..9a52e99998 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: return cls -## moved to vllm.utils.profiling (imported at module top) - - # Only relevant for models using ALiBi (e.g, MPT) def check_use_alibi(model_config: ModelConfig) -> bool: cfg = model_config.hf_text_config @@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool: ) -## moved to vllm.utils.hashing - - @cache def _has_module(module_name: str) -> bool: """Return True if *module_name* can be found in the current environment. 
diff --git a/vllm/utils/asyncio.py b/vllm/utils/async_utils.py similarity index 100% rename from vllm/utils/asyncio.py rename to vllm/utils/async_utils.py diff --git a/vllm/utils/collections.py b/vllm/utils/collection_utils.py similarity index 100% rename from vllm/utils/collections.py rename to vllm/utils/collection_utils.py diff --git a/vllm/utils/functools.py b/vllm/utils/func_utils.py similarity index 100% rename from vllm/utils/functools.py rename to vllm/utils/func_utils.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2e8a3150b4..e17cd7beb0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv -from vllm.utils.asyncio import cancel_task_threadsafe -from vllm.utils.collections import as_list -from vllm.utils.functools import deprecate_kwargs +from vllm.utils.async_utils import cancel_task_threadsafe +from vllm.utils.collection_utils import as_list +from vllm.utils.func_utils import deprecate_kwargs from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index c99f672f15..9e99454117 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -23,7 +23,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask -from vllm.utils.asyncio import in_loop +from vllm.utils.async_utils import in_loop from vllm.utils.network_utils import ( close_sockets, get_open_port, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 4a9cbeaea0..476c3edefb 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import length_from_prompt_token_ids_or_embeds -from vllm.utils.collections import swap_dict_values +from vllm.utils.collection_utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import ( diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index f52d92afab..74e8225b2f 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -10,7 +10,7 @@ import torch from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingType from vllm.utils import length_from_prompt_token_ids_or_embeds -from vllm.utils.collections import swap_dict_values +from vllm.utils.collection_utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.worker.block_table import MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState
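
Note for downstream callers (illustrative only, not part of the patch itself): the three renamed modules are moved with "similarity index 100%", i.e. their contents are unchanged, so updating an import is a pure path substitution. A minimal before/after sketch using helpers that already appear in this diff:

    # Before this patch
    from vllm.utils.asyncio import merge_async_iterators, make_async
    from vllm.utils.collections import as_list, is_list_of, swap_dict_values
    from vllm.utils.functools import deprecate_kwargs, run_once, supports_kw

    # After this patch (module names no longer shadow the stdlib)
    from vllm.utils.async_utils import merge_async_iterators, make_async
    from vllm.utils.collection_utils import as_list, is_list_of, swap_dict_values
    from vllm.utils.func_utils import deprecate_kwargs, run_once, supports_kw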