Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 06:03:50 +08:00)
[Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
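The diff below is mechanical: submodules of vllm.utils whose names shadowed stdlib modules were renamed, and the tests that exercised them were split out into tests/utils_/. A minimal sketch of the import changes callers have to make (module and symbol names taken from the hunks below; assumes a vLLM checkout that includes this commit):

# Renamed to avoid clashing with the stdlib modules collections/asyncio/functools:
from vllm.utils.collection_utils import is_list_of        # was vllm.utils.collections
from vllm.utils.async_utils import merge_async_iterators  # was vllm.utils.asyncio
from vllm.utils.func_utils import supports_kw             # was vllm.utils.functools

# Unchanged modules whose tests moved into tests/utils_/:
from vllm.utils.hashing import sha256
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.torch_utils import current_stream, is_lossless_cast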
@@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64
@@ -17,7 +17,7 @@ from transformers import (
)

from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity

from ....conftest import (
    IMAGE_ASSETS,
@@ -25,7 +25,7 @@ from transformers import (
from transformers.video_utils import VideoMetadata

from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput
@@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
@@ -5,7 +5,7 @@ from collections.abc import AsyncIterator

import pytest

-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


async def _mock_async_iterator(idx: int):
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values


@pytest.mark.parametrize(
@@ -4,7 +4,7 @@

import pytest

-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw

from ..utils import error_on_warning

tests/utils_/test_hashing.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import pickle

import pytest

from vllm.utils.hashing import sha256


@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
    digest = sha256(input)
    assert digest is not None
    assert isinstance(digest, bytes)
    assert digest != b""

    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    assert digest == hashlib.sha256(input_bytes).digest()

    # hashing again, returns the same value
    assert digest == sha256(input)

    # hashing different input, returns different value
    assert digest != sha256(input + (1,))
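The new test pins down the behavior of vllm.utils.hashing.sha256: the argument is pickled with the highest protocol and the raw SHA-256 digest of those bytes is returned. A minimal equivalent sketch of that contract (an assumption inferred from the assertions above, not the actual vLLM implementation):

import hashlib
import pickle
from typing import Any


def sha256_of(obj: Any) -> bytes:
    """Hypothetical stand-in mirroring what test_sha256 asserts."""
    data = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(data).digest()


# Matches the identity the test checks against hashlib directly:
payload = (None, bool, [1, 2, 3])
assert sha256_of(payload) == hashlib.sha256(
    pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)
).digest()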
tests/utils_/test_mem_utils.py (new file, 63 lines)
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm_test_utils.monitor import monitor

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

from ..utils import create_new_process_for_each_test


@create_new_process_for_each_test()
def test_memory_profiling():
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    lib = CudaRTLibrary()
    # 512 MiB allocation outside of this instance
    handle1 = lib.cudaMalloc(512 * 1024 * 1024)

    baseline_snapshot = MemorySnapshot()

    # load weights

    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)

    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB

    def measure_current_non_torch():
        free, total = torch.cuda.mem_get_info()
        current_used = total - free
        current_torch = torch.cuda.memory_reserved()
        current_non_torch = current_used - current_torch
        return current_non_torch

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Add some extra non-torch memory 256 MiB (simulate NCCL)
        handle2 = lib.cudaMalloc(256 * 1024 * 1024)

    # this is an analytic value, it is exact,
    # we only have 256 MiB non-torch memory increase
    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
    assert measured_diff == 256 * 1024 * 1024

    # Check that the memory usage is within 5% of the expected values
    # 5% tolerance is caused by cuda runtime.
    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * 1024 * 1024
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)
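For reference, the constants asserted above follow directly from the float32 tensor shapes (4 bytes per element); a quick arithmetic check, not part of the diff:

# 128 * 1024 * 1024 float32 elements -> 512 MiB of weights
assert 128 * 1024 * 1024 * 4 == 512 * 1024 * 1024
# 256 * 1024 * 1024 float32 elements -> the 1 GiB torch_peak_increase spike
assert 256 * 1024 * 1024 * 4 == 1024 * 1024 * 1024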
tests/utils_/test_torch_utils.py (new file, 104 lines)
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.utils.torch_utils import (
    common_broadcastable_dtype,
    current_stream,
    is_lossless_cast,
)


@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Different precision_levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # precision_level=0
        (torch.bool, torch.bool, True),
        # precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result


@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
    ],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
    assert common_broadcastable_dtype(dtypes) == expected_result


def test_current_stream_multithread():
    import threading

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()

    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()

    def child_thread_func():
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)

    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()

    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )

        main_current_stream = current_stream()

        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )

        # Notify child thread it can exit
        thread_can_exit.set()

    finally:
        # Ensure child thread exits properly
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
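The parametrization above encodes an ordering of dtype families (bool < integer < floating point < complex): casting to a higher precision level counts as lossless, casting to a lower one does not, and within the same level the cast must not lose range or signedness (so int16 -> int8 and uint8 <-> int8 are rejected). A small usage sketch built only from cases the test itself asserts (assumes a vLLM checkout providing these helpers):

import torch

from vllm.utils.torch_utils import common_broadcastable_dtype, is_lossless_cast

# Up the bool < int < float < complex ladder: treated as lossless.
assert is_lossless_cast(torch.int64, torch.float16)
# Narrowing within the float family: not lossless.
assert not is_lossless_cast(torch.float32, torch.float16)

# The common broadcastable dtype is one that every input casts to losslessly.
assert common_broadcastable_dtype([torch.bool, torch.int8, torch.float16]) == torch.float16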
@@ -2,10 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa

-import hashlib
import json
import os
-import pickle
import tempfile
from pathlib import Path
from unittest.mock import patch
@@ -14,7 +12,6 @@ import pytest
import torch
import yaml
from transformers import AutoTokenizer
-from vllm_test_utils.monitor import monitor

from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
@@ -24,13 +21,6 @@ from vllm.utils import (
    bind_kv_cache,
    unique_filepath,
)
-from vllm.utils.hashing import sha256
-from vllm.utils.torch_utils import (
-    common_broadcastable_dtype,
-    current_stream,
-    is_lossless_cast,
-)
-from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from ..utils import create_new_process_for_each_test, flat_product

@@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-@create_new_process_for_each_test()
-def test_memory_profiling():
-    # Fake out some model loading + inference memory usage to test profiling
-    # Memory used by other processes will show up as cuda usage outside of torch
-    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-
-    lib = CudaRTLibrary()
-    # 512 MiB allocation outside of this instance
-    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
-
-    baseline_snapshot = MemorySnapshot()
-
-    # load weights
-
-    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
-
-    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
-
-    def measure_current_non_torch():
-        free, total = torch.cuda.mem_get_info()
-        current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
-        current_non_torch = current_used - current_torch
-        return current_non_torch
-
-    with (
-        memory_profiling(
-            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
-        ) as result,
-        monitor(measure_current_non_torch) as monitored_values,
-    ):
-        # make a memory spike, 1 GiB
-        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
-        del spike
-
-        # Add some extra non-torch memory 256 MiB (simulate NCCL)
-        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
-
-    # this is an analytic value, it is exact,
-    # we only have 256 MiB non-torch memory increase
-    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
-    assert measured_diff == 256 * 1024 * 1024
-
-    # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by cuda runtime.
-    # we cannot control cuda runtime in the granularity of bytes,
-    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
-    assert abs(non_torch_ratio - 1) <= 0.05
-    assert result.torch_peak_increase == 1024 * 1024 * 1024
-    del weights
-    lib.cudaFree(handle1)
-    lib.cudaFree(handle2)
-
-
def test_bind_kv_cache():
    from vllm.attention import Attention

@@ -403,56 +338,6 @@ def test_bind_kv_cache_pp():
    assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


-@pytest.mark.parametrize(
-    ("src_dtype", "tgt_dtype", "expected_result"),
-    [
-        # Different precision_levels
-        (torch.bool, torch.int8, True),
-        (torch.bool, torch.float16, True),
-        (torch.bool, torch.complex32, True),
-        (torch.int64, torch.bool, False),
-        (torch.int64, torch.float16, True),
-        (torch.int64, torch.complex32, True),
-        (torch.float64, torch.bool, False),
-        (torch.float64, torch.int8, False),
-        (torch.float64, torch.complex32, True),
-        (torch.complex128, torch.bool, False),
-        (torch.complex128, torch.int8, False),
-        (torch.complex128, torch.float16, False),
-        # precision_level=0
-        (torch.bool, torch.bool, True),
-        # precision_level=1
-        (torch.int8, torch.int16, True),
-        (torch.int16, torch.int8, False),
-        (torch.uint8, torch.int8, False),
-        (torch.int8, torch.uint8, False),
-        # precision_level=2
-        (torch.float16, torch.float32, True),
-        (torch.float32, torch.float16, False),
-        (torch.bfloat16, torch.float32, True),
-        (torch.float32, torch.bfloat16, False),
-        # precision_level=3
-        (torch.complex32, torch.complex64, True),
-        (torch.complex64, torch.complex32, False),
-    ],
-)
-def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
-    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
-
-
-@pytest.mark.parametrize(
-    ("dtypes", "expected_result"),
-    [
-        ([torch.bool], torch.bool),
-        ([torch.bool, torch.int8], torch.int8),
-        ([torch.bool, torch.int8, torch.float16], torch.float16),
-        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
-    ],
-)
-def test_common_broadcastable_dtype(dtypes, expected_result):
-    assert common_broadcastable_dtype(dtypes) == expected_result
-
-
def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
@@ -535,23 +420,6 @@ def test_model_specification(
    assert args.port == 12312


-@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
-def test_sha256(input: tuple):
-    digest = sha256(input)
-    assert digest is not None
-    assert isinstance(digest, bytes)
-    assert digest != b""
-
-    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert digest == hashlib.sha256(input_bytes).digest()
-
-    # hashing again, returns the same value
-    assert digest == sha256(input)
-
-    # hashing different input, returns different value
-    assert digest != sha256(input + (1,))
-
-
def test_convert_ids_list_to_tokens():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    token_ids = tokenizer.encode("Hello, world!")
@@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens():
    assert tokens == ["Hello", ",", " world", "!"]


-def test_current_stream_multithread():
-    import threading
-
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-
-    main_default_stream = torch.cuda.current_stream()
-    child_stream = torch.cuda.Stream()
-
-    thread_stream_ready = threading.Event()
-    thread_can_exit = threading.Event()
-
-    def child_thread_func():
-        with torch.cuda.stream(child_stream):
-            thread_stream_ready.set()
-            thread_can_exit.wait(timeout=10)
-
-    child_thread = threading.Thread(target=child_thread_func)
-    child_thread.start()
-
-    try:
-        assert thread_stream_ready.wait(timeout=5), (
-            "Child thread failed to enter stream context in time"
-        )
-
-        main_current_stream = current_stream()
-
-        assert main_current_stream != child_stream, (
-            "Main thread's current_stream was contaminated by child thread"
-        )
-        assert main_current_stream == main_default_stream, (
-            "Main thread's current_stream is not the default stream"
-        )
-
-        # Notify child thread it can exit
-        thread_can_exit.set()
-
-    finally:
-        # Ensure child thread exits properly
-        child_thread.join(timeout=5)
-        if child_thread.is_alive():
-            pytest.fail("Child thread failed to exit properly")
-
-
def test_load_config_file(tmp_path):
    # Define the configuration data
    config_data = {
@@ -23,13 +23,14 @@ ALLOWED_FILES = {
    "vllm/transformers_utils/config.py",
    "vllm/model_executor/models/registry.py",
    "vllm/compilation/caching.py",
-    "tests/utils_/test_utils.py",
-    "tests/tokenization/test_cached_tokenizer.py",
    "vllm/distributed/utils.py",
    "vllm/distributed/parallel_state.py",
    "vllm/distributed/device_communicators/all_reduce_utils.py",
    "vllm/distributed/device_communicators/shm_broadcast.py",
    "vllm/distributed/device_communicators/shm_object_storage.py",
+    "vllm/utils/hashing.py",
+    "tests/utils_/test_hashing.py",
+    "tests/tokenization/test_cached_tokenizer.py",
    "benchmarks/kernels/graph_machete_bench.py",
    "benchmarks/kernels/benchmark_lora.py",
    "benchmarks/kernels/benchmark_machete.py",
@@ -40,10 +41,8 @@ ALLOWED_FILES = {
    "vllm/executor/mp_distributed_executor.py",
    "vllm/executor/ray_distributed_executor.py",
    "vllm/entrypoints/llm.py",
    "tests/utils.py",
    # pickle and cloudpickle
    "vllm/utils/__init__.py",
    "vllm/utils/hashing.py",
    "tests/utils.py",
}

PICKLE_RE = re.compile(
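The allowlist above feeds vLLM's pickle-usage pre-commit check; the idea is simply that any file outside ALLOWED_FILES must not import pickle or cloudpickle. A minimal sketch of that kind of check (hypothetical helper names and regex, not the actual tool, whose PICKLE_RE is truncated above):

import re
import sys
from pathlib import Path

# Hypothetical stand-ins for the real ALLOWED_FILES / PICKLE_RE shown above.
ALLOWED = {"vllm/utils/hashing.py", "tests/utils.py"}
IMPORT_RE = re.compile(r"^\s*(import|from)\s+(pickle|cloudpickle)\b", re.MULTILINE)


def check(paths: list[str]) -> int:
    bad = [
        p for p in paths
        if p not in ALLOWED and IMPORT_RE.search(Path(p).read_text())
    ]
    for p in bad:
        print(f"{p}: pickle/cloudpickle import outside the allowlist")
    return 1 if bad else 0


if __name__ == "__main__":
    sys.exit(check(sys.argv[1:]))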
@@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


def run_vllm(
@@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

logger = init_logger(__name__)

@@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import (
)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device
-from vllm.utils.collections import as_iter, is_list_of
+from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor
@@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import (
    truncate_tool_call_ids,
    validate_request_params,
)
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list

logger = init_logger(__name__)

@@ -34,8 +34,8 @@ from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import as_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import as_list

logger = init_logger(__name__)

@@ -39,8 +39,8 @@ from vllm.outputs import (
    RequestOutput,
)
from vllm.pooling_params import PoolingParams
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import chunk_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import chunk_list

logger = init_logger(__name__)

@@ -91,13 +91,13 @@ from vllm.tracing import (
)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.asyncio import (
+from vllm.utils.async_utils import (
    AsyncMicrobatchTokenizer,
    collect_from_async_generator,
    make_async,
    merge_async_iterators,
)
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.v1.engine import EngineCoreRequest

logger = init_logger(__name__)
@@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
from vllm.logger import init_logger
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

logger = init_logger(__name__)

@@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils.asyncio import make_async, merge_async_iterators
+from vllm.utils.async_utils import make_async, merge_async_iterators

logger = init_logger(__name__)

@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path

logger = init_logger(__name__)
@@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer


@dataclass(frozen=True)
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.v1.outputs import SamplerOutput
from vllm.v1.worker.worker_base import WorkerBase

@@ -19,7 +19,7 @@ from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.ray.ray_env import get_env_vars_to_copy
from vllm.sequence import ExecuteModelRequest
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.utils.network_utils import (
    get_distributed_init_method,
    get_ip,
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas

from typing_extensions import TypeIs

-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .data import (
    EmbedsPrompt,
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
-from vllm.utils.collections import LazyDict
+from vllm.utils.collection_utils import LazyDict

logger = init_logger(__name__)

@@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import (
    get_mk_alignment_for_contiguous_layout,
    m_grouped_fp8_gemm_nt_contiguous,
)
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once

logger = init_logger(__name__)

@@ -28,7 +28,7 @@ from vllm.model_executor.parameter import (
    RowvLLMParameter,
)
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

if TYPE_CHECKING:
    from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -57,7 +57,7 @@ from vllm.model_executor.parameter import (
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

logger = init_logger(__name__)

@@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (
)
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

from .interfaces_base import VllmModel, is_pooling_model

@@ -15,7 +15,7 @@ import torch.nn as nn
from typing_extensions import TypeIs, TypeVar

from vllm.logger import init_logger
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

if TYPE_CHECKING:
    from vllm.config import VllmConfig
@@ -33,7 +33,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -85,7 +85,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import flatten_2d_lists
+from vllm.utils.collection_utils import flatten_2d_lists
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@@ -79,7 +79,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .interfaces import (
    MultiModalEmbeddings,
@@ -22,7 +22,7 @@ from typing import (
import numpy as np
from typing_extensions import NotRequired, TypeVar, deprecated

-from vllm.utils.collections import full_groupby, is_list_of
+from vllm.utils.collection_utils import full_groupby, is_list_of
from vllm.utils.import_utils import LazyLoader
from vllm.utils.jsontree import json_map_leaves

@@ -19,7 +19,7 @@ import numpy as np
import torch
from typing_extensions import assert_never

-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader

from .audio import AudioResampler
@@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
-from vllm.utils.collections import flatten_2d_lists, full_groupby
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
from vllm.utils.jsontree import JSONTree, json_map_leaves

from .hasher import MultiModalHasher
@@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
    """
    Convenience function to apply
-    [`full_groupby`][vllm.utils.collections.full_groupby]
+    [`full_groupby`][vllm.utils.collection_utils.full_groupby]
    based on modality.
    """
    return full_groupby(values, key=lambda x: x.modality)
@@ -9,7 +9,7 @@ import torch.nn as nn
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
-from vllm.utils.collections import ClassRegistry
+from vllm.utils.collection_utils import ClassRegistry

from .cache import BaseMultiModalProcessorCache
from .processing import (
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any

from vllm.entrypoints.tool_server import ToolServer
from vllm.logger import init_logger
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path

if TYPE_CHECKING:
@@ -5,7 +5,7 @@ import os
from collections.abc import Mapping

from vllm.logger import init_logger
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once

TRACE_HEADERS = ["traceparent", "tracestate"]

@@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar

-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides

if TYPE_CHECKING:
    from vllm.config import ModelConfig
@@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
    return cls


-## moved to vllm.utils.profiling (imported at module top)
-
-
# Only relevant for models using ALiBi (e.g, MPT)
def check_use_alibi(model_config: ModelConfig) -> bool:
    cfg = model_config.hf_text_config
@@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
    )


-## moved to vllm.utils.hashing
-
-
@cache
def _has_module(module_name: str) -> bool:
    """Return True if *module_name* can be found in the current environment.
@@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, cdiv
-from vllm.utils.asyncio import cancel_task_threadsafe
-from vllm.utils.collections import as_list
-from vllm.utils.functools import deprecate_kwargs
+from vllm.utils.async_utils import cancel_task_threadsafe
+from vllm.utils.collection_utils import as_list
+from vllm.utils.func_utils import deprecate_kwargs
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import in_loop
+from vllm.utils.async_utils import in_loop
from vllm.utils.network_utils import (
    close_sockets,
    get_open_port,
@@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import (
@@ -10,7 +10,7 @@ import torch
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.worker.block_table import MultiGroupBlockTable
from vllm.v1.worker.gpu_input_batch import CachedRequestState