Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 06:03:50 +08:00)
[Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
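The diff below is mechanical: submodules of vllm.utils whose names shadowed stdlib modules were renamed, and the tests that exercised them were split out into tests/utils_/. A minimal sketch of the import changes callers have to make (module and symbol names taken from the hunks below; assumes a vLLM checkout that includes this commit):

# Renamed to avoid clashing with the stdlib modules collections/asyncio/functools:
from vllm.utils.collection_utils import is_list_of        # was vllm.utils.collections
from vllm.utils.async_utils import merge_async_iterators  # was vllm.utils.asyncio
from vllm.utils.func_utils import supports_kw             # was vllm.utils.functools

# Unchanged modules whose tests moved into tests/utils_/:
from vllm.utils.hashing import sha256
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.torch_utils import current_stream, is_lossless_cast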
@@ -60,7 +60,7 @@ from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64
@@ -17,7 +17,7 @@ from transformers import (
)

from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity

from ....conftest import (
    IMAGE_ASSETS,
@@ -25,7 +25,7 @@ from transformers import (
from transformers.video_utils import VideoMetadata

from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput
@@ -34,7 +34,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
@@ -5,7 +5,7 @@ from collections.abc import AsyncIterator

import pytest

-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


async def _mock_async_iterator(idx: int):
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values


@pytest.mark.parametrize(
@@ -4,7 +4,7 @@

import pytest

-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw

from ..utils import error_on_warning

tests/utils_/test_hashing.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import pickle

import pytest

from vllm.utils.hashing import sha256


@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
    digest = sha256(input)
    assert digest is not None
    assert isinstance(digest, bytes)
    assert digest != b""

    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    assert digest == hashlib.sha256(input_bytes).digest()

    # hashing again, returns the same value
    assert digest == sha256(input)

    # hashing different input, returns different value
    assert digest != sha256(input + (1,))
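The new test pins down the behavior of vllm.utils.hashing.sha256: the argument is pickled with the highest protocol and the raw SHA-256 digest of those bytes is returned. A minimal equivalent sketch of that contract (an assumption inferred from the assertions above, not the actual vLLM implementation):

import hashlib
import pickle
from typing import Any


def sha256_of(obj: Any) -> bytes:
    """Hypothetical stand-in mirroring what test_sha256 asserts."""
    data = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(data).digest()


# Matches the identity the test checks against hashlib directly:
payload = (None, bool, [1, 2, 3])
assert sha256_of(payload) == hashlib.sha256(
    pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)
).digest()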
tests/utils_/test_mem_utils.py (new file, 63 lines)
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm_test_utils.monitor import monitor

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

from ..utils import create_new_process_for_each_test


@create_new_process_for_each_test()
def test_memory_profiling():
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    lib = CudaRTLibrary()
    # 512 MiB allocation outside of this instance
    handle1 = lib.cudaMalloc(512 * 1024 * 1024)

    baseline_snapshot = MemorySnapshot()

    # load weights

    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)

    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB

    def measure_current_non_torch():
        free, total = torch.cuda.mem_get_info()
        current_used = total - free
        current_torch = torch.cuda.memory_reserved()
        current_non_torch = current_used - current_torch
        return current_non_torch

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Add some extra non-torch memory 256 MiB (simulate NCCL)
        handle2 = lib.cudaMalloc(256 * 1024 * 1024)

    # this is an analytic value, it is exact,
    # we only have 256 MiB non-torch memory increase
    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
    assert measured_diff == 256 * 1024 * 1024

    # Check that the memory usage is within 5% of the expected values
    # 5% tolerance is caused by cuda runtime.
    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * 1024 * 1024
    del weights
    lib.cudaFree(handle1)
    lib.cudaFree(handle2)
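For reference, the constants asserted above follow directly from the float32 tensor shapes (4 bytes per element); a quick arithmetic check, not part of the diff:

# 128 * 1024 * 1024 float32 elements -> 512 MiB of weights
assert 128 * 1024 * 1024 * 4 == 512 * 1024 * 1024
# 256 * 1024 * 1024 float32 elements -> the 1 GiB torch_peak_increase spike
assert 256 * 1024 * 1024 * 4 == 1024 * 1024 * 1024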
tests/utils_/test_torch_utils.py (new file, 104 lines)
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.utils.torch_utils import (
    common_broadcastable_dtype,
    current_stream,
    is_lossless_cast,
)


@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Different precision_levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # precision_level=0
        (torch.bool, torch.bool, True),
        # precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result


@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
    ],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
    assert common_broadcastable_dtype(dtypes) == expected_result


def test_current_stream_multithread():
    import threading

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()

    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()

    def child_thread_func():
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)

    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()

    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )

        main_current_stream = current_stream()

        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )

        # Notify child thread it can exit
        thread_can_exit.set()

    finally:
        # Ensure child thread exits properly
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
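The parametrization above encodes an ordering of dtype families (bool < integer < floating point < complex): casting to a higher precision level counts as lossless, casting to a lower one does not, and within the same level the cast must not lose range or signedness (so int16 -> int8 and uint8 <-> int8 are rejected). A small usage sketch built only from cases the test itself asserts (assumes a vLLM checkout providing these helpers):

import torch

from vllm.utils.torch_utils import common_broadcastable_dtype, is_lossless_cast

# Up the bool < int < float < complex ladder: treated as lossless.
assert is_lossless_cast(torch.int64, torch.float16)
# Narrowing within the float family: not lossless.
assert not is_lossless_cast(torch.float32, torch.float16)

# The common broadcastable dtype is one that every input casts to losslessly.
assert common_broadcastable_dtype([torch.bool, torch.int8, torch.float16]) == torch.float16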
@@ -2,10 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa

-import hashlib
import json
import os
-import pickle
import tempfile
from pathlib import Path
from unittest.mock import patch
@@ -14,7 +12,6 @@ import pytest
import torch
import yaml
from transformers import AutoTokenizer
-from vllm_test_utils.monitor import monitor

from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
@@ -24,13 +21,6 @@ from vllm.utils import (
    bind_kv_cache,
    unique_filepath,
)
-from vllm.utils.hashing import sha256
-from vllm.utils.torch_utils import (
-    common_broadcastable_dtype,
-    current_stream,
-    is_lossless_cast,
-)
-from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from ..utils import create_new_process_for_each_test, flat_product

@@ -267,61 +257,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
    assert "-O.mode" in caplog_vllm.text


-@create_new_process_for_each_test()
-def test_memory_profiling():
-    # Fake out some model loading + inference memory usage to test profiling
-    # Memory used by other processes will show up as cuda usage outside of torch
-    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
-
-    lib = CudaRTLibrary()
-    # 512 MiB allocation outside of this instance
-    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
-
-    baseline_snapshot = MemorySnapshot()
-
-    # load weights
-
-    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
-
-    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB
-
-    def measure_current_non_torch():
-        free, total = torch.cuda.mem_get_info()
-        current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
-        current_non_torch = current_used - current_torch
-        return current_non_torch
-
-    with (
-        memory_profiling(
-            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
-        ) as result,
-        monitor(measure_current_non_torch) as monitored_values,
-    ):
-        # make a memory spike, 1 GiB
-        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
-        del spike
-
-        # Add some extra non-torch memory 256 MiB (simulate NCCL)
-        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
-
-    # this is an analytic value, it is exact,
-    # we only have 256 MiB non-torch memory increase
-    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
-    assert measured_diff == 256 * 1024 * 1024
-
-    # Check that the memory usage is within 5% of the expected values
-    # 5% tolerance is caused by cuda runtime.
-    # we cannot control cuda runtime in the granularity of bytes,
-    # which causes a small error (<10 MiB in practice)
-    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)  # noqa
-    assert abs(non_torch_ratio - 1) <= 0.05
-    assert result.torch_peak_increase == 1024 * 1024 * 1024
-    del weights
-    lib.cudaFree(handle1)
-    lib.cudaFree(handle2)
-
-
def test_bind_kv_cache():
    from vllm.attention import Attention

@@ -403,56 +338,6 @@ def test_bind_kv_cache_pp():
    assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]


-@pytest.mark.parametrize(
-    ("src_dtype", "tgt_dtype", "expected_result"),
-    [
-        # Different precision_levels
-        (torch.bool, torch.int8, True),
-        (torch.bool, torch.float16, True),
-        (torch.bool, torch.complex32, True),
-        (torch.int64, torch.bool, False),
-        (torch.int64, torch.float16, True),
-        (torch.int64, torch.complex32, True),
-        (torch.float64, torch.bool, False),
-        (torch.float64, torch.int8, False),
-        (torch.float64, torch.complex32, True),
-        (torch.complex128, torch.bool, False),
-        (torch.complex128, torch.int8, False),
-        (torch.complex128, torch.float16, False),
-        # precision_level=0
-        (torch.bool, torch.bool, True),
-        # precision_level=1
-        (torch.int8, torch.int16, True),
-        (torch.int16, torch.int8, False),
-        (torch.uint8, torch.int8, False),
-        (torch.int8, torch.uint8, False),
-        # precision_level=2
-        (torch.float16, torch.float32, True),
-        (torch.float32, torch.float16, False),
-        (torch.bfloat16, torch.float32, True),
-        (torch.float32, torch.bfloat16, False),
-        # precision_level=3
-        (torch.complex32, torch.complex64, True),
-        (torch.complex64, torch.complex32, False),
-    ],
-)
-def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
-    assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
-
-
-@pytest.mark.parametrize(
-    ("dtypes", "expected_result"),
-    [
-        ([torch.bool], torch.bool),
-        ([torch.bool, torch.int8], torch.int8),
-        ([torch.bool, torch.int8, torch.float16], torch.float16),
-        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
-    ],
-)
-def test_common_broadcastable_dtype(dtypes, expected_result):
-    assert common_broadcastable_dtype(dtypes) == expected_result
-
-
def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
@@ -535,23 +420,6 @@ def test_model_specification(
    assert args.port == 12312


-@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
-def test_sha256(input: tuple):
-    digest = sha256(input)
-    assert digest is not None
-    assert isinstance(digest, bytes)
-    assert digest != b""
-
-    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert digest == hashlib.sha256(input_bytes).digest()
-
-    # hashing again, returns the same value
-    assert digest == sha256(input)
-
-    # hashing different input, returns different value
-    assert digest != sha256(input + (1,))
-
-
def test_convert_ids_list_to_tokens():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    token_ids = tokenizer.encode("Hello, world!")
@@ -561,50 +429,6 @@ def test_convert_ids_list_to_tokens():
    assert tokens == ["Hello", ",", " world", "!"]


-def test_current_stream_multithread():
-    import threading
-
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-
-    main_default_stream = torch.cuda.current_stream()
-    child_stream = torch.cuda.Stream()
-
-    thread_stream_ready = threading.Event()
-    thread_can_exit = threading.Event()
-
-    def child_thread_func():
-        with torch.cuda.stream(child_stream):
-            thread_stream_ready.set()
-            thread_can_exit.wait(timeout=10)
-
-    child_thread = threading.Thread(target=child_thread_func)
-    child_thread.start()
-
-    try:
-        assert thread_stream_ready.wait(timeout=5), (
-            "Child thread failed to enter stream context in time"
-        )
-
-        main_current_stream = current_stream()
-
-        assert main_current_stream != child_stream, (
-            "Main thread's current_stream was contaminated by child thread"
-        )
-        assert main_current_stream == main_default_stream, (
-            "Main thread's current_stream is not the default stream"
-        )
-
-        # Notify child thread it can exit
-        thread_can_exit.set()
-
-    finally:
-        # Ensure child thread exits properly
-        child_thread.join(timeout=5)
-        if child_thread.is_alive():
-            pytest.fail("Child thread failed to exit properly")
-
-
def test_load_config_file(tmp_path):
    # Define the configuration data
    config_data = {
@@ -23,13 +23,14 @@ ALLOWED_FILES = {
    "vllm/transformers_utils/config.py",
    "vllm/model_executor/models/registry.py",
    "vllm/compilation/caching.py",
-    "tests/utils_/test_utils.py",
-    "tests/tokenization/test_cached_tokenizer.py",
    "vllm/distributed/utils.py",
    "vllm/distributed/parallel_state.py",
    "vllm/distributed/device_communicators/all_reduce_utils.py",
    "vllm/distributed/device_communicators/shm_broadcast.py",
    "vllm/distributed/device_communicators/shm_object_storage.py",
+    "vllm/utils/hashing.py",
+    "tests/utils_/test_hashing.py",
+    "tests/tokenization/test_cached_tokenizer.py",
    "benchmarks/kernels/graph_machete_bench.py",
    "benchmarks/kernels/benchmark_lora.py",
    "benchmarks/kernels/benchmark_machete.py",
@@ -40,10 +41,8 @@ ALLOWED_FILES = {
    "vllm/executor/mp_distributed_executor.py",
    "vllm/executor/ray_distributed_executor.py",
    "vllm/entrypoints/llm.py",
    "tests/utils.py",
    # pickle and cloudpickle
    "vllm/utils/__init__.py",
    "vllm/utils/hashing.py",
    "tests/utils.py",
}

PICKLE_RE = re.compile(
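The allowlist above feeds vLLM's pickle-usage pre-commit check; the idea is simply that any file outside ALLOWED_FILES must not import pickle or cloudpickle. A minimal sketch of that kind of check (hypothetical helper names and regex, not the actual tool, whose PICKLE_RE is truncated above):

import re
import sys
from pathlib import Path

# Hypothetical stand-ins for the real ALLOWED_FILES / PICKLE_RE shown above.
ALLOWED = {"vllm/utils/hashing.py", "tests/utils.py"}
IMPORT_RE = re.compile(r"^\s*(import|from)\s+(pickle|cloudpickle)\b", re.MULTILINE)


def check(paths: list[str]) -> int:
    bad = [
        p for p in paths
        if p not in ALLOWED and IMPORT_RE.search(Path(p).read_text())
    ]
    for p in bad:
        print(f"{p}: pickle/cloudpickle import outside the allowlist")
    return 1 if bad else 0


if __name__ == "__main__":
    sys.exit(check(sys.argv[1:]))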
@@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


def run_vllm(
@@ -51,7 +51,7 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

logger = init_logger(__name__)

@@ -76,7 +76,7 @@ from vllm.transformers_utils.tokenizer import (
)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device
-from vllm.utils.collections import as_iter, is_list_of
+from vllm.utils.collection_utils import as_iter, is_list_of
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor
@@ -70,7 +70,7 @@ from vllm.transformers_utils.tokenizers import (
    truncate_tool_call_ids,
    validate_request_params,
)
-from vllm.utils.collections import as_list
+from vllm.utils.collection_utils import as_list

logger = init_logger(__name__)

@@ -34,8 +34,8 @@ from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import as_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import as_list

logger = init_logger(__name__)

@@ -39,8 +39,8 @@ from vllm.outputs import (
    RequestOutput,
)
from vllm.pooling_params import PoolingParams
-from vllm.utils.asyncio import merge_async_iterators
-from vllm.utils.collections import chunk_list
+from vllm.utils.async_utils import merge_async_iterators
+from vllm.utils.collection_utils import chunk_list

logger = init_logger(__name__)

@@ -91,13 +91,13 @@ from vllm.tracing import (
)
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import random_uuid
-from vllm.utils.asyncio import (
+from vllm.utils.async_utils import (
    AsyncMicrobatchTokenizer,
    collect_from_async_generator,
    make_async,
    merge_async_iterators,
)
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.v1.engine import EngineCoreRequest

logger = init_logger(__name__)
@@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
from vllm.logger import init_logger
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

logger = init_logger(__name__)

@@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils.asyncio import make_async, merge_async_iterators
+from vllm.utils.async_utils import make_async, merge_async_iterators

logger = init_logger(__name__)

@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path

logger = init_logger(__name__)
@@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils.asyncio import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer


@dataclass(frozen=True)
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.v1.outputs import SamplerOutput
from vllm.v1.worker.worker_base import WorkerBase

@@ -19,7 +19,7 @@ from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.ray.ray_env import get_env_vars_to_copy
from vllm.sequence import ExecuteModelRequest
-from vllm.utils.asyncio import make_async
+from vllm.utils.async_utils import make_async
from vllm.utils.network_utils import (
    get_distributed_init_method,
    get_ip,
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cas

from typing_extensions import TypeIs

-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .data import (
    EmbedsPrompt,
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
-from vllm.utils.collections import LazyDict
+from vllm.utils.collection_utils import LazyDict

logger = init_logger(__name__)

@@ -31,7 +31,7 @@ from vllm.utils.deep_gemm import (
    get_mk_alignment_for_contiguous_layout,
    m_grouped_fp8_gemm_nt_contiguous,
)
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once

logger = init_logger(__name__)

@@ -28,7 +28,7 @@ from vllm.model_executor.parameter import (
    RowvLLMParameter,
)
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

if TYPE_CHECKING:
    from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -57,7 +57,7 @@ from vllm.model_executor.parameter import (
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.transformers_utils.config import get_safetensors_params_metadata
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

logger = init_logger(__name__)

@@ -48,7 +48,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (
)
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

from .interfaces_base import VllmModel, is_pooling_model

@@ -15,7 +15,7 @@ import torch.nn as nn
from typing_extensions import TypeIs, TypeVar

from vllm.logger import init_logger
-from vllm.utils.functools import supports_kw
+from vllm.utils.func_utils import supports_kw

if TYPE_CHECKING:
    from vllm.config import VllmConfig
@@ -33,7 +33,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -85,7 +85,7 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import flatten_2d_lists
+from vllm.utils.collection_utils import flatten_2d_lists
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_dtype

@@ -79,7 +79,7 @@ from vllm.multimodal.processing import (
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .interfaces import (
    MultiModalEmbeddings,
@@ -22,7 +22,7 @@ from typing import (
import numpy as np
from typing_extensions import NotRequired, TypeVar, deprecated

-from vllm.utils.collections import full_groupby, is_list_of
+from vllm.utils.collection_utils import full_groupby, is_list_of
from vllm.utils.import_utils import LazyLoader
from vllm.utils.jsontree import json_map_leaves

@@ -19,7 +19,7 @@ import numpy as np
import torch
from typing_extensions import assert_never

-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader

from .audio import AudioResampler
@@ -25,8 +25,8 @@ from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
-from vllm.utils.collections import flatten_2d_lists, full_groupby
-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
from vllm.utils.jsontree import JSONTree, json_map_leaves

from .hasher import MultiModalHasher
@@ -486,7 +486,7 @@ _M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
    """
    Convenience function to apply
-    [`full_groupby`][vllm.utils.collections.full_groupby]
+    [`full_groupby`][vllm.utils.collection_utils.full_groupby]
    based on modality.
    """
    return full_groupby(values, key=lambda x: x.modality)
@@ -9,7 +9,7 @@ import torch.nn as nn
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, cached_tokenizer_from_config
-from vllm.utils.collections import ClassRegistry
+from vllm.utils.collection_utils import ClassRegistry

from .cache import BaseMultiModalProcessorCache
from .processing import (
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any

from vllm.entrypoints.tool_server import ToolServer
from vllm.logger import init_logger
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path

if TYPE_CHECKING:
@@ -5,7 +5,7 @@ import os
from collections.abc import Mapping

from vllm.logger import init_logger
-from vllm.utils.functools import run_once
+from vllm.utils.func_utils import run_once

TRACE_HEADERS = ["traceparent", "tracestate"]

@@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
from transformers.video_processing_utils import BaseVideoProcessor
from typing_extensions import TypeVar

-from vllm.utils.functools import get_allowed_kwarg_only_overrides
+from vllm.utils.func_utils import get_allowed_kwarg_only_overrides

if TYPE_CHECKING:
    from vllm.config import ModelConfig
@@ -1122,9 +1122,6 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
    return cls


-## moved to vllm.utils.profiling (imported at module top)
-
-
# Only relevant for models using ALiBi (e.g, MPT)
def check_use_alibi(model_config: ModelConfig) -> bool:
    cfg = model_config.hf_text_config
@@ -1150,9 +1147,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
    )


-## moved to vllm.utils.hashing
-
-
@cache
def _has_module(module_name: str) -> bool:
    """Return True if *module_name* can be found in the current environment.
@@ -30,9 +30,9 @@ from vllm.transformers_utils.config import maybe_register_config_serialize_by_va
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, cdiv
-from vllm.utils.asyncio import cancel_task_threadsafe
-from vllm.utils.collections import as_list
-from vllm.utils.functools import deprecate_kwargs
+from vllm.utils.async_utils import cancel_task_threadsafe
+from vllm.utils.collection_utils import as_list
+from vllm.utils.func_utils import deprecate_kwargs
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
@@ -23,7 +23,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.tasks import SupportedTask
-from vllm.utils.asyncio import in_loop
+from vllm.utils.async_utils import in_loop
from vllm.utils.network_utils import (
    close_sockets,
    get_open_port,
@@ -13,7 +13,7 @@ from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import (
@@ -10,7 +10,7 @@ import torch
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingType
from vllm.utils import length_from_prompt_token_ids_or_embeds
-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.worker.block_table import MultiGroupBlockTable
from vllm.v1.worker.gpu_input_batch import CachedRequestState