diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 4c5971da1c..09f4ec03fb 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -6,7 +6,7 @@ import torch from vllm import LLM, SamplingParams from vllm.device_allocator.cumem import CuMemAllocator -from vllm.utils import GiB_bytes +from vllm.utils.mem_constants import GiB_bytes from ..utils import create_new_process_for_each_test diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 15cdb950a7..9662e73321 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -11,7 +11,7 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.attention.layer import Attention, MultiHeadAttention from vllm.platforms import current_platform -from vllm.utils import get_max_shared_memory_bytes +from vllm.utils.mem_utils import get_max_shared_memory_bytes if not current_platform.is_rocm(): from xformers import ops as xops diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 6d1f67c39f..6074cdef1b 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,7 @@ from unittest.mock import patch import pytest from vllm import LLM -from vllm.utils import GiB_bytes +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.core.kv_cache_utils import ( generate_scheduler_kv_cache_config, get_kv_cache_configs, diff --git a/tests/utils.py b/tests/utils.py index 9aed55b725..d17dbbeefc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -46,10 +46,10 @@ from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import ( FlexibleArgumentParser, - GB_bytes, cuda_device_count_stateless, get_open_port, ) +from vllm.utils.mem_constants import GB_bytes if current_platform.is_rocm(): from amdsmi import ( diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index f2a323ad0c..32a2072396 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -23,7 +23,6 @@ from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens from vllm.utils import ( FlexibleArgumentParser, - MemorySnapshot, bind_kv_cache, common_broadcastable_dtype, current_stream, @@ -33,13 +32,13 @@ from vllm.utils import ( join_host_port, make_zmq_path, make_zmq_socket, - memory_profiling, sha256, split_host_port, split_zmq_path, unique_filepath, ) +from vllm.utils.mem_utils import MemorySnapshot, memory_profiling from ..utils import create_new_process_for_each_test, flat_product diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 6b0a5e4b0e..678ec25464 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -14,7 +14,8 @@ from vllm.multimodal.inputs import ( PlaceholderRange, ) from vllm.sampling_params import SamplingParams -from vllm.utils import GiB_bytes, sha256, sha256_cbor +from vllm.utils import sha256, sha256_cbor +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index e471174ef6..1aa0709696 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ 
-13,7 +13,7 @@ from vllm.config import ( ) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.utils import GiB_bytes +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput from vllm.v1.worker.tpu_model_runner import ( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index fe52f565c8..e985578f05 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -21,7 +21,8 @@ from vllm.distributed.parallel_state import ( from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams -from vllm.utils import GiB_bytes, update_environment_variables +from vllm.utils import update_environment_variables +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput from vllm.v1.kv_cache_interface import ( diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py index b9b2e076fd..66330127b5 100644 --- a/tests/v1/worker/test_worker_memory_snapshot.py +++ b/tests/v1/worker/test_worker_memory_snapshot.py @@ -11,7 +11,7 @@ import pytest import torch from vllm.engine.arg_utils import EngineArgs -from vllm.utils import MemorySnapshot +from vllm.utils.mem_utils import MemorySnapshot from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment # Global queue to track operation order across processes diff --git a/vllm/config/cache.py b/vllm/config/cache.py index c01643fbff..cf2977622a 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -10,7 +10,8 @@ from pydantic.dataclasses import dataclass from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils import GiB_bytes, get_cpu_memory +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import get_cpu_memory if TYPE_CHECKING: from vllm.config.parallel import ParallelConfig diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 05958ca523..21c2a9e44b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -81,7 +81,8 @@ from vllm.transformers_utils.config import ( maybe_override_with_speculators, ) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor +from vllm.utils import FlexibleArgumentParser, get_ip, is_in_ray_actor +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index a29da2a56a..c1531cbfdc 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -17,9 +17,9 @@ from vllm.distributed.device_communicators.shm_object_storage import ( SingleWriterShmRingBuffer, ) from vllm.logger import init_logger -from vllm.utils import GiB_bytes, MiB_bytes from vllm.utils.cache import CacheInfo, LRUCache from vllm.utils.jsontree import json_count_leaves, json_map_leaves, json_reduce_leaves +from vllm.utils.mem_constants import GiB_bytes, MiB_bytes from .inputs import ( MultiModalBatchedField, diff --git 
a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 1a34e9150c..69f2b1079a 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -151,7 +151,7 @@ class CpuPlatform(Platform): @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: import vllm.envs as envs - from vllm.utils import GiB_bytes + from vllm.utils.mem_constants import GiB_bytes kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE if kv_cache_space is None: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a35dda2d77..f5569154be 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -4,7 +4,6 @@ import contextlib import datetime import enum -import gc import getpass import hashlib import importlib @@ -21,7 +20,6 @@ import sys import tempfile import textwrap import threading -import time import traceback import uuid import warnings @@ -38,12 +36,10 @@ from collections import defaultdict from collections.abc import ( Callable, Collection, - Generator, Iterator, Sequence, ) from concurrent.futures.process import ProcessPoolExecutor -from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from pathlib import Path from typing import TYPE_CHECKING, Any, TextIO, TypeVar @@ -58,7 +54,6 @@ import psutil import regex as re import setproctitle import torch -import torch.types import yaml import zmq import zmq.asyncio @@ -105,17 +100,6 @@ STR_XFORMERS_ATTN_VAL: str = "XFORMERS" STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" -MB_bytes = 1_000_000 -"""The number of bytes in one megabyte (MB).""" - -MiB_bytes = 1 << 20 -"""The number of bytes in one mebibyte (MiB).""" - -GB_bytes = 1_000_000_000 -"""The number of bytes in one gigabyte (GB).""" - -GiB_bytes = 1 << 30 -"""The number of bytes in one gibibyte (GiB).""" # ANSI color codes CYAN = "\033[1;36m" @@ -180,23 +164,6 @@ class Counter: self.counter = 0 -@cache -def get_max_shared_memory_bytes(gpu: int = 0) -> int: - """Returns the maximum shared memory per thread block in bytes.""" - from vllm import _custom_ops as ops - - max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py - # will fail - assert max_shared_mem > 0, "max_shared_mem can not be zero" - return int(max_shared_mem) - - -def get_cpu_memory() -> int: - """Returns the total CPU memory of the node in bytes.""" - return psutil.virtual_memory().total - - def random_uuid() -> str: return str(uuid.uuid4().hex) @@ -581,30 +548,6 @@ def is_uva_available() -> bool: return is_pin_memory_available() -class DeviceMemoryProfiler: - def __init__(self, device: torch.types.Device | None = None): - self.device = device - - def current_memory_usage(self) -> float: - # Return the memory usage in bytes. 
- from vllm.platforms import current_platform - - gc.collect() - return current_platform.get_current_memory_usage(self.device) - - def __enter__(self): - self.initial_memory = self.current_memory_usage() - # This allows us to call methods of the context manager if needed - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.final_memory = self.current_memory_usage() - self.consumed_memory = self.final_memory - self.initial_memory - - # Force garbage collection - gc.collect() - - def make_ndarray_with_pad( x: list[list[T]], pad: T, @@ -1642,183 +1585,6 @@ def kill_process_tree(pid: int): os.kill(pid, signal.SIGKILL) -@dataclass -class MemorySnapshot: - """Memory snapshot.""" - - torch_peak: int = 0 - free_memory: int = 0 - total_memory: int = 0 - cuda_memory: int = 0 - torch_memory: int = 0 - non_torch_memory: int = 0 - timestamp: float = 0.0 - auto_measure: bool = True - - def __post_init__(self): - if self.auto_measure: - self.measure() - - def measure(self): - from vllm.platforms import current_platform - - # we measure the torch peak memory usage via allocated_bytes, - # rather than `torch.cuda.memory_reserved()` . - # After `torch.cuda.reset_peak_memory_stats()`, - # `torch.cuda.memory_reserved()` will keep growing, and only shrink - # when we call `torch.cuda.empty_cache()` or OOM happens. - self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0) - - self.free_memory, self.total_memory = torch.cuda.mem_get_info() - shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark - if ( - current_platform.is_cuda() - and current_platform.get_device_capability() in shared_sysmem_device_mem_sms - ): - # On UMA (Orin, Thor and Spark) platform, - # where both CPU and GPU rely on system memory, - # the cudaMemGetInfo function shows the amount of free system memory - # rather than what’s actually available. - # In the case, - # torch.cuda.mem_get_info() only reports "free" memory, - # which can be lower than what is actually - # available due to not including cache memory. - # There’s also a comprehensive reference page - # that explains how you can compute the proper value yourself. - # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device - self.free_memory = psutil.virtual_memory().available - - self.cuda_memory = self.total_memory - self.free_memory - - # torch.cuda.memory_reserved() is how many bytes - # PyTorch gets from cuda (by calling cudaMalloc, etc.) - # this is used to measure the non-torch memory usage - self.torch_memory = torch.cuda.memory_reserved() - - self.non_torch_memory = self.cuda_memory - self.torch_memory - self.timestamp = time.time() - - def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": - return MemorySnapshot( - torch_peak=self.torch_peak - other.torch_peak, - free_memory=self.free_memory - other.free_memory, - total_memory=self.total_memory - other.total_memory, - cuda_memory=self.cuda_memory - other.cuda_memory, - torch_memory=self.torch_memory - other.torch_memory, - non_torch_memory=self.non_torch_memory - other.non_torch_memory, - timestamp=self.timestamp - other.timestamp, - auto_measure=False, - ) - - -@dataclass -class MemoryProfilingResult: - """Memory profiling result. 
All numbers are in bytes.""" - - non_kv_cache_memory: int = 0 - torch_peak_increase: int = 0 - non_torch_increase: int = 0 - weights_memory: float = 0 - before_create: MemorySnapshot = field(default_factory=MemorySnapshot) - before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) - after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) - profile_time: float = 0.0 - - def __repr__(self) -> str: - return ( - f"Memory profiling takes {self.profile_time:.2f} seconds. " - f"Total non KV cache memory: " - f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " - f"torch peak memory increase: " - f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " - f"non-torch forward increase memory: " - f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " - f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB." - ) - - -@contextlib.contextmanager -def memory_profiling( - baseline_snapshot: MemorySnapshot, weights_memory: int -) -> Generator[MemoryProfilingResult, None, None]: - """Memory profiling context manager. - baseline_snapshot: the memory snapshot before the current vLLM instance. - weights_memory: memory used by PyTorch when loading the model weights. - Note that, before loading the model weights, we also initialize the device - and distributed environment, which may consume some memory. This part is not - included in the weights_memory because PyTorch does not control it. - - The memory in one GPU can be classified into 3 categories: - 1. memory used by anything other than the current vLLM instance. - 2. memory used by torch in the current vLLM instance. - 3. memory used in the current vLLM instance, but not by torch. - - A quantitive example: - - Before creating the current vLLM instance: - category 1: 1 GiB - category 2: 0 GiB - category 3: 0 GiB - - After creating the current vLLM instance and loading the model, - (i.e. before profiling): - category 1: 1 GiB - category 2: 2 GiB (model weights take 2 GiB) - category 3: 0.5 GiB (memory used by NCCL) - - During profiling (peak): - category 1: 1 GiB - category 2: 4 GiB (peak activation tensors take 2 GiB) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - After profiling: - category 1: 1 GiB - category 2: 3 GiB (after garbage-collecting activation tensors) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - In this case, non-kv cache takes 5 GiB in total, including: - a. 2 GiB used by the model weights (category 2) - b. 2 GiB reserved for the peak activation tensors (category 2) - c. 1 GiB used by non-torch components (category 3) - - The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). - - The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). 
- """ # noqa - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - result = MemoryProfilingResult() - - result.before_create = baseline_snapshot - # the part of memory used for holding the model weights - result.weights_memory = weights_memory - - result.before_profile.measure() - - yield result - - gc.collect() - torch.cuda.empty_cache() - - result.after_profile.measure() - - diff_profile = result.after_profile - result.before_profile - diff_from_create = result.after_profile - result.before_create - result.torch_peak_increase = diff_profile.torch_peak - result.non_torch_increase = diff_from_create.non_torch_memory - result.profile_time = diff_profile.timestamp - - non_torch_memory = result.non_torch_increase - peak_activation_memory = result.torch_peak_increase - result.non_kv_cache_memory = ( - non_torch_memory + peak_activation_memory + result.weights_memory - ) # noqa - - # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): if sys.platform.startswith("win"): diff --git a/vllm/utils/mem_constants.py b/vllm/utils/mem_constants.py new file mode 100644 index 0000000000..62b725fbb0 --- /dev/null +++ b/vllm/utils/mem_constants.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +MB_bytes = 1_000_000 +"""The number of bytes in one megabyte (MB).""" + +MiB_bytes = 1 << 20 +"""The number of bytes in one mebibyte (MiB).""" + +GB_bytes = 1_000_000_000 +"""The number of bytes in one gigabyte (GB).""" + +GiB_bytes = 1 << 30 +"""The number of bytes in one gibibyte (GiB).""" diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py new file mode 100644 index 0000000000..c6a6757bed --- /dev/null +++ b/vllm/utils/mem_utils.py @@ -0,0 +1,232 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib +import gc +import time +from collections.abc import Generator +from dataclasses import dataclass, field +from functools import cache + +import psutil +import torch +import torch.types + +from .mem_constants import GiB_bytes + + +@cache +def get_max_shared_memory_bytes(gpu: int = 0) -> int: + """Returns the maximum shared memory per thread block in bytes.""" + from vllm import _custom_ops as ops + + max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail + assert max_shared_mem > 0, "max_shared_mem can not be zero" + return int(max_shared_mem) + + +def get_cpu_memory() -> int: + """Returns the total CPU memory of the node in bytes.""" + return psutil.virtual_memory().total + + +class DeviceMemoryProfiler: + def __init__(self, device: torch.types.Device | None = None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. 
+ from vllm.platforms import current_platform + + gc.collect() + return current_platform.get_current_memory_usage(self.device) + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() + + +@dataclass +class MemorySnapshot: + """Memory snapshot.""" + + torch_peak: int = 0 + free_memory: int = 0 + total_memory: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 + timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() + + def measure(self): + from vllm.platforms import current_platform + + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. + self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0) + + self.free_memory, self.total_memory = torch.cuda.mem_get_info() + shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark + if ( + current_platform.is_cuda() + and current_platform.get_device_capability() in shared_sysmem_device_mem_sms + ): + # On UMA (Orin, Thor and Spark) platform, + # where both CPU and GPU rely on system memory, + # the cudaMemGetInfo function shows the amount of free system memory + # rather than what’s actually available. + # In the case, + # torch.cuda.mem_get_info() only reports "free" memory, + # which can be lower than what is actually + # available due to not including cache memory. + # There’s also a comprehensive reference page + # that explains how you can compute the proper value yourself. + # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device + self.free_memory = psutil.virtual_memory().available + + self.cuda_memory = self.total_memory - self.free_memory + + # torch.cuda.memory_reserved() is how many bytes + # PyTorch gets from cuda (by calling cudaMalloc, etc.) + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory + self.timestamp = time.time() + + def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": + return MemorySnapshot( + torch_peak=self.torch_peak - other.torch_peak, + free_memory=self.free_memory - other.free_memory, + total_memory=self.total_memory - other.total_memory, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) + + +@dataclass +class MemoryProfilingResult: + """Memory profiling result. 
All numbers are in bytes.""" + + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) + before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + profile_time: float = 0.0 + + def __repr__(self) -> str: + return ( + f"Memory profiling takes {self.profile_time:.2f} seconds. " + f"Total non KV cache memory: " + f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " + f"torch peak memory increase: " + f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " + f"non-torch forward increase memory: " + f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " + f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB." + ) + + +@contextlib.contextmanager +def memory_profiling( + baseline_snapshot: MemorySnapshot, weights_memory: int +) -> Generator[MemoryProfilingResult, None, None]: + """Memory profiling context manager. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. + Note that, before loading the model weights, we also initialize the device + and distributed environment, which may consume some memory. This part is not + included in the weights_memory because PyTorch does not control it. + + The memory in one GPU can be classified into 3 categories: + 1. memory used by anything other than the current vLLM instance. + 2. memory used by torch in the current vLLM instance. + 3. memory used in the current vLLM instance, but not by torch. + + A quantitive example: + + Before creating the current vLLM instance: + category 1: 1 GiB + category 2: 0 GiB + category 3: 0 GiB + + After creating the current vLLM instance and loading the model, + (i.e. before profiling): + category 1: 1 GiB + category 2: 2 GiB (model weights take 2 GiB) + category 3: 0.5 GiB (memory used by NCCL) + + During profiling (peak): + category 1: 1 GiB + category 2: 4 GiB (peak activation tensors take 2 GiB) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + After profiling: + category 1: 1 GiB + category 2: 3 GiB (after garbage-collecting activation tensors) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + In this case, non-kv cache takes 5 GiB in total, including: + a. 2 GiB used by the model weights (category 2) + b. 2 GiB reserved for the peak activation tensors (category 2) + c. 1 GiB used by non-torch components (category 3) + + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. + + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). + + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). 
+ """ # noqa + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + result = MemoryProfilingResult() + + result.before_create = baseline_snapshot + # the part of memory used for holding the model weights + result.weights_memory = weights_memory + + result.before_profile.measure() + + yield result + + gc.collect() + torch.cuda.empty_cache() + + result.after_profile.measure() + + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + + non_torch_memory = result.non_torch_increase + peak_activation_memory = result.torch_peak_increase + result.non_kv_cache_memory = ( + non_torch_memory + peak_activation_memory + result.weights_memory + ) # noqa diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 625ff28f2b..01b88d1de8 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -12,7 +12,8 @@ from typing import Any, NewType, TypeAlias from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import GiB_bytes, cdiv, sha256_cbor +from vllm.utils import cdiv, sha256_cbor +from vllm.utils.mem_constants import GiB_bytes from vllm.v1.kv_cache_interface import ( ChunkedLocalAttentionSpec, FullAttentionSpec, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7e72ce937b..a9874b164f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -74,8 +74,6 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import ( STR_DTYPE_TO_TORCH_DTYPE, - DeviceMemoryProfiler, - GiB_bytes, cdiv, check_use_alibi, get_dtype_size, @@ -85,6 +83,8 @@ from vllm.utils import ( supports_dynamo, ) from vllm.utils.jsontree import json_map_leaves +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import DeviceMemoryProfiler from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 00dc7682c9..32d8da5ec1 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -28,7 +28,8 @@ from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask -from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import MemorySnapshot, memory_profiling from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import (