From 4de7146351d67be0010b7007ba4da48462962153 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 17 Jul 2025 16:37:36 -0700 Subject: [PATCH] [V0 deprecation] Remove V0 HPU backend (#21131) Signed-off-by: Woosuk Kwon --- docker/Dockerfile.hpu | 21 - requirements/hpu.txt | 12 - setup.py | 36 +- vllm/_custom_ops.py | 3 +- vllm/attention/backends/hpu_attn.py | 319 --- vllm/attention/ops/hpu_paged_attn.py | 88 - vllm/config.py | 2 +- vllm/core/block/cpu_gpu_block_allocator.py | 4 +- .../device_communicators/hpu_communicator.py | 46 - vllm/engine/arg_utils.py | 5 +- vllm/envs.py | 15 - vllm/lora/layers.py | 4 - vllm/lora/punica_wrapper/punica_hpu.py | 145 -- vllm/model_executor/custom_op.py | 7 - vllm/model_executor/layers/fused_moe/layer.py | 36 - vllm/model_executor/layers/layernorm.py | 20 - .../model_executor/layers/rotary_embedding.py | 58 - .../layers/vocab_parallel_embedding.py | 16 +- .../model_loader/bitsandbytes_loader.py | 11 +- .../model_loader/default_loader.py | 10 - vllm/platforms/__init__.py | 18 - vllm/platforms/hpu.py | 114 - vllm/platforms/interface.py | 5 - vllm/plugins/__init__.py | 13 - vllm/worker/hpu_model_runner.py | 2320 ----------------- vllm/worker/hpu_worker.py | 485 ---- vllm/worker/multi_step_hpu_worker.py | 123 - 27 files changed, 10 insertions(+), 3926 deletions(-) delete mode 100644 docker/Dockerfile.hpu delete mode 100644 requirements/hpu.txt delete mode 100644 vllm/attention/backends/hpu_attn.py delete mode 100644 vllm/attention/ops/hpu_paged_attn.py delete mode 100644 vllm/distributed/device_communicators/hpu_communicator.py delete mode 100644 vllm/lora/punica_wrapper/punica_hpu.py delete mode 100644 vllm/platforms/hpu.py delete mode 100644 vllm/worker/hpu_model_runner.py delete mode 100644 vllm/worker/hpu_worker.py delete mode 100644 vllm/worker/multi_step_hpu_worker.py diff --git a/docker/Dockerfile.hpu b/docker/Dockerfile.hpu deleted file mode 100644 index 224f142b5f..0000000000 --- a/docker/Dockerfile.hpu +++ /dev/null @@ -1,21 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - -COPY ./ /workspace/vllm - -WORKDIR /workspace/vllm - -RUN pip install -v -r requirements/hpu.txt - -ENV no_proxy=localhost,127.0.0.1 -ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true - -RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -WORKDIR /workspace/ - -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/requirements/hpu.txt b/requirements/hpu.txt deleted file mode 100644 index a88777268a..0000000000 --- a/requirements/hpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -# Common dependencies --r common.txt - -# Dependencies for HPU code -ray -triton==3.1.0 -pandas -numpy==1.26.4 -tabulate -setuptools>=77.0.3,<80.0.0 -setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624 diff --git a/setup.py b/setup.py index 795d549645..9a5ca3456a 100644 --- a/setup.py +++ b/setup.py @@ -410,29 +410,6 @@ class repackage_wheel(build_ext): package_data[package_name].append(file_name) -def _is_hpu() -> bool: - # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection - if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE: - return VLLM_TARGET_DEVICE == "hpu" - - # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds, - # and if 
it doesn't, check if habanalabs driver is loaded - is_hpu_available = False - try: - out = subprocess.run(["hl-smi"], capture_output=True, check=True) - is_hpu_available = out.returncode == 0 - except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if sys.platform.startswith("linux"): - try: - output = subprocess.check_output( - 'lsmod | grep habanalabs | wc -l', shell=True) - is_hpu_available = int(output) > 0 - except (ValueError, FileNotFoundError, PermissionError, - subprocess.CalledProcessError): - pass - return is_hpu_available - - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -440,7 +417,7 @@ def _no_device() -> bool: def _is_cuda() -> bool: has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda - and not (_is_neuron() or _is_tpu() or _is_hpu())) + and not (_is_neuron() or _is_tpu())) def _is_hip() -> bool: @@ -573,12 +550,6 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"{sep}neuron{neuron_version_str}" - elif _is_hpu(): - # Get the Intel Gaudi Software Suite version - gaudi_sw_version = str(get_gaudi_sw_version()) - if gaudi_sw_version != MAIN_CUDA_VERSION: - gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] - version += f"{sep}gaudi{gaudi_sw_version}" elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): @@ -625,8 +596,6 @@ def get_requirements() -> list[str]: requirements = _read_requirements("rocm.txt") elif _is_neuron(): requirements = _read_requirements("neuron.txt") - elif _is_hpu(): - requirements = _read_requirements("hpu.txt") elif _is_tpu(): requirements = _read_requirements("tpu.txt") elif _is_cpu(): @@ -635,8 +604,7 @@ def get_requirements() -> list[str]: requirements = _read_requirements("xpu.txt") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " - "or CPU.") + "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.") return requirements diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index f25db40a4e..81f4f6bdad 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -13,8 +13,7 @@ from vllm.scalar_type import ScalarType logger = init_logger(__name__) -if not current_platform.is_tpu() and not current_platform.is_hpu()\ - and not current_platform.is_xpu(): +if not current_platform.is_tpu() and not current_platform.is_xpu(): try: import vllm._C except ImportError as e: diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py deleted file mode 100644 index b8fdf763a0..0000000000 --- a/vllm/attention/backends/hpu_attn.py +++ /dev/null @@ -1,319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import vllm_hpu_extension.kernels as kernels -import vllm_hpu_extension.ops as ops -from vllm_hpu_extension.flags import enabled_flags -from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, - HPUPagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class HPUAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "HPU_ATTN" - - @staticmethod - def get_impl_cls() -> Type["HPUAttentionImpl"]: - return HPUAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return HPUAttentionMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dsts: torch.Tensor, - ) -> None: - HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dsts: torch.Tensor, - ) -> None: - HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts) - - -@dataclass -class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): - """Metadata for HPUAttentionbackend.""" - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. - is_prompt: bool - attn_bias: Optional[torch.Tensor] - seq_lens_tensor: Optional[torch.Tensor] - context_lens_tensor: Optional[torch.Tensor] - - -class HPUAttentionImpl(AttentionImpl, torch.nn.Module): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. 
- """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: int = 4096, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - super(AttentionImpl, self).__init__() - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "HPU_ATTN backend.") - if use_irope: - logger.warning_once( - "Using irope in HPU is not supported yet, it will fall back " - "to global attention for long context.") - self.kv_cache_dtype = kv_cache_dtype - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.matmul_qk = Matmul() - self.softmax = Softmax() - self.matmul_av = Matmul() - self.batch2block_matmul = Matmul() - self.block2batch_matmul = Matmul() - self.k_cache = VLLMKVCache() - self.v_cache = VLLMKVCache() - self.fused_scaled_dot_product_attention = kernels.fsdpa() - - self.prefill_impl = 'naive' - if "flex_attention" in enabled_flags(): - self.prefill_impl = 'flex' - if "fsdpa" in enabled_flags(): - assert alibi_slopes is None, \ - 'Prefill with FusedSDPA not supported with alibi slopes!' - self.prefill_impl = 'fsdpa' - - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.sliding_window = sliding_window - self.alibi_slopes = alibi_slopes - if alibi_slopes is not None: - alibi_slopes_tensor = torch.tensor(alibi_slopes, - dtype=torch.bfloat16) - self.alibi_slopes = alibi_slopes_tensor - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - if self.prefill_impl == 'fsdpa': - assert alibi_slopes is None, \ - 'Prefill with FusedSDPA not supported with alibi slopes!' - - supported_head_sizes = HPUPagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - self.attn_type = attn_type - if self.attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "HPUAttentionImpl") - - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "HPUAttention with FP8 KV cache not yet supported") - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: HPUAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - attn_metadata: Metadata for attention. 
- Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for HPUAttentionImpl") - - batch_size, seq_len, hidden_size = query.shape - _, seq_len_kv, _ = key.shape - - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - block_indices = attn_metadata.block_indices - block_offsets = attn_metadata.block_offsets - key_cache = None - value_cache = None - if attn_metadata.is_prompt and self.attn_type \ - is not AttentionType.ENCODER_ONLY: - key = key.unflatten(0, (block_indices.size(0), -1)) - value = value.unflatten(0, (block_indices.size(0), -1)) - if kv_cache is not None and isinstance(kv_cache, tuple): - key_cache, value_cache = HPUPagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - key_cache = self.k_cache(key, key_cache, block_indices, - block_offsets) - value_cache = self.v_cache(value, value_cache, block_indices, - block_offsets) - - if attn_metadata.is_prompt: - # Prompt run. - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, - self.head_size) - - attn_bias = attn_metadata.attn_bias - if attn_bias is not None and self.alibi_slopes is not None: - position_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, - attn_bias.dtype, - attn_bias.shape[-1]) - attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) - attn_bias.add_(position_bias) - - block_list = attn_metadata.block_list if attn_metadata \ - and attn_metadata.block_list is not None else None - - out = ops.prompt_attention( - impl=self.prefill_impl, - query=query.view(query_shape), - key=key.view(kv_shape), - value=value.view(kv_shape), - is_causal=True, - attn_bias=attn_bias, - valid_seq_lengths=attn_metadata.seq_lens_tensor, - **self.common_attention_args(block_list, key_cache, - value_cache)) - output = out.reshape(batch_size, seq_len, hidden_size) - else: - # Decoding run. - output = HPUPagedAttention.forward_decode( - query=query, - block_mapping=attn_metadata.block_mapping, - block_bias=attn_metadata.attn_bias, - block_groups=attn_metadata.block_groups, - **self.common_attention_args(attn_metadata.block_list, - key_cache, value_cache)) - # Reshape the output tensor. 
- return output.view(batch_size, seq_len, hidden_size) - - def common_attention_args(self, - block_list=None, - key_cache=None, - value_cache=None): - fsdpa_op = self.fused_scaled_dot_product_attention.apply \ - if self.fused_scaled_dot_product_attention is not None else None - return { - 'scale': self.scale, - 'matmul_qk_op': self.matmul_qk, - 'matmul_av_op': self.matmul_av, - 'batch2block_matmul_op': self.batch2block_matmul, - 'block2batch_matmul_op': self.block2batch_matmul, - 'fsdpa_op': fsdpa_op, - 'keys_fetch_func': self.k_cache.fetch_from_cache, - 'values_fetch_func': self.v_cache.fetch_from_cache, - 'softmax_op': self.softmax, - 'block_list': block_list, - 'key_cache': key_cache, - 'value_cache': value_cache, - } - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_len: int, -) -> torch.Tensor: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - return bias diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py deleted file mode 100644 index 412dd20ec1..0000000000 --- a/vllm/attention/ops/hpu_paged_attn.py +++ /dev/null @@ -1,88 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -############################################################################### - -from dataclasses import dataclass -from typing import List, Optional, Tuple - -import torch -from vllm_hpu_extension import cache_ops, ops - -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
-_PARTITION_SIZE = 512 - - -@dataclass -class HPUPagedAttentionMetadata: - """Metadata for PagedAttention.""" - block_list: Optional[torch.Tensor] - block_mapping: Optional[torch.Tensor] - block_usage: Optional[torch.Tensor] - block_indices: Optional[torch.Tensor] - block_offsets: Optional[torch.Tensor] - block_groups: Optional[torch.Tensor] - - -class HPUPagedAttention: - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, kv_cache_dtype: str, - is_prompt: bool) -> None: - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype, is_prompt) - - @staticmethod - def forward_decode(**kwargs) -> torch.Tensor: - return ops.flat_pa(**kwargs) - - @staticmethod - def swap_blocks( - src_kv_cache: Tuple[torch.Tensor, torch.Tensor], - dst_kv_cache: Tuple[torch.Tensor, torch.Tensor], - src_to_dsts: torch.Tensor, - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts) - - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts) - - @staticmethod - def copy_blocks( - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - src_to_dsts: torch.Tensor, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) diff --git a/vllm/config.py b/vllm/config.py index 22f7401713..526b5db235 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2452,7 +2452,7 @@ class SchedulerConfig: return self.num_scheduler_steps > 1 -Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"] +Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"] @config diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index ea490c3279..92bc5e157e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -7,7 +7,6 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.platforms import current_platform from vllm.utils import Device @@ -56,8 +55,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. 
""" - # For HPU, block id 0 is used only for padding - reserved_blocks = 1 if current_platform.is_hpu() else 0 + reserved_blocks = 0 block_ids = list( range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) num_gpu_blocks -= reserved_blocks diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py deleted file mode 100644 index f00f6b62bf..0000000000 --- a/vllm/distributed/device_communicators/hpu_communicator.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -import torch.distributed as dist - -from vllm.platforms import current_platform - -from .base_device_communicator import DeviceCommunicatorBase - -if current_platform.is_hpu(): - import habana_frameworks.torch as htorch # noqa: F401 - - -class HpuCommunicator(DeviceCommunicatorBase): - - def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: - # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge - # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used - # (which is required for tensor parallel HPUGraph inference) - htorch.core.mark_step() - dist.all_reduce(input_, group=self.device_group) - return input_ - - def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: - world_size = self.world_size - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. - htorch.core.mark_step() - dist.all_gather_into_tensor(output_tensor, - input_, - group=self.device_group) - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * - input_size[dim], ) + - input_size[dim + 1:]) - return output_tensor diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ae5eb46fa9..b20defde73 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1365,9 +1365,8 @@ class EngineArgs: supported = False if current_platform.is_rocm() or ( current_platform.is_cuda() - and current_platform.is_device_capability(100)) or ( - current_platform.device_name - == "hpu"): # handle hpu also for OOT platform + and current_platform.is_device_capability(100) + ): # handle hpu also for OOT platform supported = True elif fp8_attention and will_use_fa: from vllm.attention.utils.fa_utils import ( diff --git a/vllm/envs.py b/vllm/envs.py index 502978c768..ba0c55160b 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -106,8 +106,6 @@ if TYPE_CHECKING: VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: Optional[str] = None - VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True - VLLM_HPU_USE_DELAYED_SAMPLING: bool = False VLLM_DP_RANK: int = 0 VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 @@ -780,19 +778,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CUDART_SO_PATH": lambda: os.getenv("VLLM_CUDART_SO_PATH", None), - # Contiguous cache fetching to avoid using costly gather operation on - # Gaudi3. This is only applicable to HPU contiguous cache. If set to true, - # contiguous cache fetch will be used. 
- "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH": - lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in - ("1", "true"), - - # Use delayed sampling for HPU to reduce host cpu overhead - # between each step. - "VLLM_HPU_USE_DELAYED_SAMPLING": - lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in - ("1", "true"), - # Rank of the process in the data parallel setting "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")), diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 39b45027bd..779f026468 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1164,10 +1164,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): posinf=pos_inf, neginf=neg_inf)) - # HPU needs special handling to prune out dummy samples. - if current_platform.is_hpu(): - lora_logits = lora_logits[:logits.shape[0], :] - logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]] = lora_logits diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py deleted file mode 100644 index b20c9785a7..0000000000 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import TYPE_CHECKING, Optional, Union, final - -import torch -from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, - dispatch_bgmv_linear) - -from .punica_base import PunicaWrapperBase -from .utils import convert_mapping - -if TYPE_CHECKING: - # avoid circuit import - from vllm.lora.layers import LoRAMapping - from vllm.lora.models import LongContextLoRAContext - - -@final -class PunicaWrapperHPU(PunicaWrapperBase): - - def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: Union[torch.device, str], **kwargs): - # Increasing max_num_batched_tokens by 3x to handle increase in - # tensor size due to padding. - PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens, - max_batches, device) - - def _update_base_metadata( - self, - mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, - ): - ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_offsets_tensor, - indices_len, - ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size, - extra_vocab_size, self.device, None) - # Updating each element in `long_lora_offsets` with `lora_offset` slows - # down perf in HPU due to a series of `strided_insert` ops during lazy - # graph accumulation. Hence HPU appends `lora_offset` to a list and - # converts it to a tensor only after it is ready. - if long_lora_context: - index_mapping_indices: list[int] = list( - mapping.index_mapping).copy() - long_lora_offsets: list[int] = [] - for i in range(len(index_mapping_indices)): - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets.append(lora_offset) - long_lora_offsets_tensor = torch.tensor(long_lora_offsets, - device=self.device, - dtype=torch.long) - indices_len[-1] = long_lora_offsets_tensor.shape[-1] - - self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) - self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) - self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( - sampler_indices_padded) - self._embeddings_indices[:embeddings_indices. 
- shape[0], :embeddings_indices.shape[1]].copy_( - embeddings_indices) - if long_lora_offsets_tensor is not None: - self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( - long_lora_offsets_tensor) - else: - self._long_lora_indices.zero_() - self.indices_len[:] = indices_len - - def add_lora_embedding(self, - y: torch.Tensor, - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - add_inputs: bool = True, - **kwargs) -> None: - dispatch_bgmv_embedding(y, x, lora_b_stacked, 0) - - def add_lora_linear(self, - y: torch.Tensor, - x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - scale: float, - output_slices: tuple[int, ...], - *, - buffer: Optional[tuple[torch.Tensor, ...]] = None, - **kwargs) -> None: - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - - for slice_idx in range(len(output_slices)): - dispatch_bgmv_linear( - y[:, offset_left:offset_left + output_slices[slice_idx]], x, - lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale) - offset_left += output_slices[slice_idx] - y = y.view_as(y_org) - - def add_lora_logits(self, - y: torch.Tensor, - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - scale, - *, - buffer: Optional[torch.Tensor] = None, - **kwargs) -> None: - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale) - y = y.view_as(y_org) - - def add_shrink( - self, - y: Union[tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, - lora_a_stacked: tuple[torch.Tensor, ...], - scale: float, - **kwargs, - ) -> None: - raise NotImplementedError - - def add_expand( - self, - y: torch.Tensor, - x: Union[tuple[torch.Tensor, ...], torch.Tensor], - lora_b_stacked: tuple[torch.Tensor, ...], - lora_bias_stacked: Optional[tuple[torch.Tensor, ...]], - output_slices: tuple[int, ...], - offset_start: int = 0, - add_inputs=True, - **kwargs, - ) -> None: - raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 9c88721fb2..f6e79cd676 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -73,11 +73,6 @@ class CustomOp(nn.Module): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_hpu(self, *args, **kwargs): - # By default, we assume that Gaudi ops are compatible with the - # PyTorch-native implementation. - return self.forward_native(*args, **kwargs) - def forward_neuron(self, *args, **kwargs): # By default, we assume that Neuron ops are compatible with the # PyTorch-native implementation. 
@@ -106,8 +101,6 @@ class CustomOp(nn.Module): return self.forward_hip elif current_platform.is_cpu(): return self.forward_cpu - elif current_platform.is_hpu(): - return self.forward_hpu elif current_platform.is_tpu(): return self.forward_tpu elif current_platform.is_xpu(): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index da772c1115..b3cee55e8b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -475,39 +475,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation, ) - def forward_hpu( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - ) -> torch.Tensor: - assert not use_grouped_topk - assert num_expert_group is None - assert topk_group is None - assert custom_routing_function is None - assert layer is not None - assert apply_router_weight_on_input is False - if scoring_func != "softmax": - raise NotImplementedError( - "Only softmax scoring function is supported for HPU.") - if e_score_correction_bias is not None: - raise NotImplementedError( - "Expert score correction bias is not supported for HPU.") - return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight, - router_logits, top_k) - def forward_tpu( self, layer: torch.nn.Module, @@ -716,9 +683,6 @@ class FusedMoE(torch.nn.Module): if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " "non-grouped topk.") - if current_platform.is_hpu(): - from vllm_hpu_extension.ops import DynamicFusedMOE - self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts) if vllm_config.model_config is not None: model_dtype = vllm_config.model_config.dtype diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index e8d1fd6355..a5fc1db2dc 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -170,26 +170,6 @@ class RMSNorm(CustomOp): else: return norm_func(x, self.weight.data, self.variance_epsilon) - def forward_hpu( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - from vllm_hpu_extension.kernels import rms_norm - HPUFusedRMSNorm = rms_norm() - if HPUFusedRMSNorm is None: - return self.forward_native(x, residual) - if residual is not None: - orig_shape = x.shape - residual += x.view(residual.shape) - # Note: HPUFusedRMSNorm requires 3D tensors as inputs - x = HPUFusedRMSNorm.apply(residual, self.weight, - self.variance_epsilon) - return x.view(orig_shape), residual - - x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon) - return x - def forward_xpu( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index a4615132a5..dddd4d6a71 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -229,64 +229,6 @@ class RotaryEmbedding(CustomOp): 
self.cos_sin_cache, self.is_neox_style) return query, key - def forward_hpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - from habana_frameworks.torch.hpex.kernels import ( - RotaryPosEmbeddingMode, apply_rotary_pos_emb) - if offsets is not None: - offsets = offsets.view(positions.shape[0], -1) - positions = positions + offsets - positions = positions.flatten() - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions).view( - num_tokens, 1, -1) - cos, sin = cos_sin.chunk(2, dim=-1) - # HPU RoPE kernel requires hidden dimension for cos and sin to be equal - # to query hidden dimension, so the original tensors need to be - # expanded - # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE - # and expansion of cos/sin tensors via concatenation - # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE - # and expansion of cos/sin tensors via repeat_interleave - rope_mode: RotaryPosEmbeddingMode - if self.is_neox_style: - rope_mode = RotaryPosEmbeddingMode.BLOCKWISE - cos = torch.cat((cos, cos), dim=-1) - sin = torch.cat((sin, sin), dim=-1) - else: - rope_mode = RotaryPosEmbeddingMode.PAIRWISE - sin = torch.repeat_interleave(sin, - 2, - dim=-1, - output_size=cos_sin.shape[-1]) - cos = torch.repeat_interleave(cos, - 2, - dim=-1, - output_size=cos_sin.shape[-1]) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - query_rot = query[..., :self.rotary_dim] - query_pass = query[..., self.rotary_dim:] - query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, - rope_mode) - query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) - - if key is not None: - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - key_rot = key[..., :self.rotary_dim] - key_pass = key[..., self.rotary_dim:] - key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, - rope_mode) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - def forward_neuron( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index f35f969781..a5f262c832 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -388,20 +388,8 @@ class VocabParallelEmbedding(torch.nn.Module): # Copy the data. Select chunk corresponding to current shard. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - - if current_platform.is_hpu(): - # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, - # so we're using a workaround. Remove this when fixed in - # HPU PT bridge. 
- padded_weight = torch.cat([ - loaded_weight, - torch.zeros(param.shape[0] - loaded_weight.shape[0], - *loaded_weight.shape[1:]) - ]) - param.data.copy_(padded_weight) - else: - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + param[:loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0]:].data.fill_(0) def forward(self, input_): if self.tp_size > 1: diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 907bc3c136..68fcb78569 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -199,10 +199,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): if self.pre_quant: if self.load_8bit: - if current_platform.is_hpu(): - raise ValueError( - "currently hpu supports 4bit quantization only") - return self._quantized_8bit_generator( hf_weights_files, use_safetensors, quant_state_dict), quant_state_dict @@ -306,10 +302,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): in temp_state_dict): quant_state = _parse_quant_state(mapped_weight_name, temp_state_dict) - if current_platform.is_hpu(): - assert quant_state.quant_type == "nf4", ( - "currently hpu supports nf4 quant_type only") - quant_state_dict[mapped_weight_name] = quant_state yield org_weight_name, weight_tensor else: @@ -380,8 +372,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...] # bitsandbytes requires data in GPU - if (weight_sub_tensor.is_cuda - or weight_sub_tensor.device.type == "hpu"): + if weight_sub_tensor.is_cuda: loaded_weight = weight_sub_tensor else: loaded_weight = weight_sub_tensor.to( diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 4624ff01dd..2fcae7eb6e 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -218,16 +218,6 @@ class DefaultModelLoader(BaseModelLoader): weights_iterator = _xla_weights_iterator(weights_iterator) - elif current_platform.is_hpu(): - import habana_frameworks.torch.core as htcore - - def _hpu_weights_iterator(iterator: Generator): - for weights in iterator: - yield weights - htcore.mark_step() - - weights_iterator = _hpu_weights_iterator(weights_iterator) - if self.counter_before_loading_weights == 0.0: self.counter_before_loading_weights = time.perf_counter() # Apply the prefix. 
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 7b8953fd75..c13659f8a0 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -116,23 +116,6 @@ def rocm_platform_plugin() -> Optional[str]: return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None -def hpu_platform_plugin() -> Optional[str]: - is_hpu = False - logger.debug("Checking if HPU platform is available.") - try: - from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None - if is_hpu: - logger.debug("Confirmed HPU platform is available.") - else: - logger.debug("HPU platform is not available because " - "habana_frameworks is not found.") - except Exception as e: - logger.debug("HPU platform is not available because: %s", str(e)) - - return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None - - def xpu_platform_plugin() -> Optional[str]: is_xpu = False logger.debug("Checking if XPU platform is available.") @@ -208,7 +191,6 @@ builtin_platform_plugins = { 'tpu': tpu_platform_plugin, 'cuda': cuda_platform_plugin, 'rocm': rocm_platform_plugin, - 'hpu': hpu_platform_plugin, 'xpu': xpu_platform_plugin, 'cpu': cpu_platform_plugin, 'neuron': neuron_platform_plugin, diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py deleted file mode 100644 index 3faf481087..0000000000 --- a/vllm/platforms/hpu.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from typing import TYPE_CHECKING, Optional - -import torch - -from vllm import envs -from vllm.logger import init_logger -from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS - -from .interface import Platform, PlatformEnum, _Backend - -if TYPE_CHECKING: - from vllm.config import VllmConfig -else: - VllmConfig = None - -logger = init_logger(__name__) - - -class HpuPlatform(Platform): - _enum = PlatformEnum.HPU - device_name: str = "hpu" - device_type: str = "hpu" - dispatch_key: str = "HPU" - ray_device_key: str = "HPU" - dist_backend: str = "hccl" - device_control_env_var: str = "HABANA_VISIBLE_MODULES" - - @classmethod - def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, - dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: - logger.info("Using HPUAttention backend.") - return "vllm.attention.backends.hpu_attn.HPUAttentionBackend" - - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return True - - @classmethod - def inference_mode(cls): - return torch.no_grad() - - @classmethod - def set_device(cls, device: torch.device) -> None: - """ - Set the device for the current platform. 
- """ - torch.hpu.set_device(device) - - @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - - scheduler_config = vllm_config.scheduler_config - parallel_config = vllm_config.parallel_config - if scheduler_config.is_multi_step: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker" - - if vllm_config.speculative_config is not None: - raise NotImplementedError( - "Speculative decoding is not implemented for HPU") - - if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" - - # NOTE(kzawora): default block size for Gaudi should be 128 - # smaller sizes still work, but very inefficiently - cache_config = vllm_config.cache_config - if cache_config and cache_config.block_size is None: - cache_config.block_size = 128 - if (parallel_config.distributed_executor_backend == 'mp' - and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): - if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", - None) is not None: - logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " - "might cause application hangs on exit. Using " - "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " - "as it was explicitly requested.") - else: - logger.warning( - "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " - "might cause application hangs on exit. Setting " - "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "To override that behavior, please set " - "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - if vllm_config.model_config and vllm_config.model_config.use_mla: - logger.info( - "MLA is enabled on a non-GPU platform; forcing chunked " - "prefill and prefix caching to be disabled.") - vllm_config.scheduler_config.enable_chunked_prefill = False - vllm_config.scheduler_config.chunked_prefill_enabled = False - vllm_config.scheduler_config.max_num_batched_tokens = max( - vllm_config.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) - - @classmethod - def is_pin_memory_available(cls): - logger.warning("Pin memory is not supported on HPU.") - return False - - @classmethod - def get_punica_wrapper(cls) -> str: - return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU" - - @classmethod - def get_device_communicator_cls(cls) -> str: - return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator" # noqa diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index ae675bcc8d..b8e788de11 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -54,7 +54,6 @@ class _Backend(enum.Enum): FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 CUTLASS_MLA_VLLM_V1 = enum.auto() - HPU_ATTN = enum.auto() PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() IPEX = enum.auto() @@ -69,7 +68,6 @@ class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() - HPU = enum.auto() XPU = enum.auto() CPU = enum.auto() NEURON = enum.auto() @@ -154,9 +152,6 @@ class Platform: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU - def is_hpu(self) -> bool: - return self._enum == PlatformEnum.HPU - def is_xpu(self) -> bool: return self._enum == PlatformEnum.XPU diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 2cb177b9ba..51c78ddc1a 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging -import os from typing import Any, Callable import torch @@ 
-75,18 +74,6 @@ def load_general_plugins(): if current_platform.is_xpu(): # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 torch._dynamo.config.disable = True - elif current_platform.is_hpu(): - # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) - # does not support torch.compile - # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for - # torch.compile support - is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' - if is_lazy: - torch._dynamo.config.disable = True - # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) - # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 - os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py deleted file mode 100644 index 5860368298..0000000000 --- a/vllm/worker/hpu_model_runner.py +++ /dev/null @@ -1,2320 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -############################################################################### - -import collections -import contextlib -import dataclasses -import functools -import gc -import itertools -import math -import os -import time -from array import array -from enum import Enum, IntEnum -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, - Optional, Set, Tuple, Type, TypeVar, Union) - -import habana_frameworks.torch as htorch -import habana_frameworks.torch.internal.bridge_config as bc -import torch -import torch.nn as nn -import vllm_hpu_extension.environment as environment -from vllm_hpu_extension.bucketing.common import get_bucketing_context -from vllm_hpu_extension.ops import LoraMask as LoraMask -from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, - HabanaMemoryProfiler, format_bytes) - -import vllm.envs as envs -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import DeviceConfig, VllmConfig -from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.parallel_state import get_world_group -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.sampling_metadata import SequenceGroupToSample -from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceData, SequenceGroupMetadata, - SequenceOutput) -from vllm.utils import (bind_kv_cache, is_pin_memory_available, - make_tensor_with_pad) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, - 
_add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -_TYPE_CACHE = {} -# These values are assumed to be zero in several places. -# Use caution when updating them! -_PAD_SLOT_ID = 0 -_PAD_BLOCK_ID = 0 - -LORA_WARMUP_RANK = 8 - -DUMMY_TOKEN_ID = -1 - - -class PhaseType(Enum): - PREFILL = 'prefill' - PREFIX_PREFILL = 'prefix_prefill' - DECODE = 'decode' - - -def subtuple(obj: object, - typename: str, - to_copy: List[str], - to_override: Optional[Dict[str, object]] = None): - if obj is None: - return None - if to_override is None: - to_override = {} - fields = set(to_copy) | set(to_override.keys()) - if type(obj) is dict: - values = {key: obj[key] for key in fields if key in obj} - else: - values = {f: to_override.get(f, getattr(obj, f)) for f in fields} - if typename not in _TYPE_CACHE: - _TYPE_CACHE[typename] = collections.namedtuple(typename, - ' '.join(fields)) - return _TYPE_CACHE[typename](**values) - - -def round_up(value: int, k: int): - return (value + k - 1) // k * k - - -def align_workers(value, op): - group = get_world_group().cpu_group - world_size = torch.distributed.get_world_size() - if world_size <= 1: - return value - value_t = torch.tensor(value, device='cpu') - torch.distributed.all_reduce(value_t, op=op, group=group) - return value_t.item() - - -def setup_profiler(): - schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1) - DEVICE = 'hpu' - activities = [torch.profiler.ProfilerActivity.CPU] - activities.extend([torch.profiler.ProfilerActivity.HPU] if DEVICE == - 'hpu' else []) - #from habana_frameworks.torch.activity_profiler import DebugActivity - #debug_activities=[DebugActivity.BRIDGE_FUNCTION_CALLS] - - profiler = torch.profiler.profile( - schedule=schedule, - activities=activities, - #debug_activities=debug_activities, - on_trace_ready=torch.profiler.tensorboard_trace_handler('.', - use_gzip=True), - record_shapes=False, - with_stack=True) - return profiler - - -def pad_list(input, k, v): - input_len = len(input) - target_len = round_up(input_len, k) - padding = target_len - input_len - return input + [v] * padding - - -def gather_list(input, indices, v): - return [input[i] if i is not None else v for i in indices] - - -def flatten(in_list): - return list(itertools.chain(*in_list)) - - -def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - if is_prompt: - indices = indices.unflatten(0, (-1, block_size))[:, 0] - offsets = None - else: - offsets = torch.fmod(slot_mapping, block_size) - return indices, offsets - - -def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"): - if module.__class__.__name__.endswith(suffix): - - def forward_hook(module, args, output): - htorch.core.mark_step() - return output - - module.register_forward_hook(forward_hook) - - for child_name, child_module in module.named_children(): - modify_decoder_layer(child_module) - - -class HpuModelAdapter: - - def __init__(self, model, vllm_config): - self.model = model - self.sampler = get_sampler() - self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] - self.vllm_config = vllm_config - self.block_size = vllm_config.cache_config.block_size - self.dtype = 
vllm_config.model_config.dtype - enforce_eager = vllm_config.model_config.enforce_eager - - if not htorch.utils.internal.is_lazy() and not enforce_eager: - if os.getenv('VLLM_REGIONAL_COMPILATION', - 'true').lower() == 'true': - self.regional_compilation_layers_list = [ - RMSNorm, VocabParallelEmbedding - ] - self._regional_compilation(self.model) - else: - self.model = torch.compile(self.model, - backend='hpu_backend', - dynamic=False) - - def _regional_compilation(self, - module, - parent_module=None, - module_name=None): - if isinstance(module, torch.nn.ModuleList): - for children_name, children_module in module.named_children(): - self._compile_region(module, children_name, children_module) - elif any( - isinstance(module, layer) - for layer in self.regional_compilation_layers_list): - self._compile_region(parent_module, module_name, module) - else: - for children_name, children_module in module.named_children(): - self._regional_compilation(children_module, module, - children_name) - - def _compile_region(self, model, name, module): - module = torch.compile(module, backend='hpu_backend', dynamic=False) - setattr(model, name, module) - - def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, - dtype): - if (attn_metadata is None - or (self.prefill_use_fusedsdpa \ - and attn_metadata.block_list is None) - or not attn_metadata.is_prompt): - return attn_metadata - - prefill_metadata = attn_metadata - - seq_lens_t = prefill_metadata.seq_lens_tensor - context_lens_t = prefill_metadata.context_lens_tensor - query_lens_t = seq_lens_t - context_lens_t - - block_list = attn_metadata.block_list - max_context_len = (block_list.size(-1) // - batch_size if block_list is not None else 0) - max_context_len = max_context_len * self.block_size - past_mask = torch.arange(0, - max_context_len, - dtype=torch.int32, - device=device) - past_mask = (past_mask.view(1, -1).expand(batch_size, -1).ge( - context_lens_t.view(-1, 1)).view(batch_size, 1, -1).expand( - batch_size, seq_len, -1).view(batch_size, 1, seq_len, -1)) - - len_mask = (torch.arange(0, seq_len, device=device, - dtype=torch.int32).view(1, seq_len).ge( - query_lens_t.unsqueeze(-1)).view( - batch_size, 1, 1, seq_len)) - causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), - device=device, - dtype=torch.bool), - diagonal=1) - mask = causal_mask.logical_or(len_mask) - mask = torch.concat((past_mask, mask), dim=-1) - attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( - mask, -math.inf)) - attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) - return attn_metadata - - def _set_block_mapping(self, metadata, batch_size, device, dtype): - mask = torch.arange(0, - self.block_size, - device=device, - dtype=torch.int32).unsqueeze(0) - mask = mask >= metadata.block_usage.unsqueeze(-1) - attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( - mask, -math.inf)) - if os.environ.get('VLLM_USE_FAKE_HPU', - '0') == '0' and htorch.utils.internal.is_lazy(): - block_mapping = torch.nn.functional.one_hot(metadata.block_groups, - num_classes=batch_size) - else: - # Unfortunately one_hot on CPU/torch.compile mode/eager mode - # doesn't handle out of bounds classes so we need to convert - # all negative values to 0 (block_mapping) or bs (block_groups) - block_groups = metadata.block_groups.to(torch.long) - block_mapping = torch.nn.functional.relu(block_groups) - block_mapping = torch.nn.functional.one_hot(block_mapping, - num_classes=batch_size) - oob_values = block_groups.lt(0) - 
block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) - block_groups.masked_fill_(oob_values, batch_size) - metadata = metadata._replace(block_groups=block_groups) - block_mapping = block_mapping.to(dtype) - metadata = metadata._replace(block_mapping=block_mapping, - attn_bias=attn_bias) - return metadata - - def _update_metadata(self, attn_metadata, batch_size, seq_len, device, - dtype): - if attn_metadata.is_prompt: - meta = attn_metadata - attn_metadata = self._set_attn_bias(meta, batch_size, seq_len, - device, dtype) - else: - meta = attn_metadata - attn_metadata = self._set_block_mapping(meta, batch_size, device, - dtype) - return attn_metadata - - def forward(self, *args, **kwargs): - kwargs = kwargs.copy() - selected_token_indices = kwargs.pop('selected_token_indices') - if 'warmup_mode' in kwargs: - kwargs.pop('warmup_mode') - virtual_engine = 0 - if 'virtual_engine' in kwargs: - virtual_engine = kwargs.pop('virtual_engine') - input_ids = kwargs['input_ids'] - attn_metadata = self._update_metadata(kwargs.pop('attn_metadata'), - input_ids.size(0), - input_ids.size(1), - input_ids.device, self.dtype) - LoraMask.setLoraMask(kwargs.pop('lora_mask')) - with set_forward_context(attn_metadata, self.vllm_config, - virtual_engine): - hidden_states = self.model(*args, **kwargs) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - hidden_states = hidden_states.index_select(0, - selected_token_indices) - return hidden_states - - def compute_logits(self, *args, **kwargs): - return self.model.compute_logits(*args, **kwargs) - - def sample(self, *args, **kwargs): - return self.sampler(*args, **kwargs) - - -class PreparePromptMetadata(NamedTuple): - input_tokens: torch.Tensor - input_positions: List[List[int]] - attn_metadata: Optional[AttentionMetadata] - seq_lens: List[int] - query_lens: List[int] - lora_index_mapping: List[List[int]] - lora_prompt_mapping: List[List[int]] - lora_requests: Set[LoRARequest] - multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] - slot_mapping: List[List[int]] - lora_ids: List[int] - - @classmethod - def empty(cls): - return PreparePromptMetadata(input_tokens=[], - input_positions=[], - attn_metadata=None, - seq_lens=[], - query_lens=[], - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - multi_modal_kwargs=None, - slot_mapping=[], - lora_ids=[]) - - -class PrepareDecodeMetadata(NamedTuple): - input_tokens: torch.Tensor - input_positions: List[List[int]] - attn_metadata: Optional[AttentionMetadata] - lora_index_mapping: List[List[int]] - lora_prompt_mapping: List[List[int]] - lora_requests: Set[LoRARequest] - slot_mapping: List[List[int]] - lora_ids: List[int] - - @classmethod - def empty(cls): - return PrepareDecodeMetadata(input_tokens=[], - input_positions=[], - attn_metadata=None, - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - slot_mapping=[], - lora_ids=[]) - - -# How batches are constructed. -class BatchType(IntEnum): - # Every batch is prefill. - PREFILL = 0 - # Every batch is decode. - DECODE = 1 - # Batch is a mixture of prefill and decode. - MIXED = 2 - - -TModelInputForHPU = TypeVar('TModelInputForHPU', bound="ModelInputForHPU") - - -@dataclasses.dataclass(frozen=True) -class ModelInputForHPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. 
- """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None - real_batch_size: Optional[int] = None - batch_size_padded: Optional[int] = None - virtual_engine: int = 0 - lora_ids: Optional[List[int]] = None - async_callback: Optional[Callable] = None - is_first_multi_step: bool = True - is_last_step: bool = True - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "real_batch_size": self.real_batch_size, - "batch_size_padded": self.batch_size_padded, - "virtual_engine": self.virtual_engine, - "lora_ids": self.lora_ids, - "is_first_multi_step": self.is_first_multi_step, - "is_last_step": self.is_last_step, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForHPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForHPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. - is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "lora_ids": self.lora_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForHPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - # FIXME(kzawora): this fails for whatever reason - why? - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): - """ - Helper class for shared methods between GPU model runners. 
- """ - _model_input_cls: Type[TModelInputForHPU] - - def __init__( - self, - vllm_config: VllmConfig, - is_driver_worker: bool = False, - return_hidden_states: bool = False, - ): - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - environment.set_model_config(self.model_config) - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.sliding_window = (self.model_config.get_sliding_window() - if self.model_config is not None else None) - self.device_config = (self.device_config if self.device_config - is not None else DeviceConfig()) - self.device = self.device_config.device - self.enforce_eager = self.model_config.enforce_eager - self.max_num_seqs = self.scheduler_config.max_num_seqs - # NOTE(kzawora): Change that to scheduler_config.max_num_prefill_seqs - # once padding-aware scheduling gets merged - self.max_num_prefill_seqs = 64 - self.max_model_len = self.scheduler_config.max_model_len - self.max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - self.block_size = self.cache_config.block_size - - self.pin_memory = is_pin_memory_available() - self.kv_cache_dtype = self.cache_config.cache_dtype - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) - - # Lazy initialization - self.lora_manager: LRUCacheWorkerLoRAManager = None - self.model: torch.nn.Module = None - self.inc_initialized_successfully = False - - # Profiler stats - self.profiler = HabanaHighLevelProfiler() - self.profiler_counter_helper = HabanaProfilerCounterHelper() - self.seen_configs: set = set() - self._mem_margin: Optional[int] = None - HPUBucketingContext = get_bucketing_context() - self.bucketing_ctx = HPUBucketingContext(self.max_num_seqs, - self.max_num_prefill_seqs, - self.block_size, - self.max_num_batched_tokens, - False, self.max_model_len) - self.graphed_buckets: Set[Any] = set() - self._set_gc_threshold() - if self.vllm_config.cache_config.enable_prefix_caching: - os.environ.setdefault("VLLM_CONTIGUOUS_PA", "False") - assert os.environ.get( - "VLLM_CONTIGUOUS_PA", - "").lower() != "true", "Contiguous PA doesn't support APC" - self.use_contiguous_pa = envs.VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH - - # For multi-step scheduling - self.cached_step_outputs: List[torch.Tensor] = [] - # For delayed sampling - self.cached_step_inputs: List[ - ModelInputForHPUWithSamplingMetadata] = [] - - def _set_gc_threshold(self) -> None: - # Read https://docs.python.org/3/library/gc.html#gc.set_threshold - # for comprehensive description of gc generations. - # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) - # to set particular generation threshold or use simpler - # VLLM_GC_THR_MULTIPLIER to multiply default values. 
- default_gc_thrs = list(gc.get_threshold()) - requested_gc_thrs = [0] * len(default_gc_thrs) - for i in range(len(default_gc_thrs)): - requested_gc_thrs[i] = int( - os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) - if requested_gc_thrs == default_gc_thrs: - gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', - 2)) - requested_gc_thrs = [ - t * gc_thr_multiplier for t in default_gc_thrs - ] - gc.set_threshold(*requested_gc_thrs) - - self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', - 'false').lower() == 'true' - - def load_model(self) -> None: - import habana_frameworks.torch.core as htcore - if self.model_config.quantization == 'inc' or \ - self.model_config.quantization == 'fp8': - htcore.hpu_set_env() - with HabanaMemoryProfiler() as m: - with HabanaMemoryProfiler() as m_getmodel: - self.model = get_model(vllm_config=self.vllm_config) - msg = ("Pre-loading model weights on " - f"{next(self.model.parameters()).device} " - f"took {m_getmodel.get_summary_string()}") - logger.info(msg) - - if self.lora_config: - assert hasattr(self.model, "embedding_modules" - ), "Model does not have embedding_modules" - assert hasattr( - self.model, "embedding_padding_modules" - ), "Model does not have embedding_padding_modules" - assert not self.lora_config.bias_enabled, \ - "Bias support in LoRA is not enabled in HPU yet." - assert not self.lora_config.fully_sharded_loras, \ - "Fully sharded LoRAs is not enabled in HPU yet." - - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - max_position_embeddings=text_config. 
- max_position_embeddings, - ) - self.model = self.lora_manager.create_lora_manager(self.model) - - if self.model_config.quantization == 'inc': - logger.info("Preparing model with INC..") - with HabanaMemoryProfiler() as m_inc: - from neural_compressor.torch.quantization import ( - FP8Config, convert, prepare) - config = FP8Config.from_json_file( - os.getenv("QUANT_CONFIG", "")) - if config.measure: - self.model = prepare(self.model, config) - elif config.quantize: - self.model = convert(self.model, config) - htcore.hpu_initialize(self.model, - mark_only_scales_as_const=True) - self.inc_initialized_successfully = True - logger.info("Preparing model with INC took %s", - m_inc.get_summary_string()) - else: - self.model = self.model.to("hpu") - htcore.mark_step() - modify_decoder_layer(self.model) - torch.hpu.synchronize() - - with HabanaMemoryProfiler() as m_wrap: - self.model = _maybe_wrap_in_hpu_graph( - self.model, vllm_config=self.vllm_config) - msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" - logger.info(msg) - - self.model_memory_usage = m.consumed_device_memory - msg = f"Loading model weights took in total {m.get_summary_string()}" - logger.info(msg) - - def _add_dummy_seq(self, seq_group_metadata_list, is_prompt): - real_batch_size = len(seq_group_metadata_list) - batch_size_padded = self.bucketing_ctx.get_padded_batch_size( - real_batch_size, is_prompt) - batch_size_padding = batch_size_padded - real_batch_size - - seq_group_metadata_list = seq_group_metadata_list.copy() - - if batch_size_padding > 0: - dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( - 0, 0, is_prompt) - seq_group_metadata_list.extend(dummy_seq_group_metadata - for _ in range(batch_size_padding)) - return seq_group_metadata_list, real_batch_size, batch_size_padded - - def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): - return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True - ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( - *args, **kwargs) - - def get_model(self) -> nn.Module: - return self.model - - def _use_graphs(self, batch_size, seq_len, is_prompt): - if self.enforce_eager: - return False - if self.skip_warmup: - return True - return (batch_size, seq_len, is_prompt) in self.graphed_buckets - - def _is_valid_bucket(self, bucket): - return bucket[0] * bucket[1] <= self.max_num_batched_tokens - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> PreparePromptMetadata: - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - lora_index_mapping: List[List[int]] = [] - lora_prompt_mapping: List[List[int]] = [] - lora_requests: Set[LoRARequest] = set() - - seq_lens: List[int] = [] - context_lens: List[int] = [] - query_lens: List[int] = [] - prefix_block_tables: List[List[int]] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - - if len(seq_group_metadata_list) == 0: - return PreparePromptMetadata.empty() - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - computed_block_nums = seq_group_metadata.computed_block_nums - if (self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled - and not (computed_block_nums is None - or computed_block_nums == [])): - raise RuntimeError( - "chunked prefill cannot be used with prefix caching " - "now.") - - 
token_chunk_size = seq_group_metadata.token_chunk_size - seq_data = seq_group_metadata.seq_data[seq_id] - context_len = seq_data.get_num_computed_tokens() - # We should use get_len here because in case of preemption - # it contains output tokens. - seq_len = min(seq_data.get_len(), context_len + token_chunk_size) - prompt_tokens = seq_data.get_token_ids()[context_len:seq_len] - seq_lens.append(seq_len) - - # NOTE: This only works for oooooooxxx style attention. - if computed_block_nums is not None and len( - computed_block_nums) > 0 and self.sliding_window is None: - # Prefix is not supported with sliding_window - context_len = len(computed_block_nums) * self.block_size - if context_len == seq_len \ - and self.vllm_config.cache_config.enable_prefix_caching: - # Fully cached prompt - compute only last token - context_len = context_len - 1 - prompt_tokens = prompt_tokens[context_len:] - prefix_block_tables.append(computed_block_nums) - elif self.scheduler_config.chunked_prefill_enabled: - if seq_group_metadata.block_tables is not None: - # Prefill has chunked before. - block_table = seq_group_metadata.block_tables[seq_id] - prefix_block_tables.append(block_table) - else: - # The first prefill. - prefix_block_tables.append([]) - else: - prefix_block_tables.append([]) - # Right now, prefill start is always 0. However, this - # assumption can be changed once chunked prefill is introduced. - assert context_len == 0 - - # actual prompt lens - context_lens.append(context_len) - query_lens.append(seq_len - context_len) - input_tokens.append(prompt_tokens) - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - input_positions.append(list(range(context_len, seq_len))) - - mm_kwargs = seq_group_metadata.multi_modal_data - if mm_kwargs: - multi_modal_kwargs_list.append(mm_kwargs) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. - slot_mapping.append([_PAD_SLOT_ID] * seq_len) - continue - - # Compute the slot mapping. - slot_mapping.append([]) - block_table = seq_group_metadata.block_tables[seq_id] - - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, seq_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
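# A self-contained sketch reproducing the worked example above. The
# block_table values are hypothetical, and _PAD_SLOT_ID is taken to be -1
# to match the example:
_PAD_SLOT_ID = -1
block_size, sliding_window, seq_len = 4, 8, 10
block_table = [0, 1, 0]        # hypothetical physical block numbers
start_idx = max(0, seq_len - sliding_window)    # -> first 2 tokens masked
slot_mapping = [
    _PAD_SLOT_ID if i < start_idx else
    block_table[i // block_size] * block_size + i % block_size
    for i in range(seq_len)
]
assert slot_mapping == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]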
- start_idx = 0 - if self.sliding_window is not None: - assert context_len == 0, ( - "Prefix caching is currently not supported with " - "sliding window attention") - start_idx = max(0, seq_len - self.sliding_window) - for i in range(context_len, seq_len): - if i < start_idx: - slot_mapping[-1].append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping[-1].append(slot) - - max_query_len = max(query_lens) - sum_query_len = sum(query_lens) - real_num_seqs = len(query_lens) - assert max_query_len > 0 - - max_prompt_len = max( - self.bucketing_ctx.get_padded_prompt_seq_len(max_query_len), - self.block_size) - - lora_ids: List[int] = [] - for seq_group_metadata, context_len in zip(seq_group_metadata_list, - context_lens): - lora_id = seq_group_metadata.lora_int_id - lora_ids.append(lora_id) - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping += [lora_id] * max_prompt_len - lora_prompt_mapping.extend( - [lora_id] * - (max_prompt_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - - if any(context_lens): - assert not self.scheduler_config.chunked_prefill_enabled - # prefix caching - - max_num_block = max(len(bt) for bt in prefix_block_tables) - prefix_block_list = list( - itertools.chain.from_iterable( - bt if len(bt) == max_num_block else bt + - ([_PAD_BLOCK_ID] * (max_num_block - len(bt))) - for bt in prefix_block_tables)) - - pad_len = len(prefix_block_list) - prefix_block_list = pad_list(prefix_block_list, pad_len, - _PAD_BLOCK_ID) - - prefix_block_list_tensor = torch.tensor(prefix_block_list, - dtype=torch.long, - device=self.device) - else: - prefix_block_list_tensor = None - - input_tokens = make_tensor_with_pad(input_tokens, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - - input_positions = make_tensor_with_pad(input_positions, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - - slot_mapping = make_tensor_with_pad(slot_mapping, - max_len=max_prompt_len, - pad=_PAD_SLOT_ID, - dtype=torch.long, - device=self.device) - - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.long, - device=self.device) - - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.long, - device=self.device) - - block_indices, block_offsets = precompute_indices_and_offsets( - self.block_size, slot_mapping, True) - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - block_list=prefix_block_list_tensor, - block_mapping=None, - block_usage=None, - block_indices=block_indices, - block_offsets=block_offsets, - block_groups=None, - attn_bias=None, - seq_lens_tensor=seq_lens_tensor, - context_lens_tensor=context_lens_tensor, - num_prefills=real_num_seqs, - num_prefill_tokens=sum_query_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps= - None, # FIXME(kzawora): multi-modality will not work here - enable_kv_scales_calculation=False, - ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return PreparePromptMetadata(input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - slot_mapping=slot_mapping, - lora_ids=lora_ids) - - def _prepare_decode( 
- self, - seq_group_metadata_list: List[SequenceGroupMetadata], - output=None, - ) -> PrepareDecodeMetadata: - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - lora_index_mapping: List[List[int]] = [] - lora_prompt_mapping: List[List[int]] = [] - lora_requests: Set[LoRARequest] = set() - - if len(seq_group_metadata_list) == 0: - return PrepareDecodeMetadata.empty() - lora_ids: List[int] = [] - - dummy_slots = itertools.cycle( - range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 - - seq_ids = list(seq_group_metadata.seq_data.keys()) - lora_id = seq_group_metadata.lora_int_id - lora_ids.append(lora_id) - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - if output is None: - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - - seq_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - seq_lens.append(seq_len) - - block_table = seq_group_metadata.block_tables[seq_id] - num_fully_occupied_blocks = position // self.block_size - block_table = block_table[:num_fully_occupied_blocks + 1] - - if len(block_table) == 0: - block_number = _PAD_BLOCK_ID - else: - block_number = block_table[position // self.block_size] - if block_number == _PAD_BLOCK_ID: - slot = next(dummy_slots) - else: - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append([slot]) - lora_index_mapping.append(lora_id) - lora_prompt_mapping.append(lora_id) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - if output is None: - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - else: - real_batch_size = len(seq_group_metadata_list) - input_tokens = output[:real_batch_size] - - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) - - num_decode_tokens = sum(seq_lens) - - last_block_usage = [ - slot[0] % self.block_size + 1 for slot in slot_mapping - ] - block_groups = [[i] * len(bt) for i, bt in enumerate(block_tables)] - block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] - for bt, lbu in zip(block_tables, last_block_usage) - if bt] - - block_list = flatten(block_tables) - block_groups = flatten(block_groups) - block_usage = flatten(block_usage) - - assert len(block_list) == len(block_groups) - assert len(block_list) == len(block_usage) - - padding_fn = None - if self.use_contiguous_pa: - block_bucket_size = max(max(block_list) + 1, len(block_list)) - block_bucket_size = self.bucketing_ctx.get_padded_decode_num_blocks( - block_bucket_size) - indices: List[Any] - indices = [None] * block_bucket_size - for i, bid in enumerate(block_list): - indices[bid] = i - padding_fn = lambda tensor, pad_value: gather_list( - tensor, indices, pad_value) - else: - block_bucket_size = \ - self.bucketing_ctx.get_padded_decode_num_blocks( - len(block_list)) - padding_fn = lambda tensor, pad_value: pad_list( - tensor, 
block_bucket_size, pad_value) - - block_list = padding_fn(block_list, _PAD_BLOCK_ID) - block_groups = padding_fn(block_groups, -1) - block_usage = padding_fn(block_usage, 1) - - block_list = torch.tensor(block_list, - dtype=torch.int, - device=self.device) - block_groups = torch.tensor(block_groups, - dtype=torch.int, - device=self.device) - block_usage = torch.tensor(block_usage, - dtype=self.model_config.dtype, - device=self.device) - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) - - block_indices, block_offsets = precompute_indices_and_offsets( - self.block_size, slot_mapping, False) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=False, - block_list=block_list, - block_mapping=None, - block_usage=block_usage, - block_indices=block_indices, - block_offsets=block_offsets, - block_groups=block_groups, - attn_bias=None, - seq_lens_tensor=None, - context_lens_tensor=None, - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - ) - return PrepareDecodeMetadata(input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - slot_mapping=slot_mapping, - lora_ids=lora_ids) - - def prepare_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[TModelInputForHPU, SamplingMetadata]: - if len(seq_group_metadata_list) == 0: - return self._model_input_cls(), None - - input_tokens = None - input_positions = None - lora_mapping = None - lora_requests = None - multi_modal_kwargs = None - batch_type = None - seq_lens = None - query_lens = None - real_batch_size = None - batch_size_padded = None - - self.event_start = self.profiler.get_timestamp_us() - is_prompt = seq_group_metadata_list[0].is_prompt - base_event_name = 'prompt' if is_prompt else 'decode' - self.profiler.start('internal', base_event_name) - - seq_group_metadata_list, real_batch_size, batch_size_padded = ( - self._add_dummy_seq(seq_group_metadata_list, is_prompt)) - - prefill_reqs = [] - decode_reqs = [] - for seq_group_meta in seq_group_metadata_list: - if seq_group_meta.is_prompt: - prefill_reqs.append(seq_group_meta) - else: - decode_reqs.append(seq_group_meta) - - # Prepare input tensors. - ( - input_tokens, - input_positions, - prefill_attn_metadata, - seq_lens, - query_lens, - lora_index_mapping, - lora_prompt_mapping, - lora_requests, - multi_modal_kwargs, - slot_mapping, - lora_ids, - ) = self._prepare_prompt(prefill_reqs) - ( - decode_input_tokens, - decode_input_positions, - decode_attn_metadata, - decode_lora_index_mapping, - decode_lora_prompt_mapping, - decode_lora_requests, - decode_slot_mapping, - decode_lora_ids, - ) = self._prepare_decode(decode_reqs) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - seq_lens, query_lens, - self.device, - self.pin_memory) - - if not self.scheduler_config.chunked_prefill_enabled: - assert (len(prefill_reqs) and len(decode_reqs)) == 0 - - num_prefills = len(seq_lens) - num_prefill_tokens = len(input_tokens) - num_decode_tokens = len(decode_input_tokens) - - # NOTE(kzawora): Here we diverge from GPU code - we don't - # support mixed batches, so we either use decode or prefill - # inputs, without coalescing. 
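# A worked example of the selected_token_indices padding fix-up performed a
# few lines below (sizes are hypothetical; assumes a single selected token
# per sequence, namely its last prompt token):
import itertools

query_lens = [3, 5]            # real prompt lengths in this prefill batch
max_len = 8                    # padded (bucketed) prompt length per row
# Last-token indices in the *unpadded*, flattened layout:
selected = [sum(query_lens[:i]) + q - 1 for i, q in enumerate(query_lens)]
assert selected == [2, 7]
# Cumulative padding that precedes each sequence in the padded layout:
paddings = [max_len - q for q in query_lens]        # [5, 3]
paddings = [0] + paddings[:-1]                      # [0, 5]
paddings = list(itertools.accumulate(paddings))     # [0, 5]
adjusted = [s + p for s, p in zip(selected, paddings)]
assert adjusted == [2, 12]     # row 1 starts at index 8, its last token at 12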
- assert (num_prefills == 0 and num_decode_tokens > 0) or ( - num_prefills > 0 - and num_decode_tokens == 0), "HPU does not support mixed batches!" - if num_decode_tokens > 0: - input_tokens = decode_input_tokens - input_positions = decode_input_positions - slot_mapping = decode_slot_mapping - lora_index_mapping = decode_lora_index_mapping - lora_prompt_mapping = decode_lora_prompt_mapping - lora_requests = decode_lora_requests - lora_ids = decode_lora_ids - - # FIXME: We need to adjust selected_token_indices to accommodate - # for padding - max_len = input_tokens.size(1) - paddings = [max_len - q for q in query_lens] - paddings = [0] + paddings[:-1] - paddings = list(itertools.accumulate(paddings)) - paddings_prompt_logprobs = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - if seq_group_metadata.sampling_params.prompt_logprobs is not None \ - and seq_group_metadata.is_prompt: - paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) - paddings = torch.tensor( - paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, - dtype=sampling_metadata.selected_token_indices.dtype, - device=sampling_metadata.selected_token_indices.device) - sampling_metadata.selected_token_indices.add_(paddings) - - if self.lora_config: - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=(num_prefills > 0))) - else: - lora_mapping = None - - if (prefill_attn_metadata is not None - and decode_attn_metadata is not None): - batch_type = BatchType.MIXED - raise NotImplementedError("Mixed batch is not supported on HPU") - elif prefill_attn_metadata is not None: - batch_type = BatchType.PREFILL - else: - batch_type = BatchType.DECODE - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_kwargs": multi_modal_kwargs, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - "batch_type": batch_type, - "seq_lens": seq_lens, - "query_lens": query_lens - } - if prefill_attn_metadata is not None: - metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) - else: - assert decode_attn_metadata is not None - metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) - - attn_metadata = prefill_attn_metadata if \ - prefill_attn_metadata is not None else decode_attn_metadata - - return self._model_input_cls(input_tokens=input_tokens, - seq_lens=seq_lens, - query_lens=query_lens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_requests=lora_requests, - lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_kwargs, - real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded, - lora_ids=lora_ids), \ - sampling_metadata - - def _seq_len(self, attn_metadata): - if attn_metadata.num_prefills != 0: - return attn_metadata.slot_mapping.size(1) - else: - return attn_metadata.block_list.numel() - - def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: - # NOTE(kzawora): To anyone working on this in the future: - # Trimming metadata is required when using HPUGraphs. - # Attention metadata is going to be hashed by PT bridge, and - # appropriate HPUGraphs will be matched based on all inputs' hash. 
- - # Before you put more keys in here, make sure you know their - # value type and make sure you know how it's going to be hashed. - # You can find that information in input_hash function - # in habana_frameworks/torch/hpu/graphs.py. You can also hash - # it manually with torch.hpu.graphs.input_hash(attention_metadata) - - # If you use primitive types here - they will get hashed based - # on their value. You *will* get lots of excessive graph captures - # (and an OOM eventually) if you decide to put something like - # seq_len int here. - # If you absolutely need a scalar, put it in a tensor. Tensors - # get hashed using their metadata, not their values: - # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) - # input_hash(123) != input_hash(321) - # input_hash("abc") != input_hash("cba") - attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ - 'attn_bias', - 'seq_lens_tensor', - 'context_lens_tensor', - 'block_list', - 'block_mapping', - 'block_usage', - 'slot_mapping', - 'is_prompt', - 'block_indices', - 'block_offsets', - 'block_groups', - ]) - return attention_metadata - - def create_dummy_seq_group_metadata(self, - group_id, - seq_len, - is_prompt, - lora_request=None): - sampling_params = SamplingParams(temperature=0) - num_blocks = math.ceil(seq_len / self.block_size) - seq_len = max(seq_len, 1) - if is_prompt: - input_len = seq_len - output_len = 0 - block_tables = None - else: - input_len = seq_len - 1 - output_len = 1 - block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} - prompt_token_ids = [0] * input_len - output_token_ids = [1] * output_len - prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 - seq_data = SequenceData(prompt_token_ids_array) - seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata(request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request) - - def profile_run(self) -> None: - num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers - bind_kv_cache( - self.vllm_config.compilation_config.static_forward_context, - [kv_caches]) - _, max_seq_len = self.bucketing_ctx.get_max_prompt_shape() - max_batch_size = min(self.max_num_seqs, - self.max_num_batched_tokens // max_seq_len) - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, - False, True) - return - - def warmup_scenario(self, - batch_size, - seq_len, - is_prompt, - kv_caches, - is_pt_profiler_run=False, - is_lora_profile_run=False) -> None: - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - scenario_name = ("warmup_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. 
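# A small sketch of the round-robin assignment used below: max_loras dummy
# adapters are spread across the warmup batch (numbers are hypothetical):
max_loras = 2
batch_size = 5
dummy_lora_ids = [idx + 1 for idx in range(max_loras)]           # [1, 2]
per_seq_lora_ids = [
    dummy_lora_ids[idx % len(dummy_lora_ids)] for idx in range(batch_size)
]
assert per_seq_lora_ids == [1, 2, 1, 2, 1]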
- dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config and is_lora_profile_run: - assert self.lora_manager is not None - with self.lora_manager.dummy_lora_cache(): - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_local_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(batch_size) - ] - self.profiler.start('internal', scenario_name) - times = 3 if use_graphs or is_pt_profiler_run else 1 - if is_prompt: - seqs = [ - self.create_dummy_seq_group_metadata( - i, - seq_len, - is_prompt, - lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i in range(batch_size) - ] - else: - # FIXME: seq_len is actually number of blocks - blocks = [seq_len // batch_size for _ in range(batch_size)] - blocks[0] += seq_len % batch_size - seqs = [ - self.create_dummy_seq_group_metadata( - i, - b * self.block_size - 1, - is_prompt, - lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i, b in enumerate(blocks) - ] - torch.hpu.synchronize() - profiler = None - if is_pt_profiler_run and self.is_driver_worker: - profiler = setup_profiler() - profiler.start() - for _ in range(times): - inputs = self.prepare_model_input(seqs) - is_single_step = \ - self.vllm_config.scheduler_config.num_scheduler_steps == 1 - if is_prompt or is_single_step: - self.execute_model(inputs, None, warmup_mode=True) - else: # decode with multi-step - inputs = dataclasses.replace(inputs, - is_first_multi_step=True, - is_last_step=False) - self.execute_model(inputs, - None, - warmup_mode=True, - num_steps=2, - seqs=seqs) - inputs = dataclasses.replace(inputs, - is_first_multi_step=False, - is_last_step=True) - self.execute_model(inputs, - None, - warmup_mode=True, - num_steps=2, - seqs=seqs) - torch.hpu.synchronize() - if profiler: - profiler.step() - if profiler: - profiler.stop() - self.profiler.end() - gc.collect() - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - def log_warmup(self, phase, i, max_i, batch_size, seq_len): - free_mem = format_bytes( - HabanaMemoryProfiler.current_free_device_memory()) - dim = "num_blocks" - if phase == "Prompt": - dim = "seq_len" - 
msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " - f"batch_size:{batch_size} " - f"{dim}:{seq_len} " - f"free_mem:{free_mem}") - logger.info(msg) - - def warmup_all_buckets(self, buckets, is_prompt, kv_caches): - for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - self.log_warmup('Prompt' if is_prompt else 'Decode', i, - len(buckets), batch_size, seq_len) - self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - - def warmup_graphs(self, - strategy, - buckets, - is_prompt, - kv_caches, - available_mem, - starting_mem=0, - total_batch_seq=0.001): - total_mem = starting_mem - idx = 0 - phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' - num_candidates = len(buckets) - ordering : Union[Callable[[Any], Tuple[Any, Any]], \ - Callable[[Any], Tuple[Any, Any, Any]]] - if strategy == 'min_tokens': - ordering = lambda b: (b[0] * b[1], b[1], b[0]) - elif strategy == 'max_bs': - ordering = lambda b: (-b[0], b[1]) - else: - raise NotImplementedError( - f'Unsupported graph allocation strategy: {strategy}') - buckets = list(sorted(buckets, key=ordering)) - captured_all = True - for idx, (batch_size, seq_len) in enumerate(buckets): - # Graph memory usage is proportional to seq dimension in a batch - batch_seq = batch_size * seq_len if is_prompt else batch_size - mem_estimate = batch_seq / total_batch_seq * total_mem - if mem_estimate >= available_mem: - captured_all = False - continue - graphed_bucket = (batch_size, seq_len, is_prompt) - if graphed_bucket in self.graphed_buckets: - continue - self.graphed_buckets.add(graphed_bucket) - self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) - with HabanaMemoryProfiler() as mem_prof: - self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - used_mem = align_workers(mem_prof.consumed_device_memory, - torch.distributed.ReduceOp.MAX) - available_mem -= used_mem - total_mem += used_mem - total_batch_seq += batch_seq - - return total_mem, total_batch_seq, captured_all - - def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): - num_candidates = len(buckets) - phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' - graphed = list(c[:2] for c in self.graphed_buckets - if c[2] == is_prompt) - if num_candidates == 0: - num_candidates = 1 - msg = (f'{phase} captured:{len(graphed)} ' - f'({100 * len(graphed) / num_candidates:.1f}%) ' - f'used_mem:{format_bytes(total_mem)} ' - f'buckets:{sorted(list(graphed))}') - logger.info(msg) - - @torch.inference_mode() - def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: - max_blocks = kv_caches[0][0].size(0) - self.bucketing_ctx.generate_decode_buckets(max_blocks) - if profile := os.environ.get('VLLM_PT_PROFILE', None): - phase, bs, seq_len, graph = profile.split('_') - is_prompt = phase == 'prompt' - graphs = graph == 't' - if graphs: - self.graphed_buckets.add((int(bs), int(seq_len), is_prompt)) - self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, - True) - raise AssertionError("Finished profiling") - if not htorch.utils.internal.is_lazy() and not self.enforce_eager: - cache_size_limit = 1 + 3 * ( - len(self.bucketing_ctx.prompt_buckets) + - len(self.bucketing_ctx.decode_buckets)) - torch._dynamo.config.cache_size_limit = max( - cache_size_limit, torch._dynamo.config.cache_size_limit) - # Multiply by 8 to follow the original default ratio between - # the cache_size_limit and accumulated_cache_size_limit - torch._dynamo.config.accumulated_cache_size_limit = max( - cache_size_limit * 8, - torch._dynamo.config.accumulated_cache_size_limit) - if 
self.skip_warmup: - logger.info("Skipping warmup...") - return - self.profiler.start('internal', 'warmup') - start_mem = HabanaMemoryProfiler.current_device_memory_usage() - start_time = time.perf_counter() - - compile_only_mode_context = functools.partial(bc.env_setting, - "PT_COMPILE_ONLY_MODE", - True) - can_use_compile_only_mode = True - try: - with compile_only_mode_context(): - pass - logger.debug("Using PT_COMPILE_ONLY_MODE.") - except KeyError: - can_use_compile_only_mode = False - logger.warning('Cannot use PT_COMPILE_ONLY_MODE. ' - 'Warmup time will be negatively impacted. ' - 'Please update Gaudi Software Suite.') - with compile_only_mode_context( - ) if can_use_compile_only_mode else contextlib.nullcontext(): - self.warmup_all_buckets(self.bucketing_ctx.prompt_buckets, True, - kv_caches) - self.warmup_all_buckets(self.bucketing_ctx.decode_buckets, False, - kv_caches) - - if not self.enforce_eager and htorch.utils.internal.is_lazy(): - assert self.mem_margin is not None, \ - ("HabanaWorker.determine_num_available_blocks needs " - "to be called before warming up the model.") - free_mem = HabanaMemoryProfiler.current_free_device_memory() - graph_free_mem = free_mem - self.mem_margin - graph_free_mem = align_workers(graph_free_mem, - torch.distributed.ReduceOp.MIN) - prompt_graph_mem_ratio = float( - os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3')) - prompt_available_memory = (prompt_graph_mem_ratio * - graph_free_mem) - decode_available_memory = (graph_free_mem - - prompt_available_memory) - msg = ( - f"Using {format_bytes(graph_free_mem)}" - f"/{format_bytes(free_mem)} " - "of free device memory for HPUGraphs, " - f"{format_bytes(prompt_available_memory)} for prompt and " - f"{format_bytes(decode_available_memory)} for decode " - f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") - logger.info(msg) - prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', - 'min_tokens') - decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', - 'max_bs') - mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ - self.warmup_graphs( - prompt_strategy, self.bucketing_ctx.prompt_buckets, - True, kv_caches, prompt_available_memory) - mem_post_decode, decode_batch_seq, decode_captured_all = \ - self.warmup_graphs( - decode_strategy, self.bucketing_ctx.decode_buckets, - False, kv_caches, decode_available_memory) - - # Not all prompt buckets were captured, but all decode buckets - # were captured and we have some free graph-allocated space - # left. Let's try to use it for capturing more prompt buckets. - if (mem_post_decode + mem_post_prompt < graph_free_mem - and not prompt_captured_all and decode_captured_all): - mem_post_prompt, _, prompt_captured_all = ( - self.warmup_graphs( - prompt_strategy, self.bucketing_ctx.prompt_buckets, - True, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_prompt, prompt_batch_seq)) - - # Not all decode buckets were captured, but all prompt buckets - # were captured and we have some free graph-allocated space - # left. Let's try to use it for capturing more decode buckets. 
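# A worked numeric example of the HPUGraph capture budget split and the
# leftover-memory retries implemented in this method (all numbers are
# hypothetical, in GiB for readability):
graph_free_mem = 10.0                      # free memory minus mem_margin
prompt_graph_mem_ratio = 0.3               # VLLM_GRAPH_PROMPT_RATIO
prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem    # 3.0
decode_available_memory = graph_free_mem - prompt_available_memory   # 7.0
mem_post_prompt = 2.5                      # memory used by captured prompt graphs
mem_post_decode = 4.0                      # memory used by captured decode graphs
# Budget handed to a second capture pass when one phase was fully captured:
leftover = graph_free_mem - mem_post_prompt - mem_post_decode        # 3.5
assert leftover > 0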
- if mem_post_decode + mem_post_prompt < graph_free_mem \ - and not decode_captured_all \ - and prompt_captured_all: - mem_post_decode, _, _ = self.warmup_graphs( - decode_strategy, self.bucketing_ctx.decode_buckets, - False, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_decode, decode_batch_seq) - - self.log_graph_warmup_summary( - self.bucketing_ctx.prompt_buckets, True, mem_post_prompt) - self.log_graph_warmup_summary( - self.bucketing_ctx.decode_buckets, False, mem_post_decode) - - end_time = time.perf_counter() - end_mem = HabanaMemoryProfiler.current_device_memory_usage() - elapsed_time = end_time - start_time - msg = ( - f"Warmup finished in {elapsed_time:.0f} secs, " - f"allocated {format_bytes(end_mem - start_mem)} of device memory") - logger.info(msg) - self.profiler.end() - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - @property - def mem_margin(self) -> Optional[int]: - return self._mem_margin - - @mem_margin.setter - def mem_margin(self, value): - self._mem_margin = value - - -def _maybe_wrap_in_hpu_graph(*args, **kwargs): - return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True - ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) - - -class HabanaProfilerCounterHelper: - - def __init__(self): - self.niter = 0 - self.average_real_throughput = None - self.logged_once = False - self.real_seq_lens = [] - self.prompt_seq_lens = [] - - def capture_seq_group_metadata_stats(self, seq_group_metadata_list): - self.real_seq_lens = [ - len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) - for seq_group_metadata in seq_group_metadata_list - for seq_data in seq_group_metadata.seq_data.values() - ] - self.prompt_seq_lens = [ - len(seq_data.prompt_token_ids) - for seq_group_metadata in seq_group_metadata_list - for seq_data in seq_group_metadata.seq_data.values() - ] - - def get_counter_dict(self, cache_config, duration, seq_len, - batch_size_padded, real_batch_size, is_prompt): - throughput = batch_size_padded / (duration / 1e6) - throughput_effective = real_batch_size / (duration / 1e6) - - real_max_seq_len = max(self.real_seq_lens) - real_num_tokens = sum(self.real_seq_lens) - padded_num_tokens = batch_size_padded * seq_len - batch_token_utilization = real_num_tokens / padded_num_tokens - if self.average_real_throughput is None: - self.average_real_throughput = throughput_effective - else: # https://www.heikohoffmann.de/htmlthesis/node134.html - self.average_real_throughput = self.average_real_throughput + 1 / ( - self.niter + 1) * (throughput_effective - - self.average_real_throughput) - phase = "prompt" if is_prompt else "decode" - counters = { - f'{phase}_bucket_batch_size': batch_size_padded, - f'{phase}_batch_size': real_batch_size, - f'{phase}_bucket_seq_len': seq_len, - f'{phase}_seq_len': real_max_seq_len, - f'{phase}_bucket_gen_throughput': throughput, - f'{phase}_real_gen_throughput': throughput_effective, - f'{phase}_batch_token_utilization': batch_token_utilization, - 'average_real_throughput': self.average_real_throughput, - 'engine_iteration': self.niter, - } - self.niter += 1 - if is_prompt: - prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( - duration / 1e6) - prompt_real_in_throughput = sum( - self.prompt_seq_lens) / (duration / 1e6) - counters[ - f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput - counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput - - # KV cache might not be 
created yet (e.g. for profiling run) - if cache_config.num_gpu_blocks is not None and \ - cache_config.num_gpu_blocks != 0: - cache_num_blocks_used = [ - math.ceil(sl / cache_config.block_size) - for sl in self.real_seq_lens - ] - cache_total_num_blocks_used = sum(cache_num_blocks_used) - num_cache_blocks = cache_config.num_gpu_blocks - cache_total_num_free_blocks = \ - num_cache_blocks - cache_total_num_blocks_used - cache_computed_utilization = \ - cache_total_num_blocks_used / num_cache_blocks - max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) - batch_block_utilization = cache_total_num_blocks_used / ( - batch_size_padded * max_blocks_per_seq) - counters['cache_num_blocks_used'] = cache_total_num_blocks_used - counters['cache_num_free_blocks'] = cache_total_num_free_blocks - counters['cache_computed_utilization'] = cache_computed_utilization - counters[ - f'{phase}_batch_block_utilization'] = batch_block_utilization - if not self.logged_once: - counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks - counters[ - 'const_gpu_memory_utilization'] = \ - cache_config.gpu_memory_utilization - counters['const_block_size'] = cache_config.block_size - self.logged_once = True - return counters - - -def unwrap_model(model): - if isinstance(model, torch._dynamo.eval_frame.OptimizedModule): - return unwrap_model(model._orig_mod) - else: - model = list(vars(model)['_modules'].values())[0] - modules = list(vars(model)['_modules'].values()) - return modules - - -class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): - """ - GPU model runner with sampling step. - """ - _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( - ModelInputForHPUWithSamplingMetadata) - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForHPUWithSamplingMetadata: - return ( - ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - - @torch.inference_mode() - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForHPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - If cuda graph is required, this API automatically pads inputs. 
- """ - with self.profiler.record_event('internal', 'prepare_input_tensors'): - assert seq_group_metadata_list is not None - if self.profiler.enabled: - self.profiler_counter_helper.capture_seq_group_metadata_stats( - seq_group_metadata_list=seq_group_metadata_list) - model_input, sampling_metadata = self.prepare_input_tensors( - seq_group_metadata_list) - assert model_input.attn_metadata is not None - is_prompt = model_input.attn_metadata.is_prompt - - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - def finish_measurements(self): - from neural_compressor.torch.quantization import finalize_calibration - finalize_calibration(self.model.model) - - def _num_blocks(self, attn_metadata): - if attn_metadata.block_list is None: - return 0 - return attn_metadata.block_list.numel() - - def _phase(self, attn_metadata): - phase_type: PhaseType - is_prompt = attn_metadata.is_prompt - is_prefix_prefill = is_prompt and attn_metadata.block_list is not None - if is_prompt and is_prefix_prefill: - phase_type = PhaseType.PREFIX_PREFILL - elif is_prompt and not is_prefix_prefill: - phase_type = PhaseType.PREFILL - elif not is_prompt: - phase_type = PhaseType.DECODE - else: - raise ValueError("Unrecognized pass type, likely due to malformed " - "attention metadata") - return phase_type - - def _check_config(self, batch_size, seq_len, attn_metadata, warmup_mode): - is_prefix_caching = self.vllm_config.cache_config.enable_prefix_caching - cfg: Optional[tuple] = None - assert cfg is None, "Configs changed between 2D and 3D" - if is_prefix_caching: - phase = self._phase(attn_metadata) - num_blocks = self._num_blocks(attn_metadata) - cfg = (batch_size, seq_len, num_blocks, phase) - else: - phase = 'prompt' if attn_metadata.is_prompt else 'decode' - cfg = (batch_size, seq_len, phase) - seen = cfg in self.seen_configs - self.seen_configs.add(cfg) - if not seen and not warmup_mode: - logger.warning("Configuration: %s was not warmed-up!", - (phase.value, batch_size, seq_len, - num_blocks) if is_prefix_caching else - (phase, batch_size, seq_len)) - - def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], - is_prompt: bool): - ''' - This is a helper function to create the mask for lora computations. - Lora Mask is needed to ensure we match the correct lora weights for the - for the request. 
- For Prompt phase we have - lora_mask with shape (batch_size * seq_len, max_loras * max_rank) - lora_logits_mask with shape (batch_size, max_loras * max_rank) - For Decode phase we have both - lora_mask and lora_logits_mask with shape - (batch_size, max_loras * max_rank) - ''' - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - lora_index = 0 - - if self.lora_config: - if is_prompt: - lora_mask = torch.zeros( - input_tokens.shape[0] * input_tokens.shape[1], - (self.lora_config.max_loras) *\ - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - lora_logits_mask = torch.zeros( - input_tokens.shape[0], (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - - ones = torch.ones(input_tokens.shape[1], - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - logit_ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - - for i in range(len(lora_ids)): - if lora_ids[i] == 0: - continue - lora_index = self.lora_manager._adapter_manager.\ - lora_index_to_id.index(lora_ids[i]) - start_row = i * input_tokens.shape[1] - end_row = start_row + input_tokens.shape[1] - start_col = lora_index * self.lora_config.max_lora_rank - end_col = start_col + self.lora_config.max_lora_rank - lora_mask[start_row:end_row, start_col:end_col] = ones - lora_logits_mask[i, start_col:end_col] = logit_ones - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_logits_mask.to('hpu') - else: - lora_mask = torch.zeros(input_tokens.shape[0], - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - for i in range(len(lora_ids)): - if lora_ids[i] == 0: - continue - lora_index = self.lora_manager._adapter_manager.\ - lora_index_to_id.index(lora_ids[i]) - start_pos = lora_index * self.lora_config.max_lora_rank - end_pos = start_pos + self.lora_config.max_lora_rank - lora_mask[i, start_pos:end_pos] = ones - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_mask - - return lora_mask, lora_logits_mask - - def _get_seq_ids(self, model_input): - return ([ - sg.seq_ids[0] for sg in model_input.sampling_metadata.seq_groups - ]) - - def _pad_to_max_num_seqs(self, tensor, value): - padding_needed = self.max_num_seqs - tensor.size(0) - if padding_needed: - padding = torch.full((padding_needed, *tensor.shape[1:]), - value, - device=tensor.device, - dtype=tensor.dtype) - tensor = torch.cat([tensor, padding]) - return tensor - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForHPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - warmup_mode=False, - seqs=None, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - VLLM_DELAYED_SAMPLING = envs.VLLM_HPU_USE_DELAYED_SAMPLING - use_delayed_sampling = VLLM_DELAYED_SAMPLING and not warmup_mode - assert not (use_delayed_sampling and num_steps != 1), \ - 'Delayed sampling is not compatible with MSS!' 
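# A small worked example of the LoRA mask shapes documented in
# create_lora_mask above (config values are hypothetical):
batch_size, seq_len = 2, 4
max_loras, max_lora_rank = 4, 8
# Prompt phase: one row per token, one column block per (lora, rank) slot.
prompt_lora_mask_shape = (batch_size * seq_len, max_loras * max_lora_rank)
prompt_lora_logits_mask_shape = (batch_size, max_loras * max_lora_rank)
# Decode phase: one row per sequence for both masks.
decode_lora_mask_shape = (batch_size, max_loras * max_lora_rank)
assert prompt_lora_mask_shape == (8, 32)
assert prompt_lora_logits_mask_shape == (2, 32)
assert decode_lora_mask_shape == (2, 32)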
- assert model_input.input_tokens is not None - if use_delayed_sampling and not model_input.is_prompt and \ - self.is_driver_worker: - num_cached = len(self.cached_step_outputs) - assert num_cached > 0 - cur_seq_ids = self._get_seq_ids(model_input) - cur_seq_id_pos = { - sid: idx - for idx, sid in enumerate(cur_seq_ids) if sid >= 0 - } - htorch.core.mark_step() - for i in range(num_cached): - prev_seq_ids = self._get_seq_ids(self.cached_step_inputs[i]) - target_indices = [ - cur_seq_id_pos.get(psi, -1) for psi in prev_seq_ids - ] - padding = self.cached_step_outputs[i].size(0) - len( - target_indices) - target_indices.extend([-1] * padding) - target_indices = torch.tensor( - target_indices, - device=model_input.input_tokens.device, - dtype=model_input.input_tokens.dtype) - model_input.input_tokens.index_copy_( - 0, target_indices, self.cached_step_outputs[i]) - htorch.core.mark_step() - - if not model_input.is_first_multi_step: - if not model_input.is_last_step: - # not first or last multi-step - return [] - # last multi-step - output = self._decode_sampler_outputs( - model_input) if self.is_driver_worker else [] - torch.hpu.synchronize() - if model_input.is_first_multi_step: - # first multi-step - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - # Rank!=0 workers has is_prompt==None - if use_delayed_sampling and not model_input.is_prompt and \ - model_input.input_tokens.size(1) == 1: - if self.is_driver_worker: - model_kwargs_broadcast_data = { - "input_tokens": model_input.input_tokens - } - broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) - input_tokens = model_input.input_tokens - - else: - model_kwargs_broadcast_data = broadcast_tensor_dict(src=0) - input_tokens = model_kwargs_broadcast_data["input_tokens"] - else: - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - sampling_metadata = model_input.sampling_metadata - real_batch_size = model_input.real_batch_size - batch_size_padded = model_input.batch_size_padded - assert input_tokens is not None - assert input_positions is not None - assert sampling_metadata is not None - assert attn_metadata is not None - is_prompt = attn_metadata.is_prompt - assert is_prompt is not None - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - self._check_config(batch_size, seq_len, attn_metadata, warmup_mode) - - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - if self.lora_config: - assert model_input.lora_ids is not None - lora_mask, lora_logits_mask = self.create_lora_mask( - input_tokens, model_input.lora_ids, - attn_metadata.is_prompt) - - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors, - "lora_mask": lora_mask, - "virtual_engine": model_input.virtual_engine, - **(model_input.multi_modal_kwargs or {}), - } - if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update( - {"bypass_hpu_graphs": not use_graphs}) - - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") - else: - model_event_name = 
'model_executable' - if num_steps > 1 or use_delayed_sampling: - # in case of multi-step scheduling - # we only want to pythonize in the last step - sampling_metadata.skip_sampler_cpu_output = True - self.model.sampler.include_gpu_probs_tensor = True - cache_orig_output_tokens_len: List[Dict] = [] - - def try_revert_dummy_output_tokens(): - if len(cache_orig_output_tokens_len) > 0: - # Reuse the original output token ids length - for i, seq_group_metadata in enumerate( - seq_group_metadata_list): - for j, data in seq_group_metadata.seq_data.items(): - orig_output_tokens_len = \ - cache_orig_output_tokens_len[i][j] - data.output_token_ids = \ - data.output_token_ids[:orig_output_tokens_len] - - for i in range(num_steps): - if i != 0 and not self.is_driver_worker: - broadcast_data = broadcast_tensor_dict(src=0) - if 'early_exit' in broadcast_data and broadcast_data[ - 'early_exit']: - return [output] if num_steps == 1 else [] - execute_model_kwargs.update({ - "input_ids": - broadcast_data["input_ids"], - "positions": - broadcast_data["positions"], - "attn_metadata": - self.trim_attn_metadata( - broadcast_data["attn_metadata"]) - }) - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata. - selected_token_indices) - - if self.lora_config: - LoraMask.setLoraMask( - lora_logits_mask.index_select( - 0, sampling_metadata.selected_token_indices)) - - # Compute the logits. - with self.profiler.record_event( - 'internal', - ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - if num_steps == 1: - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - # Only perform sampling in the driver worker. 
- if not self.is_driver_worker: - continue - - if use_delayed_sampling: - fake_output = self._delayed_sampler_outputs(model_input) - - with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - if num_steps > 1: - output = output.sampled_token_ids - self.cached_step_outputs.append(output) - if use_delayed_sampling and self.is_driver_worker: - self._patch_prev_output() - output = self._pad_to_max_num_seqs( - output.sampled_token_ids, DUMMY_TOKEN_ID) - self.cached_step_outputs.append(output) - self.cached_step_inputs.append(model_input) - htorch.core.mark_step() - if model_input.async_callback is not None: - model_input.async_callback() - if i < num_steps - 1: - if i == 0: - if model_input.async_callback is not None: - ctx = model_input.async_callback.keywords[ # type: ignore - "ctx"] - seq_group_metadata_list = \ - ctx.seq_group_metadata_list - elif seqs is not None: - seq_group_metadata_list = seqs - else: - raise RuntimeError( - "seq_group_metadata_list is uninitialized") - for i, seq_group_metadata in enumerate( - seq_group_metadata_list): - # Skip empty steps - seq_group_metadata.state.current_step += ( - num_steps - 2) - # Cache the original output token ids - cache_orig_output_tokens_len.append({}) - for j, data in seq_group_metadata.seq_data.items(): - cache_orig_output_tokens_len[i][j] = \ - len(data.output_token_ids) - for seq_group_metadata in seq_group_metadata_list: - for data in seq_group_metadata.seq_data.values(): - max_output_len = sampling_metadata.seq_groups[ - 0].sampling_params.max_tokens - if len(data.output_token_ids) < max_output_len - 1: - # add a place holder for prepare_decode - # arbitrary value, this could be any token - dummy_token = (540, ) - data.output_token_ids += (dummy_token) - else: - broadcast_tensor_dict({'early_exit': True}, - src=0) - if num_steps == 1: - return [output] - else: - try_revert_dummy_output_tokens() - return [] - - result = self._prepare_decode(seq_group_metadata_list, - output=output) - execute_model_kwargs.update({ - "input_ids": - result.input_tokens, - "positions": - result.input_positions, - "attn_metadata": - self.trim_attn_metadata(result.attn_metadata) - }) - model_kwargs_broadcast_data = { - "input_ids": result.input_tokens, - "positions": result.input_positions, - "attn_metadata": vars(result.attn_metadata) - } - broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) - else: - try_revert_dummy_output_tokens() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - self.event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - is_prompt=is_prompt) - self.profiler.record_counter(self.event_start, counters) - if num_steps == 1: - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - if model_input.is_prompt: - output.prefill_hidden_states = hidden_states - output.hidden_states = hidden_states - if use_delayed_sampling: - if self.is_driver_worker: - return [fake_output] - else: - return [] - - return [output] if self.is_driver_worker else [] - else: - return [] - return output if type(output) is 
list else [output] - - def _delayed_sampler_outputs(self, model_input): - next_token_ids = [[DUMMY_TOKEN_ID]] * len( - model_input.sampling_metadata.seq_groups) - sampler_output = self._make_decode_output( - next_token_ids, model_input.sampling_metadata.seq_groups) - return sampler_output - - def _decode_sampler_outputs(self, model_input): - use_async_out_proc = model_input.async_callback is not None - sampler_outputs = [] - num_outputs = len(self.cached_step_outputs) - for i in range(num_outputs): - next_token_ids = self.cached_step_outputs.pop(0) - next_token_ids = next_token_ids.cpu().tolist() - sampler_output = self._make_decode_output( - next_token_ids, model_input.sampling_metadata.seq_groups) - sampler_outputs.append(sampler_output) - - if i < num_outputs - 1 and use_async_out_proc: - assert model_input.async_callback is not None - ctx = model_input.async_callback.keywords[ # type: ignore - "ctx"] - ctx.append_output( - outputs=[sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=False) - model_input.async_callback() - - if use_async_out_proc: - return [sampler_outputs[-1]] - else: - return sampler_outputs - - def _make_decode_output( - self, - next_token_ids: List[List[int]], - seq_groups: List[SequenceGroupToSample], - ) -> SamplerOutput: - zero_logprob = Logprob(0.0) - sampler_outputs = [] - batch_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group.seq_ids - seq_outputs = [] - for seq_id in seq_ids: - next_token_id = next_token_ids[batch_idx][0] - seq_outputs.append( - SequenceOutput(seq_id, next_token_id, - {next_token_id: zero_logprob})) - batch_idx += 1 - sampler_outputs.append( - CompletionSequenceGroupOutput(seq_outputs, None)) - return SamplerOutput(sampler_outputs) - - def shutdown_inc(self): - can_finalize_inc = False - from contextlib import suppress - with suppress(AttributeError): - can_finalize_inc = (self.model_config.quantization == 'inc') and \ - (self.model.model is not None) and \ - self.inc_initialized_successfully and \ - not getattr(self, "_is_inc_finalized", False) - if can_finalize_inc: - from neural_compressor.torch.quantization import ( - finalize_calibration) - finalize_calibration(self.model.model) - self._is_inc_finalized = True - - def __del__(self): - self.shutdown_inc() - - def _patch_prev_output(self): - assert len(self.cached_step_inputs) == len(self.cached_step_outputs), \ - f'''Inputs and outputs are out of sync! - {len(self.cached_step_inputs)} vs {len(self.cached_step_outputs)}''' - if len(self.cached_step_inputs) == 0: - return - model_input = self.cached_step_inputs.pop(0) - delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze( - -1).tolist() - ctx = model_input.async_callback.keywords["ctx"] # type: ignore - # If there's no output to patch with, which is usually the case when - # we're starting a new request after all requests are completed. - if len(ctx.output_queue) == 0: - return - assert len( - ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' - output_data = ctx.output_queue[0] - assert len(output_data.outputs) == 1 - for fake_out, real_out in zip(output_data.outputs[0], delayed_output): - fake_out.samples[0].output_token = real_out - for sg, real_out in zip(output_data.seq_group_metadata_list, - delayed_output): - assert len(sg.seq_data) == 1 - seq_data = list(sg.seq_data.values())[0] - # This is a hack. 
Assigning output_token_ids triggers - # a cache recomputation and we only need to update the last token - seq_data.output_token_ids_array[-1] = real_out - seq_data._cached_all_token_ids[-1] = real_out diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py deleted file mode 100644 index 560110df0a..0000000000 --- a/vllm/worker/hpu_worker.py +++ /dev/null @@ -1,485 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -############################################################################### - -import contextlib -import gc -import os -from typing import List, Optional, Set, Tuple, Type - -import habana_frameworks.torch as htorch # noqa:F401 -import torch -import torch.distributed -from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes - -import vllm.envs as envs -from vllm.config import ParallelConfig, VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest -from vllm.utils import bind_kv_cache -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.hpu_model_runner import HPUModelRunner -from vllm.worker.model_runner_base import ModelRunnerBase -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class HPUWorker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a HPU. - - Each worker is associated with a single HPU. The worker is responsible for - maintaining the KV cache and executing the model on the HPU. In case of - distributed inference, each worker is assigned a partition of the model. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[ModelRunnerBase]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if self.is_driver_worker: - assert self.rank == 0, "The driver worker must have rank 0." - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - self.model_runner: HPUModelRunner = HPUModelRunner( - vllm_config=vllm_config, is_driver_worker=is_driver_worker) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[HPUCacheEngine] - # Initialize gpu_cache as pooling models don't initialize kv_caches - self.hpu_cache: Optional[List[List[torch.Tensor]]] = None - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.HPU, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - - def _set_env_vars(self): - local_rank = self.local_rank - if self.parallel_config.world_size == 1: - local_rank = -1 - import os - os.environ["LOCAL_RANK"] = str(local_rank) - os.environ["ID"] = str(local_rank) - os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size) - os.environ["RANK"] = str(self.rank) - - def init_device(self) -> None: - if self.device_config.device.type == "hpu": - self.device = torch.device("hpu") - torch.hpu.set_device(self.device) - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - if self.model_config.quantization == 'inc': - self._set_env_vars() - init_worker_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
# noqa:E501 - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 - log_graph_compilation_all = os.environ.get( - 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get( - 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', - '0') != '0' or log_graph_compilation_all - log_cpu_fallbacks_all = os.environ.get( - 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' - log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', - '0') != '0' or log_cpu_fallbacks_all - if (log_graph_compilation or log_cpu_fallbacks) and \ - execute_model_req is not None: - from habana_frameworks.torch.hpu.metrics import metric_localcontext - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - is_prompt = any([ - seq_group_metadata.is_prompt - for seq_group_metadata in seq_group_metadata_list - ]) - max_context_len = max([ - max([ - len(v.prompt_token_ids) + len(v.output_token_ids) - for v in seq_group_metadata.seq_data.values() - ]) for seq_group_metadata in seq_group_metadata_list - ]) # whoa, that's some spicy stuff right here - max_num_blocks = ( - (max_context_len - 1) // self.cache_config.block_size) + 1 - input_stats = (f'is_prompt: {is_prompt}, ' - f'num_seqs: {len(seq_group_metadata_list)}, ' - f'max_context_len: {max_context_len}, ' - f'max_num_blocks {max_num_blocks}') - gc_ctx = metric_localcontext( - "graph_compilation" - ) if log_graph_compilation else contextlib.nullcontext() - cpu_fallback_ctx = metric_localcontext( - "cpu_fallback" - ) if log_cpu_fallbacks else contextlib.nullcontext() - with gc_ctx as gc_local_metric, \ - cpu_fallback_ctx as cpu_fallback_local_metric: - output = LocalOrDistributedWorkerBase.execute_model( - self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] - > 0) or log_graph_compilation_all: - msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " - f"{gc_local_metric.stats()}, {input_stats}") - logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] - > 0) or log_cpu_fallbacks_all: - msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " - f"{cpu_fallback_local_metric.stats()}, {input_stats}") - logger.warning(msg) - - return output - - output = LocalOrDistributedWorkerBase.execute_model( - self, execute_model_req) - return output - - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. 
- with HabanaMemoryProfiler() as m: - self.model_runner.profile_run() - torch.hpu.synchronize() - msg = ("Model profiling run " - f"took {m.get_summary_string()}") - logger.info(msg) - # At this point we should've allocated the maximum workspace for all - # recipes we will use the extra memory for graphs/blocks - free_hpu_memory = torch.hpu.mem_get_info()[0] - - cache_block_size = self.get_cache_block_size_bytes() - graph_reserved_mem = (float( - os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1')) - if not self.model_config.enforce_eager else 0) - graph_headroom = 1 - graph_reserved_mem - available_hpu_memory = free_hpu_memory * \ - self.cache_config.gpu_memory_utilization - hpu_memory_margin = free_hpu_memory * ( - 1 - self.cache_config.gpu_memory_utilization) - self.model_runner.mem_margin = hpu_memory_margin - cache_size_bytes = available_hpu_memory * graph_headroom - graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom) - msg = ( - f"Free device memory: {format_bytes(free_hpu_memory)}, " - f"{format_bytes(available_hpu_memory)} usable " - f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization})," - f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs " - f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), " - f"{format_bytes(cache_size_bytes)} reserved for KV cache") - logger.info(msg) - num_hpu_blocks = int(cache_size_bytes // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_hpu_blocks = max(num_hpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - self.model_runner.bucketing_ctx.num_hpu_blocks = num_hpu_blocks - - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() - - gc.collect() - return num_hpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - - This also warms up the model, which may record CUDA graphs. - """ - raise_if_cache_size_invalid( - num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len, - self.parallel_config.pipeline_parallel_size) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - with HabanaMemoryProfiler() as m: - self._init_cache_engine() - torch.hpu.synchronize() - msg = ("Initializing cache engine " - f"took {m.get_summary_string()}") - logger.info(msg) - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - HPUCacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.hpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - bind_kv_cache(self.compilation_config.static_forward_context, - self.hpu_cache) - - def _warm_up_model(self) -> None: - # NOTE(kzawora): We should use virtual engine index here - # for pipeline parallelism. Using 0 for now. - assert self.hpu_cache is not None - self.model_runner.warmup_model(self.hpu_cache[0]) - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. 
- set_random_seed(self.model_config.seed) - - def finish_measurements(self): - self.model_runner.finish_measurements() - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.hpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - ) - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - # Issue cache operations. - if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def shutdown_inc(self): - self.model_runner.shutdown_inc() - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return HPUCacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - -def init_worker_distributed_environment( - parallel_config: ParallelConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - init_distributed_environment(parallel_config.world_size, - rank, - distributed_init_method, - local_rank, - backend=current_platform.dist_backend) - - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - if torch.distributed.is_initialized(): - torch_world_size = torch.distributed.get_world_size() - if torch_world_size != parallel_config.world_size: - raise RuntimeError( - "torch.distributed is already initialized but the torch world " - "size does not match parallel_config.world_size " - f"({torch_world_size} vs. {parallel_config.world_size}).") - elif not distributed_init_method: - raise ValueError( - "distributed_init_method must be set if torch.distributed " - "is not already initialized") - else: - torch.distributed.init_process_group( - backend="hccl", - world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, - ) - - # A small all_reduce for warmup & checking conformance. - dummy_tensor_hpu = torch.ones(1).to('hpu') - torch.distributed.all_reduce(dummy_tensor_hpu) - assert dummy_tensor_hpu.item() == parallel_config.world_size - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, - pipeline_parallel_size) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - -class HPUCacheEngine(CacheEngine): - - def _allocate_kv_cache( - self, - num_blocks: int, - device: str, - ) -> List[Tuple[torch.Tensor, torch.Tensor]]: - """Allocates KV cache on the specified device.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - for _ in range(self.num_attention_layers): - key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - kv_layer = (key_cache, value_cache) - kv_cache.append(kv_layer) - return kv_cache diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py deleted file mode 100644 index f0210c13c7..0000000000 --- a/vllm/worker/multi_step_hpu_worker.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -############################################################################### -# Copyright (C) 2025 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import dataclasses -from typing import Dict, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict -from vllm.sequence import ExecuteModelRequest -from vllm.worker.hpu_model_runner import ModelInputForHPU -from vllm.worker.hpu_worker import HPUWorker -from vllm.worker.worker_base import WorkerInput - - -class MultiStepHPUWorker(HPUWorker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cached_model_input: Optional[ModelInputForHPU] = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[ModelInputForHPU, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - assert execute_model_req.virtual_engine == 0 - - is_first_multi_step = execute_model_req.is_first_multi_step - is_last_step = execute_model_req.is_last_step - - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - worker_input = dataclasses.replace( - worker_input, - num_steps=execute_model_req.num_lookahead_slots + 1) - model_input: ModelInputForHPU = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( - model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - assert self.cached_model_input is not None - model_input = self.cached_model_input - worker_input = WorkerInput() - - model_input = dataclasses.replace( - model_input, - is_first_multi_step=is_first_multi_step, - is_last_step=is_last_step) - - if self.do_metadata_broadcast: - if is_first_multi_step: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update( - model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - else: - broadcast_data = { - "is_first_multi_step": is_first_multi_step, - "is_last_step": is_last_step, - } - broadcast_tensor_dict(broadcast_data, src=0) - - # Returning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[ModelInputForHPU, WorkerInput, Dict[str, - torch.Tensor]]]: - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. 
- broadcast_tensor_dict({}, src=0) - return None - model_input, worker_input, _ = self._get_driver_input_and_broadcast( - execute_model_req) - if model_input.is_first_multi_step: - self.cached_model_input = model_input - return model_input, worker_input, {} - else: - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - if len(broadcast_data) == 2: - assert self.cached_model_input is not None - self.cached_model_input = dataclasses.replace( - self.cached_model_input, - is_first_multi_step=broadcast_data["is_first_multi_step"], - is_last_step=broadcast_data["is_last_step"]) - empty_worker_input = WorkerInput() - return self.cached_model_input, empty_worker_input, {} - - worker_input = WorkerInput.from_broadcasted_tensor_dict( - broadcast_data) - model_input = ( - self.model_runner. - make_model_input_from_broadcasted_tensor_dict(broadcast_data)) - self.cached_model_input = model_input - return model_input, worker_input, {}
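
Note on the removed multi-step worker: the deleted MultiStepHPUWorker above follows a broadcast-and-cache protocol. On the first multi-step the driver prepares and broadcasts the full model input; on later steps it broadcasts only the two step flags (is_first_multi_step, is_last_step) and every rank reuses its cached copy. Below is a minimal, standalone Python sketch of that pattern only, assuming simplified stand-ins: StepInput, FakeBroadcast, and MultiStepDriver are hypothetical names, not vLLM classes, and the code does not reflect the actual vLLM API.

# Standalone sketch (not the vLLM API): the multi-step input-caching pattern
# shown in the deleted MultiStepHPUWorker. On the first multi-step the driver
# prepares and broadcasts the full model input; on later steps it broadcasts
# only the two step flags and every rank reuses its cached copy.
from dataclasses import dataclass, replace
from typing import Optional


@dataclass(frozen=True)
class StepInput:  # hypothetical stand-in for ModelInputForHPU
    token_ids: tuple
    is_first_multi_step: bool = True
    is_last_step: bool = False


class FakeBroadcast:  # hypothetical stand-in for broadcast_tensor_dict
    def __init__(self) -> None:
        self.last_payload: Optional[dict] = None

    def send(self, payload: dict) -> None:
        self.last_payload = payload


class MultiStepDriver:
    """Caches the first-step input and reuses it on subsequent steps."""

    def __init__(self, channel: FakeBroadcast) -> None:
        self.channel = channel
        self.cached_input: Optional[StepInput] = None

    def prepare(self, tokens: tuple, first: bool, last: bool) -> StepInput:
        if first:
            # First multi-step: build the full input and broadcast everything.
            model_input = StepInput(token_ids=tokens,
                                    is_first_multi_step=True,
                                    is_last_step=last)
            self.cached_input = model_input
            self.channel.send({"token_ids": tokens,
                               "is_first_multi_step": True,
                               "is_last_step": last})
        else:
            # Later steps: reuse the cached input, broadcast only the flags.
            assert self.cached_input is not None
            model_input = replace(self.cached_input,
                                  is_first_multi_step=False,
                                  is_last_step=last)
            self.channel.send({"is_first_multi_step": False,
                               "is_last_step": last})
        return model_input


if __name__ == "__main__":
    channel = FakeBroadcast()
    driver = MultiStepDriver(channel)
    driver.prepare(tokens=(1, 2, 3), first=True, last=False)
    step2 = driver.prepare(tokens=(), first=False, last=True)
    # Only the two flags travel on the second step; tokens come from the cache.
    assert channel.last_payload == {"is_first_multi_step": False,
                                    "is_last_step": True}
    assert step2.token_ids == (1, 2, 3) and step2.is_last_step

The two-key payload in this sketch plays the same role as the len(broadcast_data) == 2 branch in the deleted prepare_input: it tells non-driver ranks to keep their cached model input and only update the step flags.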