[V0 deprecation] Remove V0 HPU backend (#21131)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -1,21 +0,0 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements/hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,12 +0,0 @@
# Common dependencies
-r common.txt

# Dependencies for HPU code
ray
triton==3.1.0
pandas
numpy==1.26.4
tabulate
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
setup.py
@@ -410,29 +410,6 @@ class repackage_wheel(build_ext):
                package_data[package_name].append(file_name)


def _is_hpu() -> bool:
    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
        return VLLM_TARGET_DEVICE == "hpu"

    # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
    # and if it doesn't, check if habanalabs driver is loaded
    is_hpu_available = False
    try:
        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
        is_hpu_available = out.returncode == 0
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        if sys.platform.startswith("linux"):
            try:
                output = subprocess.check_output(
                    'lsmod | grep habanalabs | wc -l', shell=True)
                is_hpu_available = int(output) > 0
            except (ValueError, FileNotFoundError, PermissionError,
                    subprocess.CalledProcessError):
                pass
    return is_hpu_available


def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"

@@ -440,7 +417,7 @@ def _no_device() -> bool:
def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
            and not (_is_neuron() or _is_tpu() or _is_hpu()))
            and not (_is_neuron() or _is_tpu()))


def _is_hip() -> bool:
@@ -573,12 +550,6 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
    elif _is_hpu():
        # Get the Intel Gaudi Software Suite version
        gaudi_sw_version = str(get_gaudi_sw_version())
        if gaudi_sw_version != MAIN_CUDA_VERSION:
            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@@ -625,8 +596,6 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("neuron.txt")
    elif _is_hpu():
        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@@ -635,8 +604,7 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
            "or CPU.")
            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
    return requirements
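Not part of the diff: a minimal, standalone sketch of the autodetection pattern the removed `_is_hpu()` used (probe a vendor CLI first, then fall back to checking for a kernel module on Linux). The helper name and arguments are illustrative, not vLLM API.

```python
import subprocess
import sys


def _device_tool_present(cli: str, module: str) -> bool:
    """True if `cli` runs successfully, else check whether `module` is loaded via lsmod."""
    try:
        # Preferred probe: the vendor management CLI (e.g. "hl-smi").
        return subprocess.run([cli], capture_output=True, check=True).returncode == 0
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        if sys.platform.startswith("linux"):
            try:
                # Fallback probe: is the kernel driver module loaded?
                out = subprocess.check_output(f"lsmod | grep {module} | wc -l", shell=True)
                return int(out) > 0
            except (ValueError, FileNotFoundError, PermissionError,
                    subprocess.CalledProcessError):
                pass
        return False
```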
@@ -13,8 +13,7 @@ from vllm.scalar_type import ScalarType

logger = init_logger(__name__)

if not current_platform.is_tpu() and not current_platform.is_hpu()\
        and not current_platform.is_xpu():
if not current_platform.is_tpu() and not current_platform.is_xpu():
    try:
        import vllm._C
    except ImportError as e:
@@ -1,319 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
###############################################################################
|
||||
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
|
||||
###############################################################################
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import vllm_hpu_extension.kernels as kernels
|
||||
import vllm_hpu_extension.ops as ops
|
||||
from vllm_hpu_extension.flags import enabled_flags
|
||||
from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
is_quantized_kv_cache)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
|
||||
HPUPagedAttentionMetadata)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HPUAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "HPU_ATTN"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["HPUAttentionImpl"]:
|
||||
return HPUAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> Type["AttentionMetadata"]:
|
||||
return HPUAttentionMetadata
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
|
||||
num_kv_heads, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_kv_cache: torch.Tensor,
|
||||
dst_kv_cache: torch.Tensor,
|
||||
src_to_dsts: torch.Tensor,
|
||||
) -> None:
|
||||
HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[torch.Tensor],
|
||||
src_to_dsts: torch.Tensor,
|
||||
) -> None:
|
||||
HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)
|
||||
|
||||
|
||||
@dataclass
|
||||
class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
|
||||
"""Metadata for HPUAttentionbackend."""
|
||||
# Currently, input sequences can only contain all prompts
|
||||
# or all decoding. True if all sequences are prompts.
|
||||
is_prompt: bool
|
||||
attn_bias: Optional[torch.Tensor]
|
||||
seq_lens_tensor: Optional[torch.Tensor]
|
||||
context_lens_tensor: Optional[torch.Tensor]
|
||||
|
||||
|
||||
class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
|
||||
"""
|
||||
If the input tensors contain prompt tokens, the layout is as follows:
|
||||
|<--------------- num_prefill_tokens ----------------->|
|
||||
|<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
|
||||
|
||||
Otherwise, the layout is as follows:
|
||||
|<----------------- num_decode_tokens ------------------>|
|
||||
|<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
|
||||
|
||||
Generation tokens can contain padding when cuda-graph is used.
|
||||
Currently, prompt tokens don't contain any padding.
|
||||
|
||||
The prompts might have different lengths, while the generation tokens
|
||||
always have length 1.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
max_seq_len: int = 4096,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super(AttentionImpl, self).__init__()
|
||||
if kv_sharing_target_layer_name is not None:
|
||||
raise NotImplementedError("KV sharing is not supported in V0 "
|
||||
"HPU_ATTN backend.")
|
||||
if use_irope:
|
||||
logger.warning_once(
|
||||
"Using irope in HPU is not supported yet, it will fall back "
|
||||
"to global attention for long context.")
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
self.matmul_qk = Matmul()
|
||||
self.softmax = Softmax()
|
||||
self.matmul_av = Matmul()
|
||||
self.batch2block_matmul = Matmul()
|
||||
self.block2batch_matmul = Matmul()
|
||||
self.k_cache = VLLMKVCache()
|
||||
self.v_cache = VLLMKVCache()
|
||||
self.fused_scaled_dot_product_attention = kernels.fsdpa()
|
||||
|
||||
self.prefill_impl = 'naive'
|
||||
if "flex_attention" in enabled_flags():
|
||||
self.prefill_impl = 'flex'
|
||||
if "fsdpa" in enabled_flags():
|
||||
assert alibi_slopes is None, \
|
||||
'Prefill with FusedSDPA not supported with alibi slopes!'
|
||||
self.prefill_impl = 'fsdpa'
|
||||
|
||||
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
|
||||
self.sliding_window = sliding_window
|
||||
self.alibi_slopes = alibi_slopes
|
||||
if alibi_slopes is not None:
|
||||
alibi_slopes_tensor = torch.tensor(alibi_slopes,
|
||||
dtype=torch.bfloat16)
|
||||
self.alibi_slopes = alibi_slopes_tensor
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
|
||||
if self.prefill_impl == 'fsdpa':
|
||||
assert alibi_slopes is None, \
|
||||
'Prefill with FusedSDPA not supported with alibi slopes!'
|
||||
|
||||
supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
|
||||
if head_size not in supported_head_sizes:
|
||||
raise ValueError(
|
||||
f"Head size {head_size} is not supported by PagedAttention. "
|
||||
f"Supported head sizes are: {supported_head_sizes}.")
|
||||
|
||||
self.attn_type = attn_type
|
||||
if self.attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"HPUAttentionImpl")
|
||||
|
||||
if is_quantized_kv_cache(self.kv_cache_dtype):
|
||||
raise NotImplementedError(
|
||||
"HPUAttention with FP8 KV cache not yet supported")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
layer: AttentionLayer,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: HPUAttentionMetadata,
|
||||
output: Optional[torch.Tensor] = None,
|
||||
output_scale: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Forward pass with xFormers and PagedAttention.
|
||||
|
||||
Args:
|
||||
query: shape = [num_tokens, num_heads * head_size]
|
||||
key: shape = [num_tokens, num_kv_heads * head_size]
|
||||
value: shape = [num_tokens, num_kv_heads * head_size]
|
||||
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
|
||||
attn_metadata: Metadata for attention.
|
||||
Returns:
|
||||
shape = [num_tokens, num_heads * head_size]
|
||||
"""
|
||||
if output_scale is not None:
|
||||
raise NotImplementedError(
|
||||
"fused output quantization is not yet supported"
|
||||
" for HPUAttentionImpl")
|
||||
|
||||
batch_size, seq_len, hidden_size = query.shape
|
||||
_, seq_len_kv, _ = key.shape
|
||||
|
||||
key = key.view(-1, self.num_kv_heads, self.head_size)
|
||||
value = value.view(-1, self.num_kv_heads, self.head_size)
|
||||
block_indices = attn_metadata.block_indices
|
||||
block_offsets = attn_metadata.block_offsets
|
||||
key_cache = None
|
||||
value_cache = None
|
||||
if attn_metadata.is_prompt and self.attn_type \
|
||||
is not AttentionType.ENCODER_ONLY:
|
||||
key = key.unflatten(0, (block_indices.size(0), -1))
|
||||
value = value.unflatten(0, (block_indices.size(0), -1))
|
||||
if kv_cache is not None and isinstance(kv_cache, tuple):
|
||||
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
|
||||
kv_cache, self.num_kv_heads, self.head_size)
|
||||
|
||||
# Reshape the input keys and values and store them in the cache.
|
||||
# If kv_cache is not provided, the new key and value tensors are
|
||||
# not cached. This happens during the initial memory profiling run.
|
||||
key_cache = self.k_cache(key, key_cache, block_indices,
|
||||
block_offsets)
|
||||
value_cache = self.v_cache(value, value_cache, block_indices,
|
||||
block_offsets)
|
||||
|
||||
if attn_metadata.is_prompt:
|
||||
# Prompt run.
|
||||
query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
|
||||
kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
|
||||
self.head_size)
|
||||
|
||||
attn_bias = attn_metadata.attn_bias
|
||||
if attn_bias is not None and self.alibi_slopes is not None:
|
||||
position_bias = _make_alibi_bias(self.alibi_slopes,
|
||||
self.num_kv_heads,
|
||||
attn_bias.dtype,
|
||||
attn_bias.shape[-1])
|
||||
attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
|
||||
attn_bias.add_(position_bias)
|
||||
|
||||
block_list = attn_metadata.block_list if attn_metadata \
|
||||
and attn_metadata.block_list is not None else None
|
||||
|
||||
out = ops.prompt_attention(
|
||||
impl=self.prefill_impl,
|
||||
query=query.view(query_shape),
|
||||
key=key.view(kv_shape),
|
||||
value=value.view(kv_shape),
|
||||
is_causal=True,
|
||||
attn_bias=attn_bias,
|
||||
valid_seq_lengths=attn_metadata.seq_lens_tensor,
|
||||
**self.common_attention_args(block_list, key_cache,
|
||||
value_cache))
|
||||
output = out.reshape(batch_size, seq_len, hidden_size)
|
||||
else:
|
||||
# Decoding run.
|
||||
output = HPUPagedAttention.forward_decode(
|
||||
query=query,
|
||||
block_mapping=attn_metadata.block_mapping,
|
||||
block_bias=attn_metadata.attn_bias,
|
||||
block_groups=attn_metadata.block_groups,
|
||||
**self.common_attention_args(attn_metadata.block_list,
|
||||
key_cache, value_cache))
|
||||
# Reshape the output tensor.
|
||||
return output.view(batch_size, seq_len, hidden_size)
|
||||
|
||||
def common_attention_args(self,
|
||||
block_list=None,
|
||||
key_cache=None,
|
||||
value_cache=None):
|
||||
fsdpa_op = self.fused_scaled_dot_product_attention.apply \
|
||||
if self.fused_scaled_dot_product_attention is not None else None
|
||||
return {
|
||||
'scale': self.scale,
|
||||
'matmul_qk_op': self.matmul_qk,
|
||||
'matmul_av_op': self.matmul_av,
|
||||
'batch2block_matmul_op': self.batch2block_matmul,
|
||||
'block2batch_matmul_op': self.block2batch_matmul,
|
||||
'fsdpa_op': fsdpa_op,
|
||||
'keys_fetch_func': self.k_cache.fetch_from_cache,
|
||||
'values_fetch_func': self.v_cache.fetch_from_cache,
|
||||
'softmax_op': self.softmax,
|
||||
'block_list': block_list,
|
||||
'key_cache': key_cache,
|
||||
'value_cache': value_cache,
|
||||
}
|
||||
|
||||
|
||||
def _make_alibi_bias(
|
||||
alibi_slopes: torch.Tensor,
|
||||
num_kv_heads: int,
|
||||
dtype: torch.dtype,
|
||||
seq_len: int,
|
||||
) -> torch.Tensor:
|
||||
bias = torch.arange(seq_len, dtype=dtype)
|
||||
# NOTE(zhuohan): HF uses
|
||||
# `bias = bias[None, :].repeat(seq_len, 1)`
|
||||
# here. We find that both biases give the same results, but
|
||||
# the bias below more accurately follows the original ALiBi
|
||||
# paper.
|
||||
# Calculate a matrix where each element represents ith element- jth
|
||||
# element.
|
||||
bias = bias[None, :] - bias[:, None]
|
||||
|
||||
padded_len = (seq_len + 7) // 8 * 8
|
||||
num_heads = alibi_slopes.shape[0]
|
||||
bias = torch.empty(
|
||||
1, # batch size
|
||||
num_heads,
|
||||
seq_len,
|
||||
padded_len,
|
||||
device=alibi_slopes.device,
|
||||
dtype=dtype,
|
||||
)[:, :, :, :seq_len].copy_(bias)
|
||||
bias.mul_(alibi_slopes[:, None, None])
|
||||
if num_heads != num_kv_heads:
|
||||
bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
|
||||
return bias
|
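For reference, a self-contained sketch of the ALiBi bias construction that `_make_alibi_bias` in the removed backend performs, simplified: no padding to a multiple of 8, no batch dimension, and the slope values below are placeholders.

```python
import torch


def alibi_bias_sketch(alibi_slopes: torch.Tensor, seq_len: int,
                      dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Build a (num_heads, seq_len, seq_len) ALiBi bias: slope * (j - i)."""
    pos = torch.arange(seq_len, dtype=dtype)
    # Relative-distance matrix: element (i, j) is j - i.
    rel = pos[None, :] - pos[:, None]
    # Scale each head's distance matrix by its slope.
    return rel[None, :, :] * alibi_slopes[:, None, None]


# Example: 4 heads with geometric slopes, 8-token sequence.
bias = alibi_bias_sketch(torch.tensor([0.5, 0.25, 0.125, 0.0625]), 8)
```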
@@ -1,88 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

###############################################################################
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################

from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
from vllm_hpu_extension import cache_ops, ops

# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512


@dataclass
class HPUPagedAttentionMetadata:
    """Metadata for PagedAttention."""
    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
    block_indices: Optional[torch.Tensor]
    block_offsets: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]


class HPUPagedAttention:

    @staticmethod
    def get_supported_head_sizes() -> List[int]:
        return [64, 80, 96, 112, 128, 256]

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        return (num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def split_kv_cache(
        kv_cache: torch.Tensor,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        key_cache = kv_cache[0]
        value_cache = kv_cache[1]
        return key_cache, value_cache

    @staticmethod
    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
                             key_cache: torch.Tensor,
                             value_cache: torch.Tensor,
                             slot_mapping: torch.Tensor, kv_cache_dtype: str,
                             is_prompt: bool) -> None:
        cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
                                    slot_mapping, kv_cache_dtype, is_prompt)

    @staticmethod
    def forward_decode(**kwargs) -> torch.Tensor:
        return ops.flat_pa(**kwargs)

    @staticmethod
    def swap_blocks(
        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
        src_to_dsts: torch.Tensor,
    ) -> None:
        src_key_cache = src_kv_cache[0]
        dst_key_cache = dst_kv_cache[0]
        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)

        src_value_cache = src_kv_cache[1]
        dst_value_cache = dst_kv_cache[1]
        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)

    @staticmethod
    def copy_blocks(
        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
        src_to_dsts: torch.Tensor,
    ) -> None:
        key_caches = [kv_cache[0] for kv_cache in kv_caches]
        value_caches = [kv_cache[1] for kv_cache in kv_caches]
        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
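A short sketch of the cache layout the removed `HPUPagedAttention` helpers assume: key and value caches stacked along dim 0, each shaped `(num_blocks, block_size, num_kv_heads, head_size)`. The concrete sizes below are arbitrary examples.

```python
import torch

num_blocks, block_size, num_kv_heads, head_size = 16, 128, 8, 128
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)

# split_kv_cache simply indexes the stacked tensor.
key_cache, value_cache = kv_cache[0], kv_cache[1]
assert key_cache.shape == (num_blocks, block_size, num_kv_heads, head_size)
```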
@@ -2452,7 +2452,7 @@ class SchedulerConfig:
        return self.num_scheduler_steps > 1


Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"]
Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]


@config
@@ -7,7 +7,6 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
                                        DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
from vllm.platforms import current_platform
from vllm.utils import Device


@@ -56,8 +55,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
        - The block IDs are assigned contiguously, with GPU block IDs coming
          before CPU block IDs.
        """
        # For HPU, block id 0 is used only for padding
        reserved_blocks = 1 if current_platform.is_hpu() else 0
        reserved_blocks = 0
        block_ids = list(
            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks
@@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch.distributed as dist

from vllm.platforms import current_platform

from .base_device_communicator import DeviceCommunicatorBase

if current_platform.is_hpu():
    import habana_frameworks.torch as htorch  # noqa: F401


class HpuCommunicator(DeviceCommunicatorBase):

    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
        # (which is required for tensor parallel HPUGraph inference)
        htorch.core.mark_step()
        dist.all_reduce(input_, group=self.device_group)
        return input_

    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
        world_size = self.world_size
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
        input_size = input_.size()
        # Allocate output tensor.
        output_tensor = torch.empty((world_size, ) + input_size,
                                    dtype=input_.dtype,
                                    device=input_.device)
        # All-gather.
        htorch.core.mark_step()
        dist.all_gather_into_tensor(output_tensor,
                                    input_,
                                    group=self.device_group)
        # Reshape
        output_tensor = output_tensor.movedim(0, dim)
        output_tensor = output_tensor.reshape(input_size[:dim] +
                                              (world_size *
                                               input_size[dim], ) +
                                              input_size[dim + 1:])
        return output_tensor
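The reshape step in the removed `all_gather` can be reproduced with plain tensors; a sketch that uses `torch.stack` to stand in for `dist.all_gather_into_tensor` and folds the rank axis into an arbitrary `dim`. The helper name is illustrative.

```python
import torch


def concat_gathered(shards: list[torch.Tensor], dim: int) -> torch.Tensor:
    """Emulate all-gather + reshape: stack world_size shards, then fold dim 0 into `dim`."""
    world_size = len(shards)
    input_size = shards[0].size()
    if dim < 0:
        dim += shards[0].dim()
    out = torch.stack(shards, dim=0)   # (world_size, *input_size)
    out = out.movedim(0, dim)          # move the rank axis next to `dim`
    return out.reshape(input_size[:dim] + (world_size * input_size[dim],) +
                       input_size[dim + 1:])


x = torch.randn(2, 3)
assert concat_gathered([x, x], dim=-1).shape == (2, 6)
```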
@@ -1365,9 +1365,8 @@ class EngineArgs:
            supported = False
            if current_platform.is_rocm() or (
                    current_platform.is_cuda()
                    and current_platform.is_device_capability(100)) or (
                        current_platform.device_name
                        == "hpu"):  # handle hpu also for OOT platform
                    and current_platform.is_device_capability(100)
            ):  # handle hpu also for OOT platform
                supported = True
            elif fp8_attention and will_use_fa:
                from vllm.attention.utils.fa_utils import (
vllm/envs.py
@@ -106,8 +106,6 @@ if TYPE_CHECKING:
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: Optional[str] = None
    VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
    VLLM_HPU_USE_DELAYED_SAMPLING: bool = False
    VLLM_DP_RANK: int = 0
    VLLM_DP_RANK_LOCAL: int = -1
    VLLM_DP_SIZE: int = 1
@@ -780,19 +778,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_CUDART_SO_PATH":
    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),

    # Contiguous cache fetching to avoid using costly gather operation on
    # Gaudi3. This is only applicable to HPU contiguous cache. If set to true,
    # contiguous cache fetch will be used.
    "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH":
    lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
    ("1", "true"),

    # Use delayed sampling for HPU to reduce host cpu overhead
    # between each step.
    "VLLM_HPU_USE_DELAYED_SAMPLING":
    lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in
    ("1", "true"),

    # Rank of the process in the data parallel setting
    "VLLM_DP_RANK":
    lambda: int(os.getenv("VLLM_DP_RANK", "0")),
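The two removed flags were parsed with the usual truthy-string convention; a one-liner helper showing that pattern (the helper name is illustrative, not vLLM API):

```python
import os


def env_flag(name: str, default: str = "false") -> bool:
    """Interpret '1'/'true' (case-insensitive) as True, anything else as False."""
    return os.environ.get(name, default).lower() in ("1", "true")


contiguous_pa = env_flag("VLLM_CONTIGUOUS_PA", "true")
delayed_sampling = env_flag("VLLM_DELAYED_SAMPLING", "false")
```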
@@ -1164,10 +1164,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
                    posinf=pos_inf,
                    neginf=neg_inf))

        # HPU needs special handling to prune out dummy samples.
        if current_platform.is_hpu():
            lora_logits = lora_logits[:logits.shape[0], :]

        logits[:,
               self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
               lora_logits.shape[1]] = lora_logits
@@ -1,145 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import TYPE_CHECKING, Optional, Union, final

import torch
from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
                                    dispatch_bgmv_linear)

from .punica_base import PunicaWrapperBase
from .utils import convert_mapping

if TYPE_CHECKING:
    # avoid circuit import
    from vllm.lora.layers import LoRAMapping
    from vllm.lora.models import LongContextLoRAContext


@final
class PunicaWrapperHPU(PunicaWrapperBase):

    def __init__(self, max_num_batched_tokens: int, max_batches: int,
                 device: Union[torch.device, str], **kwargs):
        # Increasing max_num_batched_tokens by 3x to handle increase in
        # tensor size due to padding.
        PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
                                   max_batches, device)

    def _update_base_metadata(
        self,
        mapping: "LoRAMapping",
        lora_index_to_id: list[Optional[int]],
        max_loras: int,
        vocab_size: int,
        extra_vocab_size: int,
        long_lora_context: Optional["LongContextLoRAContext"] = None,
    ):
        (
            base_indices,
            sampler_indices,
            sampler_indices_padded,
            embeddings_indices,
            long_lora_offsets_tensor,
            indices_len,
        ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
                            extra_vocab_size, self.device, None)
        # Updating each element in `long_lora_offsets` with `lora_offset` slows
        # down perf in HPU due to a series of `strided_insert` ops during lazy
        # graph accumulation. Hence HPU appends `lora_offset` to a list and
        # converts it to a tensor only after it is ready.
        if long_lora_context:
            index_mapping_indices: list[int] = list(
                mapping.index_mapping).copy()
            long_lora_offsets: list[int] = []
            for i in range(len(index_mapping_indices)):
                lora_offset: int = long_lora_context.offsets_by_lora_id.get(
                    index_mapping_indices[i], 0)
                long_lora_offsets.append(lora_offset)
            long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
                                                    device=self.device,
                                                    dtype=torch.long)
            indices_len[-1] = long_lora_offsets_tensor.shape[-1]

        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
            sampler_indices_padded)
        self._embeddings_indices[:embeddings_indices.
                                 shape[0], :embeddings_indices.shape[1]].copy_(
                                     embeddings_indices)
        if long_lora_offsets_tensor is not None:
            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
                long_lora_offsets_tensor)
        else:
            self._long_lora_indices.zero_()
        self.indices_len[:] = indices_len

    def add_lora_embedding(self,
                           y: torch.Tensor,
                           x: torch.Tensor,
                           lora_b_stacked: torch.Tensor,
                           add_inputs: bool = True,
                           **kwargs) -> None:
        dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)

    def add_lora_linear(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: tuple[torch.Tensor, ...],
                        lora_b_stacked: tuple[torch.Tensor, ...],
                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
                        scale: float,
                        output_slices: tuple[int, ...],
                        *,
                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
                        **kwargs) -> None:
        y_org = y
        x = x.view(-1, x.shape[-1])
        y = y.view(-1, y.shape[-1])
        offset_left = 0

        for slice_idx in range(len(output_slices)):
            dispatch_bgmv_linear(
                y[:, offset_left:offset_left + output_slices[slice_idx]], x,
                lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
            offset_left += output_slices[slice_idx]
        y = y.view_as(y_org)

    def add_lora_logits(self,
                        y: torch.Tensor,
                        x: torch.Tensor,
                        lora_a_stacked: torch.Tensor,
                        lora_b_stacked: torch.Tensor,
                        scale,
                        *,
                        buffer: Optional[torch.Tensor] = None,
                        **kwargs) -> None:
        y_org = y
        y = y.view(-1, y.shape[-1])
        x = x.view(-1, x.shape[-1])
        dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
        y = y.view_as(y_org)

    def add_shrink(
        self,
        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ) -> None:
        raise NotImplementedError

    def add_expand(
        self,
        y: torch.Tensor,
        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
        lora_b_stacked: tuple[torch.Tensor, ...],
        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs=True,
        **kwargs,
    ) -> None:
        raise NotImplementedError
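The slice bookkeeping in the removed `add_lora_linear` is independent of the HPU kernels; a sketch with a plain matmul standing in for `dispatch_bgmv_linear` (function name and shapes below are illustrative):

```python
import torch


def add_lora_linear_sketch(y: torch.Tensor, x: torch.Tensor,
                           lora_a: list[torch.Tensor], lora_b: list[torch.Tensor],
                           scale: float, output_slices: tuple[int, ...]) -> None:
    """Accumulate scale * (x @ A_i @ B_i) into consecutive column slices of y."""
    offset = 0
    for i, width in enumerate(output_slices):
        update = (x @ lora_a[i] @ lora_b[i]) * scale   # (tokens, width)
        y[:, offset:offset + width] += update
        offset += width


x = torch.randn(4, 16)
y = torch.zeros(4, 8)
add_lora_linear_sketch(y, x, [torch.randn(16, 2)] * 2, [torch.randn(2, 4)] * 2,
                       scale=1.0, output_slices=(4, 4))
```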
@@ -73,11 +73,6 @@ class CustomOp(nn.Module):
        # NOTE(woosuk): This is a placeholder for future extensions.
        return self.forward_native(*args, **kwargs)

    def forward_hpu(self, *args, **kwargs):
        # By default, we assume that Gaudi ops are compatible with the
        # PyTorch-native implementation.
        return self.forward_native(*args, **kwargs)

    def forward_neuron(self, *args, **kwargs):
        # By default, we assume that Neuron ops are compatible with the
        # PyTorch-native implementation.
@@ -106,8 +101,6 @@ class CustomOp(nn.Module):
            return self.forward_hip
        elif current_platform.is_cpu():
            return self.forward_cpu
        elif current_platform.is_hpu():
            return self.forward_hpu
        elif current_platform.is_tpu():
            return self.forward_tpu
        elif current_platform.is_xpu():
@@ -475,39 +475,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            activation,
        )

    def forward_hpu(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        use_grouped_topk: bool,
        top_k: int,
        router_logits: torch.Tensor,
        renormalize: bool,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
    ) -> torch.Tensor:
        assert not use_grouped_topk
        assert num_expert_group is None
        assert topk_group is None
        assert custom_routing_function is None
        assert layer is not None
        assert apply_router_weight_on_input is False
        if scoring_func != "softmax":
            raise NotImplementedError(
                "Only softmax scoring function is supported for HPU.")
        if e_score_correction_bias is not None:
            raise NotImplementedError(
                "Expert score correction bias is not supported for HPU.")
        return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
                                   router_logits, top_k)

    def forward_tpu(
        self,
        layer: torch.nn.Module,
@@ -716,9 +683,6 @@ class FusedMoE(torch.nn.Module):
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
        if current_platform.is_hpu():
            from vllm_hpu_extension.ops import DynamicFusedMOE
            self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)

        if vllm_config.model_config is not None:
            model_dtype = vllm_config.model_config.dtype
@@ -170,26 +170,6 @@ class RMSNorm(CustomOp):
        else:
            return norm_func(x, self.weight.data, self.variance_epsilon)

    def forward_hpu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        from vllm_hpu_extension.kernels import rms_norm
        HPUFusedRMSNorm = rms_norm()
        if HPUFusedRMSNorm is None:
            return self.forward_native(x, residual)
        if residual is not None:
            orig_shape = x.shape
            residual += x.view(residual.shape)
            # Note: HPUFusedRMSNorm requires 3D tensors as inputs
            x = HPUFusedRMSNorm.apply(residual, self.weight,
                                      self.variance_epsilon)
            return x.view(orig_shape), residual

        x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon)
        return x

    def forward_xpu(
        self,
        x: torch.Tensor,
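When the fused kernel is unavailable, the removed `forward_hpu` falls back to `forward_native`; a minimal, simplified reference RMSNorm matching that native formula (no dtype upcasting, helper name illustrative):

```python
import torch


def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """y = x / sqrt(mean(x^2) + eps) * weight, computed over the last dimension."""
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight


out = rms_norm_ref(torch.randn(2, 4, 64), torch.ones(64))
```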
@@ -229,64 +229,6 @@ class RotaryEmbedding(CustomOp):
                self.cos_sin_cache, self.is_neox_style)
        return query, key

    def forward_hpu(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        from habana_frameworks.torch.hpex.kernels import (
            RotaryPosEmbeddingMode, apply_rotary_pos_emb)
        if offsets is not None:
            offsets = offsets.view(positions.shape[0], -1)
            positions = positions + offsets
        positions = positions.flatten()
        num_tokens = positions.shape[0]
        cos_sin = self.cos_sin_cache.index_select(0, positions).view(
            num_tokens, 1, -1)
        cos, sin = cos_sin.chunk(2, dim=-1)
        # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
        # to query hidden dimension, so the original tensors need to be
        # expanded
        # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE
        # and expansion of cos/sin tensors via concatenation
        # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE
        # and expansion of cos/sin tensors via repeat_interleave
        rope_mode: RotaryPosEmbeddingMode
        if self.is_neox_style:
            rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
            cos = torch.cat((cos, cos), dim=-1)
            sin = torch.cat((sin, sin), dim=-1)
        else:
            rope_mode = RotaryPosEmbeddingMode.PAIRWISE
            sin = torch.repeat_interleave(sin,
                                          2,
                                          dim=-1,
                                          output_size=cos_sin.shape[-1])
            cos = torch.repeat_interleave(cos,
                                          2,
                                          dim=-1,
                                          output_size=cos_sin.shape[-1])

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., :self.rotary_dim]
        query_pass = query[..., self.rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0,
                                         rope_mode)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        if key is not None:
            key_shape = key.shape
            key = key.view(num_tokens, -1, self.head_size)
            key_rot = key[..., :self.rotary_dim]
            key_pass = key[..., self.rotary_dim:]
            key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0,
                                           rope_mode)
            key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def forward_neuron(
        self,
        positions: torch.Tensor,
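The cos/sin expansion commented on in the removed `forward_hpu` differs between the two RoPE layouts; a small sketch of just that expansion step (not the kernel call), with arbitrary shapes:

```python
import torch

cos_sin = torch.randn(10, 1, 64)          # (num_tokens, 1, rotary_dim)
cos, sin = cos_sin.chunk(2, dim=-1)       # each (num_tokens, 1, rotary_dim // 2)

# GPT-NeoX ("blockwise") layout: duplicate by concatenation.
cos_neox = torch.cat((cos, cos), dim=-1)
sin_neox = torch.cat((sin, sin), dim=-1)

# GPT-J ("pairwise") layout: duplicate by interleaving.
cos_gptj = torch.repeat_interleave(cos, 2, dim=-1)
sin_gptj = torch.repeat_interleave(sin, 2, dim=-1)

assert cos_neox.shape == cos_gptj.shape == cos_sin.shape
```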
@@ -388,20 +388,8 @@ class VocabParallelEmbedding(torch.nn.Module):

        # Copy the data. Select chunk corresponding to current shard.
        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)

        if current_platform.is_hpu():
            # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here,
            # so we're using a workaround. Remove this when fixed in
            # HPU PT bridge.
            padded_weight = torch.cat([
                loaded_weight,
                torch.zeros(param.shape[0] - loaded_weight.shape[0],
                            *loaded_weight.shape[1:])
            ])
            param.data.copy_(padded_weight)
        else:
            param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
            param[loaded_weight.shape[0]:].data.fill_(0)
        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
        param[loaded_weight.shape[0]:].data.fill_(0)

    def forward(self, input_):
        if self.tp_size > 1:
@@ -199,10 +199,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):

        if self.pre_quant:
            if self.load_8bit:
                if current_platform.is_hpu():
                    raise ValueError(
                        "currently hpu supports 4bit quantization only")

                return self._quantized_8bit_generator(
                    hf_weights_files, use_safetensors,
                    quant_state_dict), quant_state_dict
@@ -306,10 +302,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                        in temp_state_dict):
                    quant_state = _parse_quant_state(mapped_weight_name,
                                                     temp_state_dict)
                    if current_platform.is_hpu():
                        assert quant_state.quant_type == "nf4", (
                            "currently hpu supports nf4 quant_type only")

                    quant_state_dict[mapped_weight_name] = quant_state
                    yield org_weight_name, weight_tensor
                else:
@@ -380,8 +372,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                                    ...]

            # bitsandbytes requires data in GPU
            if (weight_sub_tensor.is_cuda
                    or weight_sub_tensor.device.type == "hpu"):
            if weight_sub_tensor.is_cuda:
                loaded_weight = weight_sub_tensor
            else:
                loaded_weight = weight_sub_tensor.to(
@@ -218,16 +218,6 @@ class DefaultModelLoader(BaseModelLoader):

            weights_iterator = _xla_weights_iterator(weights_iterator)

        elif current_platform.is_hpu():
            import habana_frameworks.torch.core as htcore

            def _hpu_weights_iterator(iterator: Generator):
                for weights in iterator:
                    yield weights
                    htcore.mark_step()

            weights_iterator = _hpu_weights_iterator(weights_iterator)

        if self.counter_before_loading_weights == 0.0:
            self.counter_before_loading_weights = time.perf_counter()
        # Apply the prefix.
@@ -116,23 +116,6 @@ def rocm_platform_plugin() -> Optional[str]:
    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None


def hpu_platform_plugin() -> Optional[str]:
    is_hpu = False
    logger.debug("Checking if HPU platform is available.")
    try:
        from importlib import util
        is_hpu = util.find_spec('habana_frameworks') is not None
        if is_hpu:
            logger.debug("Confirmed HPU platform is available.")
        else:
            logger.debug("HPU platform is not available because "
                         "habana_frameworks is not found.")
    except Exception as e:
        logger.debug("HPU platform is not available because: %s", str(e))

    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None


def xpu_platform_plugin() -> Optional[str]:
    is_xpu = False
    logger.debug("Checking if XPU platform is available.")
@@ -208,7 +191,6 @@ builtin_platform_plugins = {
    'tpu': tpu_platform_plugin,
    'cuda': cuda_platform_plugin,
    'rocm': rocm_platform_plugin,
    'hpu': hpu_platform_plugin,
    'xpu': xpu_platform_plugin,
    'cpu': cpu_platform_plugin,
    'neuron': neuron_platform_plugin,
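Out-of-tree platforms can still register through the plugin mechanism; a sketch of the detection-plugin pattern the removed `hpu_platform_plugin` followed (module and class paths below are placeholders):

```python
from importlib import util
from typing import Optional


def example_platform_plugin() -> Optional[str]:
    """Return the platform class path if the vendor package is importable, else None."""
    try:
        if util.find_spec("example_vendor_framework") is not None:
            return "example_plugin.platform.ExamplePlatform"
    except Exception:
        pass
    return None
```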
@@ -1,114 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
from typing import TYPE_CHECKING, Optional

import torch

from vllm import envs
from vllm.logger import init_logger
from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS

from .interface import Platform, PlatformEnum, _Backend

if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    VllmConfig = None

logger = init_logger(__name__)


class HpuPlatform(Platform):
    _enum = PlatformEnum.HPU
    device_name: str = "hpu"
    device_type: str = "hpu"
    dispatch_key: str = "HPU"
    ray_device_key: str = "HPU"
    dist_backend: str = "hccl"
    device_control_env_var: str = "HABANA_VISIBLE_MODULES"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        logger.info("Using HPUAttention backend.")
        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return True

    @classmethod
    def inference_mode(cls):
        return torch.no_grad()

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.hpu.set_device(device)

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

        scheduler_config = vllm_config.scheduler_config
        parallel_config = vllm_config.parallel_config
        if scheduler_config.is_multi_step:
            parallel_config.worker_cls = \
                "vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker"

        if vllm_config.speculative_config is not None:
            raise NotImplementedError(
                "Speculative decoding is not implemented for HPU")

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"

        # NOTE(kzawora): default block size for Gaudi should be 128
        # smaller sizes still work, but very inefficiently
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 128
        if (parallel_config.distributed_executor_backend == 'mp'
                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
                              None) is not None:
                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                               "might cause application hangs on exit. Using "
                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
                               "as it was explicitly requested.")
            else:
                logger.warning(
                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
                    "might cause application hangs on exit. Setting "
                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                    "To override that behavior, please set "
                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        if vllm_config.model_config and vllm_config.model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            vllm_config.scheduler_config.enable_chunked_prefill = False
            vllm_config.scheduler_config.chunked_prefill_enabled = False
            vllm_config.scheduler_config.max_num_batched_tokens = max(
                vllm_config.scheduler_config.max_model_len,
                DEFAULT_MAX_NUM_BATCHED_TOKENS)

    @classmethod
    def is_pin_memory_available(cls):
        logger.warning("Pin memory is not supported on HPU.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
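Most of the removed `check_and_update_config` is plain config defaulting; a hedged sketch of that shape of logic against a generic config object (the dataclass is hypothetical, the defaults mirror the removed code):

```python
import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheConfigSketch:
    block_size: Optional[int] = None


def apply_platform_defaults(cache_config: CacheConfigSketch) -> None:
    # Prefer a large block size; smaller values work but are inefficient on Gaudi.
    if cache_config.block_size is None:
        cache_config.block_size = 128
    # Fork-based workers can hang on exit; default to spawn unless the user insisted.
    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") is None:
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
```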
@@ -54,7 +54,6 @@ class _Backend(enum.Enum):
    FLASHMLA_VLLM_V1 = enum.auto()
    FLASHMLA = enum.auto()  # Supported by V1
    CUTLASS_MLA_VLLM_V1 = enum.auto()
    HPU_ATTN = enum.auto()
    PALLAS = enum.auto()
    PALLAS_VLLM_V1 = enum.auto()
    IPEX = enum.auto()
@@ -69,7 +68,6 @@ class PlatformEnum(enum.Enum):
    CUDA = enum.auto()
    ROCM = enum.auto()
    TPU = enum.auto()
    HPU = enum.auto()
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
@@ -154,9 +152,6 @@ class Platform:
    def is_tpu(self) -> bool:
        return self._enum == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        return self._enum == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        return self._enum == PlatformEnum.XPU

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import logging
import os
from typing import Any, Callable

import torch
@@ -75,18 +74,6 @@ def load_general_plugins():
    if current_platform.is_xpu():
        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
        torch._dynamo.config.disable = True
    elif current_platform.is_hpu():
        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
        # does not support torch.compile
        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
        # torch.compile support
        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
        if is_lazy:
            torch._dynamo.config.disable = True
            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
            # requires enabling lazy collectives
            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html  # noqa: E501
            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'

    plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP)
    # general plugins, we only need to execute the loaded functions
File diff suppressed because it is too large
@@ -1,485 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
###############################################################################
|
||||
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
|
||||
###############################################################################
|
||||
|
||||
import contextlib
|
||||
import gc
|
||||
import os
|
||||
from typing import List, Optional, Set, Tuple, Type
|
||||
|
||||
import habana_frameworks.torch as htorch # noqa:F401
|
||||
import torch
|
||||
import torch.distributed
|
||||
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ParallelConfig, VllmConfig
|
||||
from vllm.distributed import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import bind_kv_cache
|
||||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.worker.hpu_model_runner import HPUModelRunner
|
||||
from vllm.worker.model_runner_base import ModelRunnerBase
|
||||
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
|
||||
WorkerInput)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HPUWorker(LocalOrDistributedWorkerBase):
|
||||
"""A worker class that executes (a partition of) the model on a HPU.
|
||||
|
||||
Each worker is associated with a single HPU. The worker is responsible for
|
||||
maintaining the KV cache and executing the model on the HPU. In case of
|
||||
distributed inference, each worker is assigned a partition of the model.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
local_rank: int,
|
||||
rank: int,
|
||||
distributed_init_method: str,
|
||||
is_driver_worker: bool = False,
|
||||
model_runner_cls: Optional[Type[ModelRunnerBase]] = None,
|
||||
) -> None:
|
||||
WorkerBase.__init__(self, vllm_config=vllm_config)
|
||||
self.parallel_config.rank = rank
|
||||
self.local_rank = local_rank
|
||||
self.rank = rank
|
||||
self.distributed_init_method = distributed_init_method
|
||||
self.is_driver_worker = is_driver_worker
|
||||
if self.is_driver_worker:
|
||||
assert self.rank == 0, "The driver worker must have rank 0."
|
||||
|
||||
if self.model_config.trust_remote_code:
|
||||
# note: lazy import to avoid importing torch before initializing
|
||||
from vllm.utils import init_cached_hf_modules
|
||||
init_cached_hf_modules()
|
||||
|
||||
self.model_runner: HPUModelRunner = HPUModelRunner(
|
||||
vllm_config=vllm_config, is_driver_worker=is_driver_worker)
|
||||
# Uninitialized cache engine. Will be initialized by
|
||||
# initialize_cache.
|
||||
self.cache_engine: List[HPUCacheEngine]
|
||||
# Initialize gpu_cache as pooling models don't initialize kv_caches
|
||||
self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
|
||||
# Torch profiler. Enabled and configured through env vars:
|
||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
logger.info("Profiling enabled. Traces will be saved to: %s",
|
||||
torch_profiler_trace_dir)
|
||||
self.profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.HPU,
|
||||
],
|
||||
with_stack=True,
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
torch_profiler_trace_dir, use_gzip=True))
|
||||
else:
|
||||
self.profiler = None
|
||||
|
||||
def start_profile(self):
|
||||
if self.profiler is None:
|
||||
raise RuntimeError("Profiler is not enabled.")
|
||||
self.profiler.start()
|
||||
|
||||
def stop_profile(self):
|
||||
if self.profiler is None:
|
||||
raise RuntimeError("Profiler is not enabled.")
|
||||
self.profiler.stop()
|
||||
|
||||
def _set_env_vars(self):
|
||||
local_rank = self.local_rank
|
||||
if self.parallel_config.world_size == 1:
|
||||
local_rank = -1
|
||||
import os
|
||||
os.environ["LOCAL_RANK"] = str(local_rank)
|
||||
os.environ["ID"] = str(local_rank)
|
||||
os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size)
|
||||
os.environ["RANK"] = str(self.rank)
|
||||
|
||||
def init_device(self) -> None:
|
||||
if self.device_config.device.type == "hpu":
|
||||
self.device = torch.device("hpu")
|
||||
torch.hpu.set_device(self.device)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Not support device type: {self.device_config.device}")
|
||||
# Initialize the distributed environment.
|
||||
if self.model_config.quantization == 'inc':
|
||||
self._set_env_vars()
|
||||
init_worker_distributed_environment(self.parallel_config, self.rank,
|
||||
self.distributed_init_method,
|
||||
self.local_rank)
|
||||
# Set random seed.
|
||||
set_random_seed(self.model_config.seed)
|
||||
|
||||
def load_model(self):
|
||||
self.model_runner.load_model()
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None,
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
# VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
|
||||
# VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
|
||||
# VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
|
||||
# VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
|
||||
log_graph_compilation_all = os.environ.get(
|
||||
'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
|
||||
log_graph_compilation = os.environ.get(
|
||||
'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
|
||||
'0') != '0' or log_graph_compilation_all
|
||||
log_cpu_fallbacks_all = os.environ.get(
|
||||
'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
|
||||
log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
|
||||
'0') != '0' or log_cpu_fallbacks_all
|
||||
if (log_graph_compilation or log_cpu_fallbacks) and \
|
||||
execute_model_req is not None:
|
||||
from habana_frameworks.torch.hpu.metrics import metric_localcontext
|
||||
seq_group_metadata_list = execute_model_req.seq_group_metadata_list
|
||||
is_prompt = any([
|
||||
seq_group_metadata.is_prompt
|
||||
for seq_group_metadata in seq_group_metadata_list
|
||||
])
|
||||
max_context_len = max([
|
||||
max([
|
||||
len(v.prompt_token_ids) + len(v.output_token_ids)
|
||||
for v in seq_group_metadata.seq_data.values()
|
||||
]) for seq_group_metadata in seq_group_metadata_list
|
||||
]) # whoa, that's some spicy stuff right here
|
||||
max_num_blocks = (
|
||||
(max_context_len - 1) // self.cache_config.block_size) + 1
|
||||
input_stats = (f'is_prompt: {is_prompt}, '
|
||||
f'num_seqs: {len(seq_group_metadata_list)}, '
|
||||
f'max_context_len: {max_context_len}, '
|
||||
f'max_num_blocks {max_num_blocks}')
|
||||
gc_ctx = metric_localcontext(
|
||||
"graph_compilation"
|
||||
) if log_graph_compilation else contextlib.nullcontext()
|
||||
cpu_fallback_ctx = metric_localcontext(
|
||||
"cpu_fallback"
|
||||
) if log_cpu_fallbacks else contextlib.nullcontext()
|
||||
with gc_ctx as gc_local_metric, \
|
||||
cpu_fallback_ctx as cpu_fallback_local_metric:
|
||||
output = LocalOrDistributedWorkerBase.execute_model(
|
||||
self, execute_model_req)
|
||||
if (log_graph_compilation and gc_local_metric.stats()[0][1]
|
||||
> 0) or log_graph_compilation_all:
|
||||
msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
|
||||
f"{gc_local_metric.stats()}, {input_stats}")
|
||||
logger.warning(msg)
|
||||
if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1]
|
||||
> 0) or log_cpu_fallbacks_all:
|
||||
msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
|
||||
f"{cpu_fallback_local_metric.stats()}, {input_stats}")
|
||||
logger.warning(msg)
|
||||
|
||||
return output
|
||||
|
||||
output = LocalOrDistributedWorkerBase.execute_model(
|
||||
self, execute_model_req)
|
||||
return output

    @torch.inference_mode()
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Profiles the peak memory usage of the model to determine how many
        KV blocks may be allocated without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

        Tip:
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.

        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
        with HabanaMemoryProfiler() as m:
            self.model_runner.profile_run()
            torch.hpu.synchronize()
        msg = ("Model profiling run "
               f"took {m.get_summary_string()}")
        logger.info(msg)
        # At this point we should've allocated the maximum workspace for all
        # recipes; the extra memory is used for graphs and cache blocks.
        free_hpu_memory = torch.hpu.mem_get_info()[0]

        cache_block_size = self.get_cache_block_size_bytes()
        graph_reserved_mem = (float(
            os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1'))
                              if not self.model_config.enforce_eager else 0)
        graph_headroom = 1 - graph_reserved_mem
        available_hpu_memory = free_hpu_memory * \
            self.cache_config.gpu_memory_utilization
        hpu_memory_margin = free_hpu_memory * (
            1 - self.cache_config.gpu_memory_utilization)
        self.model_runner.mem_margin = hpu_memory_margin
        cache_size_bytes = available_hpu_memory * graph_headroom
        graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom)
        msg = (
            f"Free device memory: {format_bytes(free_hpu_memory)}, "
            f"{format_bytes(available_hpu_memory)} usable "
            f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization}),"
            f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs "
            f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), "
            f"{format_bytes(cache_size_bytes)} reserved for KV cache")
        logger.info(msg)
        num_hpu_blocks = int(cache_size_bytes // cache_block_size)
        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
                             cache_block_size)
        num_hpu_blocks = max(num_hpu_blocks, 0)
        num_cpu_blocks = max(num_cpu_blocks, 0)
        self.model_runner.bucketing_ctx.num_hpu_blocks = num_hpu_blocks

        if self.model_runner.lora_manager:
            self.model_runner.remove_all_loras()

        gc.collect()
        return num_hpu_blocks, num_cpu_blocks
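To make the arithmetic above concrete, here is a small self-contained sketch with made-up numbers; the real values come from torch.hpu.mem_get_info(), the cache config, and get_cache_block_size_bytes():

GiB = 1024**3
free_hpu_memory = 32 * GiB            # assumed free device memory after profiling
gpu_memory_utilization = 0.9          # assumed engine setting
graph_reserved_mem = 0.1              # default of VLLM_GRAPH_RESERVED_MEM
cache_block_size = 2 * 1024**2        # assumed bytes per KV-cache block

available_hpu_memory = free_hpu_memory * gpu_memory_utilization
graph_headroom = 1 - graph_reserved_mem
cache_size_bytes = available_hpu_memory * graph_headroom             # ~25.9 GiB for KV cache
graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom)   # ~2.9 GiB for HPUGraphs
num_hpu_blocks = int(cache_size_bytes // cache_block_size)           # -> 13271 blocks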

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Allocate GPU and CPU KV cache with the specified number of blocks.

        This also warms up the model, which may record HPU graphs.
        """
        raise_if_cache_size_invalid(
            num_gpu_blocks, self.cache_config.block_size,
            self.model_config.max_model_len,
            self.parallel_config.pipeline_parallel_size)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        with HabanaMemoryProfiler() as m:
            self._init_cache_engine()
            torch.hpu.synchronize()
        msg = ("Initializing cache engine "
               f"took {m.get_summary_string()}")
        logger.info(msg)
        self._warm_up_model()

    def _init_cache_engine(self):
        assert self.cache_config.num_gpu_blocks is not None
        self.cache_engine = [
            HPUCacheEngine(self.cache_config, self.model_config,
                           self.parallel_config, self.device_config)
            for _ in range(self.parallel_config.pipeline_parallel_size)
        ]
        self.hpu_cache = [
            self.cache_engine[ve].gpu_cache
            for ve in range(self.parallel_config.pipeline_parallel_size)
        ]
        bind_kv_cache(self.compilation_config.static_forward_context,
                      self.hpu_cache)

    def _warm_up_model(self) -> None:
        # NOTE(kzawora): We should use virtual engine index here
        # for pipeline parallelism. Using 0 for now.
        assert self.hpu_cache is not None
        self.model_runner.warmup_model(self.hpu_cache[0])
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)

    def finish_measurements(self):
        self.model_runner.finish_measurements()

    @property
    def do_metadata_broadcast(self) -> bool:
        return self.parallel_config.tensor_parallel_size > 1

    @property
    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
        return self.hpu_cache

    @torch.inference_mode()
    def prepare_worker_input(
            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
        virtual_engine = execute_model_req.virtual_engine
        num_seq_groups = len(execute_model_req.seq_group_metadata_list)
        # `blocks_to_swap_in` and `blocks_to_swap_out` are CPU tensors.
        # They hold the parameters for the host<->device block copies.
        blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in,
                                         device="cpu",
                                         dtype=torch.int64).view(-1, 2)
        blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out,
                                          device="cpu",
                                          dtype=torch.int64).view(-1, 2)
        # `blocks_to_copy` is a device tensor. The src and tgt of
        # blocks to copy are on the same device, so `blocks_to_copy`
        # can be consumed directly by device-side kernels.
        blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                      device=self.device,
                                      dtype=torch.int64).view(-1, 2)

        return WorkerInput(
            num_seq_groups=num_seq_groups,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
            virtual_engine=virtual_engine,
        )
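As a quick illustration of the tensor layout built above (the block pairs themselves are hypothetical), each mapping list becomes an int64 tensor with one (source, destination) block pair per row:

import torch

blocks_to_swap_in = [(0, 5), (1, 6), (2, 7)]      # assumed swap-in mapping
swap_in = torch.tensor(blocks_to_swap_in, device="cpu",
                       dtype=torch.int64).view(-1, 2)
print(swap_in.shape)        # torch.Size([3, 2])

# An empty mapping still yields a well-formed (0, 2) tensor:
empty = torch.tensor([], device="cpu", dtype=torch.int64).view(-1, 2)
print(empty.shape)          # torch.Size([0, 2])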

    @torch.inference_mode()
    def execute_worker(self, worker_input: WorkerInput) -> None:
        virtual_engine = worker_input.virtual_engine
        # Issue cache operations.
        if (worker_input.blocks_to_swap_in is not None
                and worker_input.blocks_to_swap_in.numel() > 0):
            self.cache_engine[virtual_engine].swap_in(
                worker_input.blocks_to_swap_in)
        if (worker_input.blocks_to_swap_out is not None
                and worker_input.blocks_to_swap_out.numel() > 0):
            self.cache_engine[virtual_engine].swap_out(
                worker_input.blocks_to_swap_out)
        if (worker_input.blocks_to_copy is not None
                and worker_input.blocks_to_copy.numel() > 0):
            self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy)

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_runner.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_runner.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        return self.model_runner.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.model_runner.list_loras()

    def add_prompt_adapter(
            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def list_prompt_adapters(self) -> Set[int]:
        raise NotImplementedError(
            "Prompt Adapter is not implemented for HPU backend.")

    def shutdown_inc(self):
        self.model_runner.shutdown_inc()

    @property
    def max_model_len(self) -> int:
        return self.model_config.max_model_len

    @property
    def vocab_size(self) -> int:
        return self.model_runner.vocab_size

    def get_cache_block_size_bytes(self) -> int:
        """Get the size of a single KV cache block, in bytes."""
        return HPUCacheEngine.get_cache_block_size(self.cache_config,
                                                   self.model_config,
                                                   self.parallel_config)
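get_cache_block_size is presumably inherited from the CacheEngine base class (only _allocate_kv_cache is overridden below). As a rough, hedged sketch of the quantity this method returns, the per-block byte count typically scales with the block size, the KV head geometry, the layer count, and the dtype width; all numbers below are assumptions for illustration, not taken from a real model config:

block_size = 128              # tokens per KV-cache block (assumed)
num_kv_heads = 8              # assumed model geometry
head_size = 128
num_attention_layers = 32
dtype_size = 2                # bytes per element, e.g. bfloat16

# Key and value caches for every layer, for every token slot in the block.
cache_block_size_bytes = (2 * block_size * num_kv_heads * head_size *
                          num_attention_layers * dtype_size)
print(cache_block_size_bytes)   # 16777216 bytes == 16 MiB per block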


def init_worker_distributed_environment(
        parallel_config: ParallelConfig,
        rank: int,
        distributed_init_method: Optional[str] = None,
        local_rank: int = -1,
) -> None:
    """Initialize the distributed environment."""
    init_distributed_environment(parallel_config.world_size,
                                 rank,
                                 distributed_init_method,
                                 local_rank,
                                 backend=current_platform.dist_backend)

    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                      parallel_config.pipeline_parallel_size)

    if torch.distributed.is_initialized():
        torch_world_size = torch.distributed.get_world_size()
        if torch_world_size != parallel_config.world_size:
            raise RuntimeError(
                "torch.distributed is already initialized but the torch world "
                "size does not match parallel_config.world_size "
                f"({torch_world_size} vs. {parallel_config.world_size}).")
    elif not distributed_init_method:
        raise ValueError(
            "distributed_init_method must be set if torch.distributed "
            "is not already initialized")
    else:
        torch.distributed.init_process_group(
            backend="hccl",
            world_size=parallel_config.world_size,
            rank=rank,
            init_method=distributed_init_method,
        )

    # A small all_reduce for warmup & checking conformance.
    dummy_tensor_hpu = torch.ones(1).to('hpu')
    torch.distributed.all_reduce(dummy_tensor_hpu)
    assert dummy_tensor_hpu.item() == parallel_config.world_size
    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                      parallel_config.pipeline_parallel_size)


def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len,
                                pipeline_parallel_size) -> None:
    if num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")
    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) "
            "is larger than the maximum number of tokens that can be "
            f"stored in KV cache ({max_seq_len}). Try increasing "
            "`gpu_memory_utilization` or decreasing `max_model_len` when "
            "initializing the engine.")


class HPUCacheEngine(CacheEngine):

    def _allocate_kv_cache(
        self,
        num_blocks: int,
        device: str,
    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
        """Allocates KV cache on the specified device."""
        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size)
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = []
        for _ in range(self.num_attention_layers):
            key_cache = torch.zeros(kv_cache_shape,
                                    dtype=self.dtype,
                                    device=device)
            value_cache = torch.zeros(kv_cache_shape,
                                      dtype=self.dtype,
                                      device=device)
            kv_layer = (key_cache, value_cache)
            kv_cache.append(kv_layer)
        return kv_cache
@ -1,123 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

###############################################################################
# Copyright (C) 2025 Habana Labs, Ltd. an Intel Company
###############################################################################

import dataclasses
from typing import Dict, Optional, Tuple

import torch

from vllm.distributed import broadcast_tensor_dict
from vllm.sequence import ExecuteModelRequest
from vllm.worker.hpu_model_runner import ModelInputForHPU
from vllm.worker.hpu_worker import HPUWorker
from vllm.worker.worker_base import WorkerInput


class MultiStepHPUWorker(HPUWorker):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cached_model_input: Optional[ModelInputForHPU] = None

    def _get_driver_input_and_broadcast(
        self, execute_model_req: ExecuteModelRequest
    ) -> Tuple[ModelInputForHPU, WorkerInput, Dict[str, torch.Tensor]]:
        """
        Get the driver input and broadcast it to other workers.
        """
        assert self.is_driver_worker
        assert execute_model_req.virtual_engine == 0

        is_first_multi_step = execute_model_req.is_first_multi_step
        is_last_step = execute_model_req.is_last_step

        if is_first_multi_step:
            # On the first step, prepare the worker input and model input
            # normally.
            worker_input: WorkerInput = self.prepare_worker_input(
                execute_model_req=execute_model_req)
            worker_input = dataclasses.replace(
                worker_input,
                num_steps=execute_model_req.num_lookahead_slots + 1)
            model_input: ModelInputForHPU = (
                self.model_runner.prepare_model_input(
                    execute_model_req.seq_group_metadata_list,
                    execute_model_req.virtual_engine,
                    execute_model_req.finished_requests_ids))

            if execute_model_req.async_callback:
                model_input = dataclasses.replace(
                    model_input,
                    async_callback=execute_model_req.async_callback)
        else:
            # On subsequent steps, reuse the cached worker input and model
            # input.
            assert self.cached_model_input is not None
            model_input = self.cached_model_input
            worker_input = WorkerInput()

        model_input = dataclasses.replace(
            model_input,
            is_first_multi_step=is_first_multi_step,
            is_last_step=is_last_step)

        if self.do_metadata_broadcast:
            if is_first_multi_step:
                broadcast_data = worker_input.as_broadcastable_tensor_dict()
                broadcast_data.update(
                    model_input.as_broadcastable_tensor_dict())
                broadcast_tensor_dict(broadcast_data, src=0)
            else:
                broadcast_data = {
                    "is_first_multi_step": is_first_multi_step,
                    "is_last_step": is_last_step,
                }
                broadcast_tensor_dict(broadcast_data, src=0)

        # Return an empty dict here to keep this compatible with
        # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`.
        return model_input, worker_input, {}
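In other words, the driver broadcasts one of two payload shapes, sketched below with plain dictionaries (the real code sends broadcastable tensor dicts; every field name other than the two step flags is illustrative):

# First multi-step iteration: full worker input plus model input.
first_step_payload = {
    "num_seq_groups": 8,            # example worker-input field
    "num_steps": 4,                 # num_lookahead_slots + 1
    # ... plus the broadcastable model-input entries ...
}

# Every subsequent iteration: only the two step flags.
later_step_payload = {
    "is_first_multi_step": False,
    "is_last_step": True,
}
# Followers recognize the second form by its length (len(...) == 2) in
# prepare_input() below and fall back to their cached model input.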

    def prepare_input(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None,
    ) -> Optional[Tuple[ModelInputForHPU, WorkerInput, Dict[str,
                                                            torch.Tensor]]]:
        if self.is_driver_worker:
            if execute_model_req is None:
                if self.do_metadata_broadcast:
                    # This signals that there are no more requests to process
                    # for now. All workers are running an infinite loop with
                    # broadcast_tensor_dict, and they stop the loop when the
                    # driver broadcasts an empty input. Send an empty input to
                    # notify all other workers to stop their execution loop.
                    broadcast_tensor_dict({}, src=0)
                return None
            model_input, worker_input, _ = self._get_driver_input_and_broadcast(
                execute_model_req)
            if model_input.is_first_multi_step:
                self.cached_model_input = model_input
            return model_input, worker_input, {}
        else:
            broadcast_data = broadcast_tensor_dict(src=0)
            if not broadcast_data:
                return None

            if len(broadcast_data) == 2:
                assert self.cached_model_input is not None
                self.cached_model_input = dataclasses.replace(
                    self.cached_model_input,
                    is_first_multi_step=broadcast_data["is_first_multi_step"],
                    is_last_step=broadcast_data["is_last_step"])
                empty_worker_input = WorkerInput()
                return self.cached_model_input, empty_worker_input, {}

            worker_input = WorkerInput.from_broadcasted_tensor_dict(
                broadcast_data)
            model_input = (
                self.model_runner.
                make_model_input_from_broadcasted_tensor_dict(broadcast_data))
            self.cached_model_input = model_input
            return model_input, worker_input, {}
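For completeness, a hedged sketch of how a follower worker would typically drive this method in its step loop; the loop itself is not part of this file and the `worker` object is hypothetical:

# Hypothetical follower-side loop: keep preparing inputs until the driver
# broadcasts an empty dict, which makes prepare_input() return None.
while True:
    prepared = worker.prepare_input()        # `worker` is a MultiStepHPUWorker
    if prepared is None:
        break                                # driver signalled end of the loop
    model_input, worker_input, kwargs = prepared
    # ... run the model step with model_input / worker_input ...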