Mirror of https://github.com/vllm-project/vllm-ascend.git, synced 2025-10-20 13:43:53 +08:00
[Misc] Clean up unused imports from vllm (#2049)
Clean up unused imports from vllm to make the code clearer.
- vLLM version: v0.10.0
- vLLM main: 18cc33dd60
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
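Before the hunks, a quick way to reproduce this kind of cleanup: a rough unused-import check in the spirit of the flake8/ruff rule F401. This is a sketch, not tooling from this repo; it ignores re-exports and string annotations, so treat hits as candidates only.

import ast
import sys


def unused_imports(path: str) -> list[str]:
    """Report imported names that are never referenced in the module body."""
    with open(path, encoding="utf-8") as f:
        tree = ast.parse(f.read())
    imported = {}
    for node in ast.walk(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            for alias in node.names:
                # `import a.b` binds the name `a`; honor `as` renames.
                name = (alias.asname or alias.name).split(".")[0]
                imported[name] = node.lineno
    used = {n.id for n in ast.walk(tree) if isinstance(n, ast.Name)}
    return [f"{path}:{line}: {name}"
            for name, line in imported.items() if name not in used]


if __name__ == "__main__":
    for p in sys.argv[1:]:
        print("\n".join(unused_imports(p)))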
@@ -27,11 +27,11 @@ from vllm.attention.backends.utils import CommonAttentionState
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d, nd_to_nz_spec)
+from vllm_ascend.worker.npu_input_batch import InputBatch
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -25,11 +25,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
+from vllm_ascend.worker.npu_input_batch import InputBatch
 
 
 class AscendAttentionTorchairBackend(AttentionBackend):
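Both attention backends make the same one-line swap: the name InputBatch now binds to the plugin's NPU-aware class instead of vLLM's GPU one, so every call site keeps working unchanged. A minimal sketch of that import-indirection pattern; the fallback branch below is illustrative only, not something this commit adds:

try:
    from vllm_ascend.worker.npu_input_batch import InputBatch  # NPU variant
except ImportError:
    # Illustrative fallback for environments without the Ascend plugin.
    from vllm.v1.worker.gpu_input_batch import InputBatch  # GPU reference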
@@ -24,7 +24,8 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
 import torch
 from acl.rt import memcpy  # type: ignore # noqa: F401
 from vllm.logger import logger
-from vllm.utils import is_pin_memory_available
+
+from vllm_ascend.platform import NPUPlatform
 
 
 def find_loaded_library(lib_name) -> Optional[str]:
@@ -199,7 +200,7 @@ class CaMemAllocator:
             size_in_bytes,
             dtype=torch.uint8,
             device='cpu',
-            pin_memory=is_pin_memory_available())
+            pin_memory=NPUPlatform.is_pin_memory_available())
         cpu_ptr = cpu_backup_tensor.data_ptr()
         ACL_MEMCPY_DEVICE_TO_HOST = 2
         dest_max = cpu_ptr + size_in_bytes * 2
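The allocator change has two parts: the pin-memory capability check now goes through NPUPlatform, which this plugin owns, and the surrounding code stages NPU memory into a host tensor via pyACL's memcpy. A hedged sketch of that backup path, assuming memcpy takes (dst, dest_max, src, count, kind) as its call shape; backup_to_host and device_ptr are illustrative names, not code from this commit:

import torch
from acl.rt import memcpy  # type: ignore # noqa: F401

from vllm_ascend.platform import NPUPlatform

ACL_MEMCPY_DEVICE_TO_HOST = 2  # copy-direction constant, as in the diff


def backup_to_host(device_ptr: int, size_in_bytes: int) -> torch.Tensor:
    # Stage NPU memory into a CPU tensor, pinned when the platform allows it.
    cpu_backup_tensor = torch.empty(
        size_in_bytes,
        dtype=torch.uint8,
        device='cpu',
        pin_memory=NPUPlatform.is_pin_memory_available())
    cpu_ptr = cpu_backup_tensor.data_ptr()
    dest_max = cpu_ptr + size_in_bytes * 2  # mirrors the bound in the hunk above
    memcpy(cpu_ptr, dest_max, device_ptr, size_in_bytes,
           ACL_MEMCPY_DEVICE_TO_HOST)
    return cpu_backup_tensor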
@@ -44,7 +44,6 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dp_group, get_pp_group,
                                              get_tp_group)
 from vllm.forward_context import get_forward_context
-from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
@@ -52,7 +51,6 @@ from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models.interfaces import supports_transcription
 from vllm.model_executor.models.interfaces_base import (
     VllmModelForPooling, is_pooling_model, is_text_generation_model)
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.pooling_params import PoolingParams
@@ -60,7 +58,6 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv)
-from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
@@ -169,13 +166,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.device = device
         self.dtype = self.model_config.dtype
         self.sampler = Sampler()
-        # Multi-modal data support
-        self.input_registry = INPUT_REGISTRY
-        self.mm_registry = MULTIMODAL_REGISTRY
-        self.max_num_encoder_input_tokens, self.encoder_cache_size = compute_encoder_budget(
-            model_config=self.model_config,
-            scheduler_config=self.scheduler_config,
-            mm_registry=self.mm_registry)
 
         # Lazy initialization, these will be set after __init__
         self.kv_caches: List[torch.Tensor] = []
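After an import cleanup like this, a cheap smoke test is to import the touched modules and let Python surface any name that was removed but is still referenced. A sketch, not part of the commit; vllm_ascend.attention.attention_v1 appears in the diff, while the model-runner module path is an assumption about the repo layout:

import importlib

for mod in (
        "vllm_ascend.attention.attention_v1",
        "vllm_ascend.worker.model_runner_v1",  # assumed module path
):
    importlib.import_module(mod)  # raises if the cleanup broke a reference
    print(f"{mod}: OK")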