[Misc]Clean up useless import from vllm (#2049)

Clean up useless imports from vllm to make the code clearer.

- vLLM version: v0.10.0
- vLLM main: 18cc33dd60

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-07-28 16:01:59 +08:00 (committed by GitHub)
Parent: 34cfdf5520
Commit: 4a008c4dac
4 changed files with 5 additions and 14 deletions

@@ -27,11 +27,11 @@ from vllm.attention.backends.utils import CommonAttentionState
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d, nd_to_nz_spec)
+from vllm_ascend.worker.npu_input_batch import InputBatch
 class AscendAttentionBackend(AttentionBackend):
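Both attention backends now take InputBatch from vllm_ascend's own worker package instead of vllm's GPU worker internals. The replacement module itself is not shown in this commit; a minimal sketch of what a simple re-export shim for vllm_ascend/worker/npu_input_batch.py could look like (an assumption for illustration, not the actual module):

# Hypothetical re-export shim (assumption): gives Ascend code a single,
# plugin-owned import point for InputBatch, so backends stop importing from
# vllm's GPU worker package and the class can later diverge for NPUs.
from vllm.v1.worker.gpu_input_batch import InputBatch  # noqa: F401

__all__ = ["InputBatch"]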

@@ -25,11 +25,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import PAD_SLOT_ID, CommonAttentionState
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                                nd_to_nz_2d)
+from vllm_ascend.worker.npu_input_batch import InputBatch
 class AscendAttentionTorchairBackend(AttentionBackend):

@@ -24,7 +24,8 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union
 import torch
 from acl.rt import memcpy  # type: ignore # noqa: F401
 from vllm.logger import logger
-from vllm.utils import is_pin_memory_available
+from vllm_ascend.platform import NPUPlatform
 def find_loaded_library(lib_name) -> Optional[str]:
@@ -199,7 +200,7 @@ class CaMemAllocator:
             size_in_bytes,
             dtype=torch.uint8,
             device='cpu',
-            pin_memory=is_pin_memory_available())
+            pin_memory=NPUPlatform.is_pin_memory_available())
         cpu_ptr = cpu_backup_tensor.data_ptr()
         ACL_MEMCPY_DEVICE_TO_HOST = 2
         dest_max = cpu_ptr + size_in_bytes * 2
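The allocator now asks the platform class whether pinned host memory can be used, rather than importing the helper from vllm.utils; NPUPlatform presumably inherits is_pin_memory_available() from vllm's Platform base class. A stand-alone sketch of the resulting call-site pattern (backup_to_cpu is a hypothetical helper name, not code from this commit):

# Sketch of the pattern used above, not the actual CaMemAllocator method.
import torch

from vllm_ascend.platform import NPUPlatform


def backup_to_cpu(size_in_bytes: int) -> torch.Tensor:
    # Pin the host-side backup buffer only when the platform reports support,
    # so the same path still works where pinned memory is unavailable.
    return torch.empty(size_in_bytes,
                       dtype=torch.uint8,
                       device='cpu',
                       pin_memory=NPUPlatform.is_pin_memory_available())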

@@ -44,7 +44,6 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dp_group, get_pp_group,
                                              get_tp_group)
 from vllm.forward_context import get_forward_context
-from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
@@ -52,7 +51,6 @@ from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models.interfaces import supports_transcription
 from vllm.model_executor.models.interfaces_base import (
     VllmModelForPooling, is_pooling_model, is_text_generation_model)
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.pooling_params import PoolingParams
@@ -60,7 +58,6 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         LazyLoader, cdiv)
-from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
@@ -169,13 +166,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.device = device
         self.dtype = self.model_config.dtype
         self.sampler = Sampler()
-        # Multi-modal data support
-        self.input_registry = INPUT_REGISTRY
-        self.mm_registry = MULTIMODAL_REGISTRY
-        self.max_num_encoder_input_tokens, self.encoder_cache_size = compute_encoder_budget(
-            model_config=self.model_config,
-            scheduler_config=self.scheduler_config,
-            mm_registry=self.mm_registry)
         # Lazy initialization, these will be set after __init__
         self.kv_caches: List[torch.Tensor] = []
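The NPU model runner no longer caches vllm's multimodal registries or the encoder budget in __init__. For reference, the deleted wiring boiled down to one call into vllm, sketched stand-alone below using only the names visible in the removed lines (shown to document the dropped dependency; whether vllm-ascend computes this elsewhere is not part of this commit):

# Stand-alone sketch of what the deleted block computed (for reference only).
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget


def encoder_budget(model_config, scheduler_config):
    # Returns (max_num_encoder_input_tokens, encoder_cache_size) for the model.
    return compute_encoder_budget(model_config=model_config,
                                  scheduler_config=scheduler_config,
                                  mm_registry=MULTIMODAL_REGISTRY)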