[Bugfix] Fix HPU multiprocessing executor (#12167)
Signed-off-by: Konrad Zawora <kzawora@habana.ai>
@@ -1293,7 +1293,7 @@ class ParallelConfig:
             raise ValueError(f"worker-use-ray can't be used with "
                              f"distributed executor backend "
                              f"'{self.distributed_executor_backend}'.")
-        ray_only_devices = ["tpu", "hpu"]
+        ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
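Dropping "hpu" from ray_only_devices means ParallelConfig no longer forces the Ray backend for multi-HPU runs, so the multiprocessing ("mp") executor becomes a legal choice. Below is a minimal standalone sketch of the validation pattern this hunk relaxes; the function and parameter names are illustrative stand-ins, not the real vLLM config fields.

# Standalone sketch of the Ray-only device check; illustrative names only.
from typing import Optional


def validate_executor_backend(device_type: str, world_size: int,
                              backend: Optional[str]) -> None:
    """Raise if a Ray-only device is combined with a non-Ray backend."""
    ray_only_devices = ["tpu"]  # "hpu" was removed from this list
    if (device_type in ray_only_devices and world_size > 1
            and backend not in (None, "ray")):
        raise ValueError(f"{device_type} only supports Ray for distributed "
                         f"inference, got backend {backend!r}.")


# After the fix, the multiprocessing executor passes validation on HPU:
validate_executor_backend("hpu", world_size=2, backend="mp")  # no error
try:
    validate_executor_backend("tpu", world_size=2, backend="mp")
except ValueError as err:
    print("still rejected:", err)
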
@@ -397,7 +397,7 @@ class EngineArgs:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only supports Ray for distributed inference.')

         parser.add_argument(
             '--worker-use-ray',
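With the help text updated, only TPU remains Ray-only. The sketch below shows how a multi-card HPU run might now request the multiprocessing executor explicitly; it assumes the standard vLLM LLM entrypoint forwards distributed_executor_backend to the engine arguments and uses a placeholder model name, so treat it as an unverified illustration rather than a confirmed invocation.

# Hypothetical usage sketch: tensor-parallel HPU inference with the mp
# executor. Verify argument names against your installed vLLM version.
import os

# Optional: force fork despite the warning added in this commit.
# os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"

from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",           # placeholder model name
    tensor_parallel_size=2,              # two HPU cards
    distributed_executor_backend="mp",   # no longer rejected for HPU
)
outputs = llm.generate(["Hello, HPU!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
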
@@ -1,7 +1,9 @@
+import os
 from typing import TYPE_CHECKING, Optional

 import torch

+from vllm import envs
 from vllm.logger import init_logger

 from .interface import Platform, PlatformEnum, _Backend
@@ -58,6 +60,22 @@ class HpuPlatform(Platform):
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 128
+        if (parallel_config.distributed_executor_backend == 'mp'
+                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
+                              None) is not None:
+                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                               "might cause application hangs on exit. Using "
+                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                               "as it was explicitly requested.")
+            else:
+                logger.warning(
+                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                    "might cause application hangs on exit. Setting "
+                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                    "To override that behavior, please set "
+                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
     @classmethod
     def is_pin_memory_available(cls):
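The new HpuPlatform check prefers 'spawn' because a forked worker can inherit already initialized runtime state and hang on exit, while an explicit user setting of VLLM_WORKER_MULTIPROC_METHOD still wins and only triggers a warning. The following self-contained sketch captures that precedence policy; the function name and logger are illustrative, not vLLM API.

# Sketch of the "explicit setting wins, otherwise prefer spawn" policy the
# commit adds for the mp executor on HPU. Illustrative names only.
import logging
import os

logger = logging.getLogger("hpu_mp_policy")


def resolve_worker_start_method(backend: str) -> str:
    """Decide the multiprocessing start method for HPU workers."""
    explicit = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
    if backend != "mp":
        return explicit or "fork"
    if explicit == "fork":
        # The user asked for fork explicitly: warn, but honor the request.
        logger.warning("fork may hang on exit on HPU; using it anyway "
                       "because it was explicitly requested.")
        return "fork"
    if explicit is None:
        # Nothing requested: default to spawn to avoid hangs on exit.
        logger.warning("defaulting to spawn on HPU; set "
                       "VLLM_WORKER_MULTIPROC_METHOD=fork to override.")
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        return "spawn"
    return explicit  # e.g. the user already chose "spawn"


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    print(resolve_worker_start_method("mp"))
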
@@ -130,7 +130,6 @@ class HPUWorker(LocalOrDistributedWorkerBase):
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
-        assert execute_model_req is not None
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
@@ -144,7 +143,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
+        if (log_graph_compilation or log_cpu_fallbacks) and \
+            execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
             is_prompt = any([
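With the assert removed, HPUWorker.execute_model tolerates execute_model_req being None (which can happen under the multiprocessing executor, typically for non-driver workers that receive their inputs via broadcast), and the graph-compilation/CPU-fallback logging only wraps the step when a request is actually present. Below is a reduced sketch of that guard with a stubbed metrics context standing in for habana_frameworks; names and structure are illustrative, not the worker's real implementation.

# Reduced sketch: enter the metrics context only when logging is enabled
# AND there is a request to describe; otherwise just run the step.
import contextlib
import os
from typing import Any, Optional


@contextlib.contextmanager
def metric_context(name: str):
    # Stand-in for habana_frameworks.torch.hpu.metrics.metric_localcontext.
    yield {"name": name, "stats": {}}


def run_step(req: Optional[Any]) -> None:
    pass  # placeholder for the actual model execution


def execute_model_step(execute_model_req: Optional[Any]) -> None:
    log_graph_compilation = os.environ.get(
        'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0'
    log_cpu_fallbacks = os.environ.get(
        'VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0'
    if (log_graph_compilation or log_cpu_fallbacks) and \
            execute_model_req is not None:
        with metric_context("graph_compilation") as gc_metric:
            run_step(execute_model_req)
        print("step metrics:", gc_metric)
    else:
        # A None request no longer trips an assertion; run without logging.
        run_step(execute_model_req)


execute_model_step(None)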