[Bugfix] Fix HPU multiprocessing executor (#12167)

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
Authored by Konrad Zawora on 2025-01-22 19:07:07 +01:00, committed by GitHub
parent 84bee4bd5c
commit 96f6a7596f
4 changed files with 22 additions and 4 deletions
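In short: HPU no longer has to fall back to Ray for multi-device inference and can run on the multiprocessing ("mp") executor, with the worker start method defaulting to "spawn" on HPU. A minimal usage sketch of what this enables (model name and parallel size are placeholders, not part of the commit):

# Hypothetical example: multi-card HPU inference through the "mp" executor.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=2,                    # spans two HPU devices
    distributed_executor_backend="mp",         # previously rejected on HPU
)
outputs = llm.generate("Hello, my name is")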

vllm/config.py

@@ -1293,7 +1293,7 @@ class ParallelConfig:
             raise ValueError(f"worker-use-ray can't be used with "
                              f"distributed executor backend "
                              f"'{self.distributed_executor_backend}'.")
-        ray_only_devices = ["tpu", "hpu"]
+        ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):

vllm/engine/arg_utils.py

@@ -397,7 +397,7 @@ class EngineArgs:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only supports Ray for distributed inference.')
         parser.add_argument(
             '--worker-use-ray',

vllm/platforms/hpu.py

@@ -1,7 +1,9 @@
+import os
 from typing import TYPE_CHECKING, Optional
 
 import torch
 
+from vllm import envs
 from vllm.logger import init_logger
 
 from .interface import Platform, PlatformEnum, _Backend
@@ -58,6 +60,22 @@ class HpuPlatform(Platform):
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 128
+        if (parallel_config.distributed_executor_backend == 'mp'
+                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
+                              None) is not None:
+                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                               "might cause application hangs on exit. Using "
+                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                               "as it was explicitly requested.")
+            else:
+                logger.warning(
+                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                    "might cause application hangs on exit. Setting "
+                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                    "To override that behavior, please set "
+                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
     @classmethod
     def is_pin_memory_available(cls):
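The platform hook above only rewrites the start method when the user did not set it themselves. A short sketch of the explicit override path the warning refers to (illustrative, not part of the diff):

import os

# Explicit opt-in keeps fork despite the hang warning; leaving the variable
# unset now makes HpuPlatform switch it to "spawn" before the "mp" executor
# creates its worker processes.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"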

vllm/worker/hpu_worker.py

@@ -130,7 +130,6 @@ class HPUWorker(LocalOrDistributedWorkerBase):
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
-        assert execute_model_req is not None
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
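Background for the two hunks in this file: under the "mp" executor only the driver rank is handed an ExecuteModelRequest; the other ranks call execute_model with None and receive their inputs via a broadcast inside the base worker class. The hard assert above therefore had to go, and the metric-logging path in the next hunk now checks for None first. A condensed sketch of the resulting guard (simplified names, not the actual worker code):

from typing import Optional

def hpu_metric_guard_sketch(execute_model_req: Optional[object],
                            log_metrics: bool) -> None:
    # Non-driver ranks under the "mp" backend legitimately pass None here.
    if log_metrics and execute_model_req is not None:
        seq_groups = execute_model_req.seq_group_metadata_list
        # ...wrap the forward pass in HPU graph-compilation / CPU-fallback
        # metric contexts, keyed off the metadata gathered above...
    # Otherwise fall through to the ordinary execution path.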
@@ -144,7 +143,8 @@ class HPUWorker(LocalOrDistributedWorkerBase):
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
+        if (log_graph_compilation or log_cpu_fallbacks) and \
+            execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
             is_prompt = any([