[BugFix][CPU] Fix x86 SHM distributed module initialization (#18536)

Signed-off-by: jiang.li <jiang1.li@intel.com>
Author: Li, Jiang
Date: 2025-05-22 22:35:00 +08:00
Committed by: GitHub
Parent: 3f505233fd
Commit: 93f71673ce


@@ -22,8 +22,10 @@ class CpuCommunicator(DeviceCommunicatorBase):
         super().__init__(cpu_group, device, device_group, unique_name)
 
         self.dist_module = torch.distributed
-        if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \
-                and hasattr(torch.ops._C, "init_shm_manager"):
+        if (current_platform.get_cpu_architecture()
+                == CpuArchEnum.X86) and hasattr(
+                    torch.ops._C,
+                    "init_shm_manager") and unique_name.startswith("tp"):
             self.dist_module = _CPUSHMDistributed(self)
 
     def all_reduce(self, input_):
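
The guard above now enables the SHM backend only for tensor-parallel groups. A minimal standalone sketch of the gating logic (the enum stand-in, the helper name, and the example group names below are hypothetical illustrations, not part of this diff):

    from enum import Enum

    class CpuArchEnum(Enum):  # simplified stand-in for vLLM's platform enum
        X86 = "x86"
        ARM = "arm"

    def use_shm_backend(arch: CpuArchEnum, has_shm_op: bool,
                        unique_name: str) -> bool:
        # SHM-based collectives are initialized only for x86 tensor-parallel
        # ("tp...") groups with the custom op compiled in; all other groups
        # keep the plain torch.distributed module.
        return (arch is CpuArchEnum.X86 and has_shm_op
                and unique_name.startswith("tp"))

    assert use_shm_backend(CpuArchEnum.X86, True, "tp:0")      # SHM path
    assert not use_shm_backend(CpuArchEnum.X86, True, "pp:0")  # fallback
    assert not use_shm_backend(CpuArchEnum.ARM, True, "tp:0")  # non-x86 fallback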
@@ -96,6 +98,8 @@ class _CPUSHMDistributed:
 
     def __init__(self, communicator: CpuCommunicator):
         instance_identifier = os.environ["VLLM_DIST_IDENT"]
+        unique_name = communicator.unique_name
+        instance_identifier = f"{instance_identifier}-{unique_name}"
         self.communicator = communicator
 
         group_ranks = [str(rank) for rank in self.communicator.ranks]
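
This second hunk namespaces the shared-memory identifier by the communicator's unique group name, so two communicators created in the same process no longer collide on one SHM segment. A minimal sketch of the resulting identifier construction (the "vllm" fallback default and the example group names are assumptions for illustration):

    import os

    def shm_instance_identifier(unique_name: str) -> str:
        # As in the diff, the base identifier comes from VLLM_DIST_IDENT and
        # is suffixed with the group's unique name; the "vllm" fallback is an
        # assumption so the sketch runs without the variable set.
        instance_identifier = os.environ.get("VLLM_DIST_IDENT", "vllm")
        return f"{instance_identifier}-{unique_name}"

    # Distinct groups now map to distinct SHM identifiers:
    print(shm_instance_identifier("tp:0"))  # -> "vllm-tp:0"
    print(shm_instance_identifier("pp:0"))  # -> "vllm-pp:0"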