[Bugfix][EP+DP] Fix internode check (#19112)

Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
This commit is contained in:
Tyler Michael Smith
2025-06-04 11:39:23 -04:00
committed by GitHub
parent c8dcc15921
commit d459fae0a2
2 changed files with 1 addition and 8 deletions

View File

@@ -84,10 +84,6 @@ class PPLXAll2AllManager(All2AllManagerBase):
assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." # noqa
super().__init__(cpu_group)
# TODO(tms): Disable pplx-a2a intranode as it fails with the error:
# failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa
self.internode = True
if self.internode:
# inter-node communication needs nvshmem,
# intra-node communication uses p2p mapping directly
@@ -178,7 +174,6 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
num_rdma_bytes = 1024 * 1024 * 1024
num_qps_per_rank = self.num_sms // 2
else:
assert self.intranode
num_rdma_bytes = 0
num_qps_per_rank = 1
@@ -243,7 +238,6 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
if self.internode:
num_rdma_bytes = 1024 * 1024 * 1024
else:
assert self.intranode
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
hidden=token_hidden_size,

View File

@@ -49,8 +49,7 @@ class All2AllManagerBase:
# all2all communication often has separate implementations for
# intra-node and inter-node communication
self.intranode = in_the_same_node_as(cpu_group, source_rank=0)
self.internode = not self.intranode
self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
def get_handle(self, kwargs):
# get a handle for the all2all communication,