[Bugfix] Fix CustomAllreduce nvlink topology detection (#3974)

[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
2025-10-20 23:03:52 +08:00 · 2024-04-18 15:32:47 -07:00
parent cd2f63fb36
commit 8f9c28fd40
1 changed file with 4 additions and 2 deletions
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -145,8 +145,10 @@ def _is_full_nvlink(rank, world_size):
    for i in range(world_size):
        if i != rank:
            try:
-                link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
-                if not link_state:
+                peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                    handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                    return False
            except pynvml.NVMLError as error:
                logger.info(