[Bugfix] Fix CustomAllreduce nvlink topology detection (#3974)

[Bugfix] Fix CustomAllreduce pcie nvlink topology detection (#3974) (#4159)
This commit is contained in:
Adam Tilghman
2024-04-18 15:32:47 -07:00
committed by GitHub
parent cd2f63fb36
commit 8f9c28fd40

View File

@@ -145,8 +145,10 @@ def _is_full_nvlink(rank, world_size):
for i in range(world_size):
if i != rank:
try:
-link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
-if not link_state:
+peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+if p2p_status != pynvml.NVML_P2P_STATUS_OK:
return False
except pynvml.NVMLError as error:
logger.info(