gloo: support ibverbs in cmake (#153425)

This updates the gloo submodule in PyTorch to a version that includes the new ibverbs backend, and adds a `USE_GLOO_IBVERBS` CMake option so that PyTorch can be built with it.

Test plan:

```
sudo dnf install rdma-core-devel
USE_GLOO_IBVERBS=ON python setup.py develop
torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
```

```py
"""
run with:

torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
"""

# Smoke test for the Gloo ibverbs transport: one all_reduce followed by one
# send/recv, with rank 0 holding CPU tensors and every other rank holding
# CUDA tensors. The expected values below assume exactly two ranks.
import os

# Set before torch is imported so the value is already in the environment
# when the process group is created.
os.environ["GLOO_DEVICE_TRANSPORT"] = "IBVERBS"

import torch
import torch.distributed as dist

dist.init_process_group("gloo")

rank = dist.get_rank()

# Mix device types across ranks: rank 0 stays on the CPU, the others use CUDA.
if rank == 0:
    device = "cpu"
else:
    device = "cuda"

print(device)

# Each rank contributes a tensor filled with (rank + 1); with two ranks the
# default (sum) all_reduce should leave 1 + 2 = 3 everywhere.
t = torch.full((10, 100), fill_value=(rank+1), device=device)
target = torch.full((10, 100), fill_value=3, device=device)

dist.all_reduce(t)

torch.testing.assert_close(t, target)

# Re-create the per-rank tensor for the point-to-point check.
t = torch.full((10, 100), fill_value=(rank+1), device=device)

if rank == 0:
    # Rank 0's tensor is all ones (rank + 1 == 1); send it to rank 1.
    dist.send(t, dst=1)
else:
    # The receive overwrites this rank's tensor in place, so it should now
    # hold rank 0's ones instead of its own fill value.
    dist.recv(t, src=0)
    torch.testing.assert_close(t, torch.full_like(t, 1))
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153425
Approved by: https://github.com/fduwjj
This commit is contained in:
Tristan Rice
2025-05-13 17:09:00 +00:00
committed by PyTorch MergeBot
parent dde705864a
commit 9c3cef437c
5 changed files with 13 additions and 2 deletions

View File

@ -331,6 +331,9 @@ cmake_dependent_option(
# OpenSSL support for Gloo. Defaults to OFF and is forced OFF whenever the
# condition "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" does not hold.
cmake_dependent_option(
USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
"USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
# ibverbs backend for Gloo. Same gating condition as USE_GLOO_WITH_OPENSSL:
# defaults to OFF and is forced OFF when the condition is not met.
cmake_dependent_option(
USE_GLOO_IBVERBS "Use Gloo with ibverbs backend. Only available if USE_GLOO is on." OFF
"USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
# c10d Gloo support. Defaults to ON when both USE_DISTRIBUTED and USE_GLOO are
# enabled; forced OFF otherwise.
cmake_dependent_option(
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(

View File

@ -1217,6 +1217,10 @@ if(USE_GLOO)
set(GLOO_INSTALL OFF CACHE BOOL "" FORCE)
set(GLOO_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
if(USE_GLOO_IBVERBS)
set(USE_IBVERBS ON)
endif()
# Temporarily override variables to avoid building Gloo tests/benchmarks
set(__BUILD_TEST ${BUILD_TEST})
set(__BUILD_BENCHMARK ${BUILD_BENCHMARK})

View File

@ -188,6 +188,7 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")

View File

@ -151,7 +151,10 @@ static std::shared_ptr<::gloo::transport::Device> makeIBVerbsDevice(
const std::string& interface,
const std::string& hostname,
bool lazyInit) {
TORCH_CHECK(hostname.empty(), "ibverbs transport does not support hostname");
if (!hostname.empty()) {
TORCH_WARN(
"ibverbs transport does not support hostname, defaulting to any");
}
TORCH_CHECK(!lazyInit, "transport does not support lazy init");