UCC PG build in CI (#81583)

- Modify the current CMake build definitions to use `find_package` to locate the UCX and UCC installations on the system
- Install UCX and UCC in the CUDA Docker images
- Build PyTorch with `USE_UCC=1` in the CI pipelines
- Unit tests are not yet run with the UCC process group (PG); those tests will be added in future PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81583
Approved by: https://github.com/vtlam, https://github.com/malfet
This commit is contained in:
Xiang Gao
2022-08-10 00:23:47 +00:00
committed by PyTorch MergeBot
parent b4f7e22640
commit cda210e23b
10 changed files with 89 additions and 22 deletions

View File

@ -84,6 +84,8 @@ if [[ "$image" == *xenial* ]] || [[ "$image" == *bionic* ]]; then
fi
TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/14.04/x86_64"
UCX_COMMIT=v1.13.x
UCC_COMMIT=a7bda274b10f8adf5bb729f01da064f4e735fb23
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
@ -147,6 +149,8 @@ case "$image" in
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
@ -157,6 +161,8 @@ case "$image" in
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-xenial-py3-clang5-asan)
ANACONDA_PYTHON_VERSION=3.7
@ -277,6 +283,8 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@ -286,6 +294,8 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
*)
# Catch-all for builds that are not hardcoded.
@ -375,6 +385,8 @@ docker build \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -67,7 +67,8 @@ install_ubuntu() {
wget \
sudo \
vim \
jq
jq \
libtool
# Should resolve issues related to various apt package repository cert issues
# see: https://github.com/pytorch/pytorch/issues/65931

View File

@ -0,0 +1,41 @@
#!/bin/bash
set -ex
# Build UCX from source at ${UCX_COMMIT} and install it under ${UCX_HOME}.
#
# Environment:
#   UCX_COMMIT  git ref (branch, tag or SHA) of openucx/ucx to check out
#   UCX_HOME    installation prefix passed to ./configure
#
# The temporary clone is removed once the install has finished.
function install_ucx() {
  set -ex
  # Fail early with a clear message; an empty ref would otherwise leave the
  # clone on its default branch and silently build that instead.
  : "${UCX_COMMIT:?UCX_COMMIT must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucx.git
  pushd ucx
  git checkout "${UCX_COMMIT}"
  # Re-sync submodules to the revisions recorded by the checked-out commit.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCX_HOME}" \
    --enable-mt \
    --enable-profiling \
    --enable-stats
  # Bound the parallelism: a bare `make -j` puts no limit on concurrent jobs.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucx
}
# Build UCC from source at ${UCC_COMMIT} and install it under ${UCC_HOME}.
#
# Environment:
#   UCC_COMMIT  git ref (branch, tag or SHA) of openucx/ucc to check out
#   UCC_HOME    installation prefix passed to ./configure
#   UCX_HOME    prefix of the UCX installation UCC is configured against
#
# NCCL support is explicitly disabled (--with-nccl=no). The temporary clone is
# removed once the install has finished.
function install_ucc() {
  set -ex
  # Fail early with a clear message; an empty ref would otherwise leave the
  # clone on its default branch and silently build that instead.
  : "${UCC_COMMIT:?UCC_COMMIT must be set}"
  : "${UCC_HOME:?UCC_HOME must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucc.git
  pushd ucc
  git checkout "${UCC_COMMIT}"
  # Re-sync submodules to the revisions recorded by the checked-out commit.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCC_HOME}" --with-ucx="${UCX_HOME}" --with-nccl=no
  # Bound the parallelism: a bare `make -j` puts no limit on concurrent jobs.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucc
}
# Order matters: UCC is configured with --with-ucx=$UCX_HOME, so UCX has to be
# built and installed before UCC.
install_ucx
install_ucc

View File

@ -62,6 +62,17 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh
ENV INSTALLED_VISION ${VISION}
# (optional) Install UCC
# install_ucc.sh builds UCX and UCC from source at the commits given by the
# build args below and installs them under UCX_HOME / UCC_HOME. The install
# step is skipped unless both UCX_COMMIT and UCC_COMMIT are non-empty.
ARG UCX_COMMIT
ARG UCC_COMMIT
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr
ENV UCC_HOME /usr
# COPY rather than ADD: this is a plain local file, so none of ADD's extra
# behavior is needed, and the neighboring install steps use COPY as well.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
COPY ./common/install_openssl.sh install_openssl.sh
ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh

View File

@ -58,6 +58,17 @@ RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# (optional) Install UCC
# install_ucc.sh builds UCX and UCC from source at the commits given by the
# build args below and installs them under UCX_HOME / UCC_HOME. The install
# step is skipped unless both UCX_COMMIT and UCC_COMMIT are non-empty.
ARG UCX_COMMIT
ARG UCC_COMMIT
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr
ENV UCC_HOME /usr
# COPY rather than ADD: this is a plain local file, so none of ADD's extra
# behavior is needed, and the neighboring install steps use COPY as well.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
# (optional) Install protobuf for ONNX
ARG PROTOBUF
COPY ./common/install_protobuf.sh install_protobuf.sh

View File

@ -45,6 +45,10 @@ fi
# CUDA 11 builds: enable the split torch_cuda build option in CMake. Every
# CUDA 11 environment other than cuda11.3 additionally exports USE_UCC and
# USE_SYSTEM_UCC.
case "$BUILD_ENVIRONMENT" in
  *cuda11*)
    export BUILD_SPLIT_CUDA=ON
    case "$BUILD_ENVIRONMENT" in
      *cuda11.3*)
        # No UCC flags for cuda11.3 environments.
        ;;
      *)
        export USE_UCC=1
        export USE_SYSTEM_UCC=1
        ;;
    esac
    ;;
esac
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* || ${BUILD_ENVIRONMENT} == *"onnx"* ]]; then

View File

@ -918,11 +918,6 @@ if(HAVE_SOVERSION)
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()
if(USE_UCC)
target_link_libraries(torch_cpu PRIVATE __caffe2_ucc)
target_compile_definitions(torch_cpu PRIVATE USE_UCC)
endif()
if(USE_ROCM)
filter_list(__caffe2_hip_srcs_cpp Caffe2_HIP_SRCS "\\.(cu|hip)$")
set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)

View File

@ -2,19 +2,14 @@ if(NOT __UCC_INCLUDED)
set(__UCC_INCLUDED TRUE)
if(USE_SYSTEM_UCC)
set(UCX_HOME $ENV{UCX_HOME} CACHE PATH "UCX install directory")
set(UCC_HOME $ENV{UCC_HOME} CACHE PATH "UCC install directory")
add_library(__caffe2_ucc INTERFACE)
target_include_directories(__caffe2_ucc INTERFACE ${UCX_HOME}/include/)
target_include_directories(__caffe2_ucc INTERFACE ${UCC_HOME}/include/)
target_link_libraries(__caffe2_ucc INTERFACE ${UCX_HOME}/lib/libucp.so)
target_link_libraries(__caffe2_ucc INTERFACE ${UCX_HOME}/lib/libucs.so)
target_link_libraries(__caffe2_ucc INTERFACE ${UCC_HOME}/lib/libucc.so)
# Locate the system UCC and UCX installations via find_package. REQUIRED makes
# configuration stop with an error if either package cannot be found.
find_package(UCC REQUIRED)
find_package(UCX REQUIRED)
# With REQUIRED above, this guard acts only as a defensive double-check.
if(UCC_FOUND AND UCX_FOUND)
# Interface-only target that carries the UCC/UCX link and include requirements.
add_library(__caffe2_ucc INTERFACE)
# ucx::ucs, ucx::ucp and ucc::ucc are presumably imported targets exported by
# the UCX/UCC packages -- TODO confirm against their package config files.
target_link_libraries(__caffe2_ucc INTERFACE ucx::ucs ucx::ucp ucc::ucc)
# NOTE(review): assumes the UCC package defines UCC_INCLUDE_DIRS -- confirm.
target_include_directories(__caffe2_ucc INTERFACE ${UCC_INCLUDE_DIRS})
endif()
else()
message(FATAL_ERROR "USE_SYSTEM_UCC=OFF is not supported yet when using UCC")
endif()
endif()

View File

@ -259,9 +259,6 @@ if(USE_DISTRIBUTED)
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
if(USE_UCC)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_ucc)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})

View File

@ -62,10 +62,10 @@ except ImportError:
try:
from torch._C._distributed_c10d import ProcessGroupUCC
ProcessGroupUCC.__module__ = "torch.distributed.distributed_c10d"
except ImportError:
_UCC_AVAILABLE = False
logger = logging.getLogger(__name__)
PG_WRAPPER_STORE_PREFIX = "pg_wrapper"