mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
UCC PG build in CI (#81583)
- Modifies the current CMake build definitions to use `find_package` to locate the UCX and UCC installed on the system.
- Installs UCX and UCC in the CUDA Docker images.
- Builds PyTorch with `USE_UCC=1` in the CI pipelines.
- Currently, unit tests are not run with the UCC PG. Those tests will be added in future PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81583
Approved by: https://github.com/vtlam, https://github.com/malfet
This commit is contained in:
committed by
PyTorch MergeBot
parent
b4f7e22640
commit
cda210e23b
@ -84,6 +84,8 @@ if [[ "$image" == *xenial* ]] || [[ "$image" == *bionic* ]]; then
|
||||
fi
|
||||
|
||||
TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/14.04/x86_64"
|
||||
UCX_COMMIT=v1.13.x
|
||||
UCC_COMMIT=a7bda274b10f8adf5bb729f01da064f4e735fb23
|
||||
|
||||
# It's annoying to rename jobs every time you want to rewrite a
|
||||
# configuration, so we hardcode everything here rather than do it
|
||||
@ -147,6 +149,8 @@ case "$image" in
|
||||
DB=yes
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${UCX_COMMIT}
|
||||
UCC_COMMIT=${UCC_COMMIT}
|
||||
;;
|
||||
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
|
||||
CUDA_VERSION=11.7.0
|
||||
@ -157,6 +161,8 @@ case "$image" in
|
||||
DB=yes
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${UCX_COMMIT}
|
||||
UCC_COMMIT=${UCC_COMMIT}
|
||||
;;
|
||||
pytorch-linux-xenial-py3-clang5-asan)
|
||||
ANACONDA_PYTHON_VERSION=3.7
|
||||
@ -277,6 +283,8 @@ case "$image" in
|
||||
PROTOBUF=yes
|
||||
DB=yes
|
||||
VISION=yes
|
||||
UCX_COMMIT=${UCX_COMMIT}
|
||||
UCC_COMMIT=${UCC_COMMIT}
|
||||
;;
|
||||
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.8
|
||||
@ -286,6 +294,8 @@ case "$image" in
|
||||
PROTOBUF=yes
|
||||
DB=yes
|
||||
VISION=yes
|
||||
UCX_COMMIT=${UCX_COMMIT}
|
||||
UCC_COMMIT=${UCC_COMMIT}
|
||||
;;
|
||||
*)
|
||||
# Catch-all for builds that are not hardcoded.
|
||||
@ -375,6 +385,8 @@ docker build \
|
||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \
|
||||
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
||||
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
||||
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
||||
-f $(dirname ${DOCKERFILE})/Dockerfile \
|
||||
-t "$tmp_tag" \
|
||||
"$@" \
|
||||
|
@ -67,7 +67,8 @@ install_ubuntu() {
|
||||
wget \
|
||||
sudo \
|
||||
vim \
|
||||
jq
|
||||
jq \
|
||||
libtool
|
||||
|
||||
# Should resolve issues related to various apt package repository cert issues
|
||||
# see: https://github.com/pytorch/pytorch/issues/65931
|
||||
|
41
.circleci/docker/common/install_ucc.sh
Executable file
41
.circleci/docker/common/install_ucc.sh
Executable file
@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Build UCX from source and install it.
#
# Environment:
#   UCX_COMMIT  git ref (branch, tag or SHA) of openucx/ucx to check out
#   UCX_HOME    install prefix passed to ./configure --prefix
#
# The repository is cloned into ./ucx (relative to the current directory)
# and the clone is removed once `make install` has finished.
function install_ucx() {
  set -ex

  # Fail fast with a clear message: with an empty ref, an unquoted
  # `git checkout` is a no-op and the default branch would be built silently.
  : "${UCX_COMMIT:?UCX_COMMIT must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucx.git
  pushd ucx
  git checkout "${UCX_COMMIT}"
  # Re-sync submodules to the versions pinned by the checked-out commit.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCX_HOME}" \
    --enable-mt \
    --enable-profiling \
    --enable-stats
  # A bare `make -j` places no limit on concurrent jobs and can exhaust
  # memory on the builder; cap it at the number of available CPUs.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucx
}
|
||||
|
||||
# Build UCC from source against an existing UCX install and install it.
#
# Environment:
#   UCC_COMMIT  git ref (branch, tag or SHA) of openucx/ucc to check out
#   UCC_HOME    install prefix passed to ./configure --prefix
#   UCX_HOME    prefix of the UCX installation, passed as --with-ucx
#
# NCCL support is explicitly disabled (--with-nccl=no). The repository is
# cloned into ./ucc (relative to the current directory) and the clone is
# removed once `make install` has finished.
function install_ucc() {
  set -ex

  # Fail fast with a clear message: with an empty ref, an unquoted
  # `git checkout` is a no-op and the default branch would be built silently.
  : "${UCC_COMMIT:?UCC_COMMIT must be set}"
  : "${UCC_HOME:?UCC_HOME must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucc.git
  pushd ucc
  git checkout "${UCC_COMMIT}"
  # Re-sync submodules to the versions pinned by the checked-out commit.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCC_HOME}" --with-ucx="${UCX_HOME}" --with-nccl=no
  # A bare `make -j` places no limit on concurrent jobs and can exhaust
  # memory on the builder; cap it at the number of available CPUs.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucc
}
|
||||
|
||||
install_ucx
|
||||
install_ucc
|
@ -62,6 +62,17 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
|
||||
RUN rm install_vision.sh
|
||||
ENV INSTALLED_VISION ${VISION}
|
||||
|
||||
# (optional) Install UCC
# UCX and UCC are built only when both commit refs are supplied as build
# args; otherwise the install script is copied in and removed unused.
ARG UCX_COMMIT
ARG UCC_COMMIT
# Re-export the build args as ENV so install_ucc.sh can read them.
ENV UCX_COMMIT=$UCX_COMMIT
ENV UCC_COMMIT=$UCC_COMMIT
# Install prefixes read by install_ucc.sh.
ENV UCX_HOME=/usr
ENV UCC_HOME=/usr
# COPY rather than ADD: this is a plain local file, so ADD's archive
# extraction and URL handling are not wanted.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
|
||||
|
||||
COPY ./common/install_openssl.sh install_openssl.sh
|
||||
ENV OPENSSL_ROOT_DIR /opt/openssl
|
||||
RUN bash ./install_openssl.sh
|
||||
|
@ -58,6 +58,17 @@ RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
|
||||
ENV DESIRED_CUDA ${CUDA_VERSION}
|
||||
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
|
||||
|
||||
# (optional) Install UCC
# UCX and UCC are built only when both commit refs are supplied as build
# args; otherwise the install script is copied in and removed unused.
ARG UCX_COMMIT
ARG UCC_COMMIT
# Re-export the build args as ENV so install_ucc.sh can read them.
ENV UCX_COMMIT=$UCX_COMMIT
ENV UCC_COMMIT=$UCC_COMMIT
# Install prefixes read by install_ucc.sh.
ENV UCX_HOME=/usr
ENV UCC_HOME=/usr
# COPY rather than ADD: this is a plain local file, so ADD's archive
# extraction and URL handling are not wanted.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
|
||||
|
||||
# (optional) Install protobuf for ONNX
|
||||
ARG PROTOBUF
|
||||
COPY ./common/install_protobuf.sh install_protobuf.sh
|
||||
|
@ -45,6 +45,10 @@ fi
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
|
||||
# enable split torch_cuda build option in CMake
|
||||
export BUILD_SPLIT_CUDA=ON
|
||||
if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* ]]; then
|
||||
export USE_UCC=1
|
||||
export USE_SYSTEM_UCC=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* || ${BUILD_ENVIRONMENT} == *"onnx"* ]]; then
|
||||
|
@ -918,11 +918,6 @@ if(HAVE_SOVERSION)
|
||||
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
|
||||
endif()
|
||||
|
||||
if(USE_UCC)
|
||||
target_link_libraries(torch_cpu PRIVATE __caffe2_ucc)
|
||||
target_compile_definitions(torch_cpu PRIVATE USE_UCC)
|
||||
endif()
|
||||
|
||||
if(USE_ROCM)
|
||||
filter_list(__caffe2_hip_srcs_cpp Caffe2_HIP_SRCS "\\.(cu|hip)$")
|
||||
set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
|
||||
|
19
cmake/External/ucc.cmake
vendored
19
cmake/External/ucc.cmake
vendored
@ -2,19 +2,14 @@ if(NOT __UCC_INCLUDED)
|
||||
set(__UCC_INCLUDED TRUE)
|
||||
|
||||
# Expose a system-installed UCC (and the UCX it depends on) through the
# __caffe2_ucc INTERFACE target. Only USE_SYSTEM_UCC=ON is supported.
if(USE_SYSTEM_UCC)
  # REQUIRED makes configuration stop here when either package is missing,
  # so no separate *_FOUND guard is needed below.
  find_package(UCC REQUIRED)
  find_package(UCX REQUIRED)

  # Declare the target exactly once; a second add_library() with the same
  # name is a configure-time error.
  add_library(__caffe2_ucc INTERFACE)
  # Link the imported targets from the UCX/UCC packages instead of
  # hard-coded ${UCX_HOME}/lib/*.so paths.
  target_link_libraries(__caffe2_ucc INTERFACE ucx::ucs ucx::ucp ucc::ucc)
  target_include_directories(__caffe2_ucc INTERFACE ${UCC_INCLUDE_DIRS})
else()
  message(FATAL_ERROR "USE_SYSTEM_UCC=OFF is not supported yet when using UCC")
endif()
|
||||
|
||||
endif()
|
||||
|
@ -259,9 +259,6 @@ if(USE_DISTRIBUTED)
|
||||
if(USE_NCCL)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
|
||||
endif()
|
||||
if(USE_UCC)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_ucc)
|
||||
endif()
|
||||
# Same for MPI.
|
||||
if(USE_MPI)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
|
||||
|
@ -62,10 +62,10 @@ except ImportError:
|
||||
|
||||
try:
|
||||
from torch._C._distributed_c10d import ProcessGroupUCC
|
||||
ProcessGroupUCC.__module__ = "torch.distributed.distributed_c10d"
|
||||
except ImportError:
|
||||
_UCC_AVAILABLE = False
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PG_WRAPPER_STORE_PREFIX = "pg_wrapper"
|
||||
|
Reference in New Issue
Block a user