Enable ROCm multi-gpu with Gloo

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18640

Differential Revision: D15185822

Pulled By: bddppq

fbshipit-source-id: 1b49ab3fb0f251cfc7ef3ddd62033ae0065a4ec3
Authored by: Junjie Bai, 2019-05-07 09:49:38 -07:00
Committed by: Facebook Github Bot
Parent: cf55670bdd
Commit: bc5398451e
13 changed files with 71 additions and 21 deletions
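
Taken together, these changes build the HIP variants of the Gloo ops and link them into Caffe2, THD, and c10d, so the gloo backend becomes usable for multi-GPU collectives on ROCm builds. A rough usage sketch of what that enables (hypothetical script, not part of this commit; it assumes a ROCm build with distributed support and only uses the public torch.distributed API):

    # allreduce_rocm_gloo.py -- hypothetical usage sketch, not part of this commit.
    # Assumes a ROCm build with USE_DISTRIBUTED/USE_GLOO on, one process per GPU,
    # and RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT provided by the launcher.
    import os
    import torch
    import torch.distributed as dist

    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)

    # ROCm builds expose HIP devices through the torch.cuda API.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    t = torch.full((1024,), float(rank), device="cuda")

    # With this commit the GPU collective path goes through the hipified Gloo ops
    # instead of being compiled out on ROCm (assuming this build accepts GPU tensors).
    dist.all_reduce(t)
    print("rank", rank, "sum =", t[0].item())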


@@ -24,5 +24,7 @@ set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
# HIP source
# HIP sources, include, test sources
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)


@@ -10,13 +10,24 @@ if(USE_GLOO)
"${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
)
set(Caffe2_CONTRIB_GLOO_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
if(USE_CUDA)
set(Caffe2_CONTRIB_GLOO_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
endif(USE_CUDA)
if(USE_ROCM)
set(Caffe2_CONTRIB_GLOO_HIP_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/hip/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/common_world_ops_gpu.cc"
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_GLOO_HIP_SRC} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${GLOO_HIP_INCLUDE} ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
endif(USE_ROCM)
endif()


@@ -1,4 +1,4 @@
#include "allreduce_ops.h"
#include "caffe2/contrib/gloo/allreduce_ops.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"


@@ -1,4 +1,4 @@
#include "broadcast_ops.h"
#include "caffe2/contrib/gloo/broadcast_ops.h"
#include "caffe2/core/context_gpu.h"


@@ -17,6 +17,7 @@ void CompareDivMod(int32_t v, int32_t divisor) {
int fixed_q = fixed.Div(v);
int fixed_r = fixed.Mod(v);
#ifndef __HIP_PLATFORM_HCC__
EXPECT_EQ(native_q, fixed_q)
<< v << " / " << divisor << " magic " << fixed.magic() << " shift "
<< fixed.shift() << " quot " << fixed_q << " " << native_q;
@@ -24,6 +25,7 @@ void CompareDivMod(int32_t v, int32_t divisor) {
EXPECT_EQ(native_r, fixed_r)
<< v << " / " << divisor << " magic " << fixed.magic() << " shift "
<< fixed.shift() << " rem " << fixed_r << " " << native_r;
#endif
}
} // namespace


@@ -950,6 +950,8 @@ if(USE_GLOO)
list(APPEND Caffe2_DEPENDENCY_LIBS gloo)
if(USE_CUDA)
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS gloo_cuda)
elseif(USE_ROCM)
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS gloo_hip)
endif()
add_compile_options(-DCAFFE2_USE_GLOO)
endif()


@@ -69,6 +69,7 @@ includes = [
"caffe2/core/*",
"caffe2/db/*",
"caffe2/utils/*",
"caffe2/contrib/gloo/*",
"c10/cuda/*",
"c10/cuda/test/CMakeLists.txt",
"modules/*",
@@ -126,7 +127,7 @@ if not args.out_of_place_only:
paths = ("torch", "tools")
for root, _directories, files in chain.from_iterable(os.walk(path) for path in paths):
for filename in files:
if filename.endswith(".cpp") or filename.endswith(".h"):
if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"):
source = os.path.join(root, filename)
# Disabled files
if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
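
The two hunks above widen what the hipify script looks at: caffe2/contrib/gloo/* joins the include globs, and the in-place walk over torch/ and tools/ now also picks up .hpp headers. A small illustration of the selection behavior (assumption: the include entries behave like shell-style globs; this is not the hipify implementation itself):

    # Illustration only -- not the hipify_python implementation.
    from fnmatch import fnmatch

    # New include pattern: the Gloo contrib ops are now hipified as well.
    print(fnmatch("caffe2/contrib/gloo/allreduce_ops_gpu.cc", "caffe2/contrib/gloo/*"))  # True

    # In-place pass: .hpp is now accepted alongside .cpp and .h.
    for filename in ["ddp.cpp", "CUDAStream.hpp", "README.md"]:
        keep = filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp")
        print(filename, keep)  # ddp.cpp True, CUDAStream.hpp True, README.md False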


@@ -2258,6 +2258,15 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict([
("c10/cuda/CUDAGuard.h", ("ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h", API_PYTORCH)),
("c10/cuda/CUDACachingAllocator.h", ("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH)),
("c10/cuda/CUDAStream.h", ("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH)),
("gloo/cuda.h", ("gloo/hip.h", API_PYTORCH)),
("gloo/cuda_allreduce_halving_doubling.h", ("gloo/hip_allreduce_halving_doubling.h", API_PYTORCH)),
("gloo/cuda_allreduce_halving_doubling_pipelined.h", ("gloo/hip_allreduce_halving_doubling_pipelined.h", API_PYTORCH)),
("gloo/cuda_allreduce_ring.h", ("gloo/hip_allreduce_ring.h", API_PYTORCH)),
("gloo/cuda_broadcast_one_to_all.h", ("gloo/hip_broadcast_one_to_all.h", API_PYTORCH)),
("gloo::CudaAllreduceHalvingDoublingPipelined", ("gloo::HipAllreduceHalvingDoublingPipelined", API_PYTORCH)),
("gloo::CudaBroadcastOneToAll", ("gloo::HipBroadcastOneToAll", API_PYTORCH)),
("gloo::CudaHostWorkspace", ("gloo::HipHostWorkspace", API_PYTORCH)),
("gloo::CudaDeviceWorkspace", ("gloo::HipDeviceWorkspace", API_PYTORCH)),
])
CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict([
@@ -2310,6 +2319,7 @@ CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict([
("cuda::CUDAStreamGuard", ("hip::HIPStreamGuard", API_CAFFE2)),
("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuard", API_CAFFE2)),
("c10/cuda/CUDAGuard.h", ("c10/hip/HIPGuard.h", API_CAFFE2)),
("gloo/cuda", ("gloo/hip", API_CAFFE2)),
])
# We must tread very carefully here. Blanket conversions like are done
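
The new PYTORCH_SPECIFIC_MAPPINGS and CAFFE2_SPECIFIC_MAPPINGS entries above teach hipify to rewrite Gloo's CUDA headers and class names to their HIP counterparts. Conceptually they are applied as text substitutions over each translated source file; the toy sketch below only demonstrates the intended rewrites and is not the real hipify driver (which also handles ordering, word boundaries, and API classification):

    # Toy sketch of the effect of the mappings added above; not hipify itself.
    mappings = {
        "gloo/cuda_allreduce_ring.h": "gloo/hip_allreduce_ring.h",
        "gloo::CudaBroadcastOneToAll": "gloo::HipBroadcastOneToAll",
        "gloo::CudaHostWorkspace": "gloo::HipHostWorkspace",
    }

    src = '#include "gloo/cuda_allreduce_ring.h"  // uses gloo::CudaHostWorkspace'
    for cuda_name, hip_name in mappings.items():
        src = src.replace(cuda_name, hip_name)
    print(src)  # -> #include "gloo/hip_allreduce_ring.h"  // uses gloo::HipHostWorkspace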


@@ -5,7 +5,7 @@ import glob
from .env import IS_CONDA, IS_WINDOWS, CONDA_DIR, check_env_flag, check_negative_env_flag, gather_paths
# On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS
USE_GLOO_IBVERBS = False
IB_DEVINFO_CMD = "ibv_devinfo"
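
With the ROCm exclusion dropped, USE_DISTRIBUTED is decided only by the USE_DISTRIBUTED environment flag and the platform, so ROCm builds get the distributed stack (and with it the Gloo backend) by default. A hedged sketch of that gating logic, assuming the env helpers compare against the usual truthy/falsy strings (their real definitions live in tools/setup_helpers/env.py and may differ in detail):

    # Sketch of the gating only; see tools/setup_helpers/env.py for the real helpers.
    import os, sys

    def check_env_flag(name, default=""):
        # Assumed behavior: flag explicitly set to a truthy value.
        return os.getenv(name, default).upper() in ("1", "ON", "YES", "TRUE", "Y")

    def check_negative_env_flag(name, default=""):
        # Assumed behavior: flag explicitly set to a falsy value.
        return os.getenv(name, default).upper() in ("0", "OFF", "NO", "FALSE", "N")

    IS_WINDOWS = sys.platform == "win32"
    # Before this commit, "... and not check_env_flag('USE_ROCM')" forced this off on ROCm.
    USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS
    print("USE_DISTRIBUTED =", USE_DISTRIBUTED)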


@@ -694,7 +694,7 @@ if (BUILD_PYTHON)
list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
if (USE_CUDA)
if (USE_CUDA OR USE_ROCM)
list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
endif()
endif()


@@ -44,9 +44,7 @@ FIND_PACKAGE(MPI)
INCLUDE_DIRECTORIES(${CAFFE2_INCLUDE_DIR})
IF(NOT USE_CUDA)
MESSAGE(STATUS "ignoring CUDA")
ELSE()
IF(USE_CUDA)
FIND_PACKAGE(CUDA 7.5)
IF(CUDA_FOUND)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
@@ -54,6 +52,14 @@ ELSE()
ADD_DEFINITIONS(-DUSE_CUDA=1)
ENDIF()
ELSEIF(USE_ROCM)
INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDE})
INCLUDE_DIRECTORIES(${GLOO_HIP_INCLUDE})
ADD_DEFINITIONS(-DUSE_ROCM=1)
ADD_DEFINITIONS(-D__HIP_PLATFORM_HCC__=1)
ADD_DEFINITIONS(-DHIP_VERSION=${HIP_VERSION_MAJOR})
ELSE()
MESSAGE(STATUS "ignoring GPU")
ENDIF()
IF(MPI_FOUND)
@@ -61,7 +67,7 @@ IF(MPI_FOUND)
MESSAGE(STATUS "MPI_LIBRARIES: ${MPI_LIBRARIES}")
ENDIF()
IF(USE_GLOO AND USE_CUDA)
IF(USE_GLOO AND (USE_CUDA OR USE_ROCM))
ADD_DEFINITIONS(-DWITH_GLOO=1)
IF(USE_GLOO_IBVERBS)
MESSAGE(STATUS "Building the gloo backend with both TCP and infiniband support")
@@ -89,7 +95,7 @@ IF(NOT MPI_FOUND)
LIST(REMOVE_ITEM test_cpp "${CMAKE_CURRENT_SOURCE_DIR}/test/data_channel_mpi_smoke.cpp")
ENDIF()
IF(NOT USE_GLOO OR NOT USE_CUDA)
IF(NOT (USE_GLOO AND (USE_CUDA OR USE_ROCM)))
LIST(REMOVE_ITEM base_cpp "${CMAKE_CURRENT_SOURCE_DIR}/base/data_channels/DataChannelGloo.cpp")
LIST(REMOVE_ITEM base_cpp "${CMAKE_CURRENT_SOURCE_DIR}/base/data_channels/Store.cpp")
LIST(REMOVE_ITEM test_cpp "${CMAKE_CURRENT_SOURCE_DIR}/test/data_channel_gloo_store.cpp")
@@ -135,8 +141,13 @@ IF(MPI_FOUND)
ENDIF()
# TODO we shouldn't need the USE_CUDA condition here. See https://github.com/pytorch/pytorch/issues/13101
IF(USE_GLOO AND USE_CUDA)
IF(USE_GLOO)
ADD_DEPENDENCIES(THD gloo)
IF(USE_CUDA)
ADD_DEPENDENCIES(THD gloo_cuda)
ELSEIF(USE_ROCM)
ADD_DEPENDENCIES(THD gloo_hip)
ENDIF()
ENDIF()
IF(USE_NCCL)


@@ -16,6 +16,13 @@ if(USE_CUDA)
set(C10D_USE_CUDA false)
message(STATUS "CUDA not found, building C10D without CUDA support")
endif()
elseif(USE_ROCM)
add_definitions(-DUSE_ROCM=1)
add_definitions(-D__HIP_PLATFORM_HCC__=1)
add_definitions(-DHIP_VERSION=${HIP_VERSION_MAJOR})
set(C10D_USE_CUDA false)
set(C10D_USE_ROCM true)
else()
set(C10D_USE_CUDA false)
message(STATUS "Building C10D without CUDA support")
@@ -55,6 +62,10 @@ if(C10D_USE_CUDA)
set(C10D_LIBS
caffe2_gpu
)
elseif(C10D_USE_ROCM)
set(C10D_LIBS
caffe2_hip
)
else()
set(C10D_LIBS
caffe2