Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Enable ROCm multi-gpu with Gloo
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18640

Differential Revision: D15185822

Pulled By: bddppq

fbshipit-source-id: 1b49ab3fb0f251cfc7ef3ddd62033ae0065a4ec3
Committed by: Facebook Github Bot
Parent: cf55670bdd
Commit: bc5398451e
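The hunks below are the full change. As a rough illustration of what it enables (not part of this commit), a minimal torch.distributed sketch that exercises the Gloo backend across multiple ROCm GPUs could look like the following; the launcher, environment variables, and tensor shapes are placeholder choices:

# Hypothetical usage sketch, not from this PR: one process per GPU, with
# MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE set in the environment
# (for example by torch.distributed.launch).
import os
import torch
import torch.distributed as dist

def main():
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)

    # On a ROCm build, torch.cuda addresses HIP devices.
    device = torch.device("cuda", rank % torch.cuda.device_count())
    t = torch.ones(4, device=device) * (rank + 1)

    # Allreduce across all ranks; with this change the Gloo backend can be
    # built for ROCm GPU tensors as well as CUDA ones.
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    print("rank", rank, t.cpu().tolist())

    dist.destroy_process_group()

if __name__ == "__main__":
    main()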
@@ -24,5 +24,7 @@ set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
 set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
 
-# HIP source
+# HIP sources, include, test sources
 set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
+set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
+set(Caffe2_HIP_INCLUDE ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
@@ -10,13 +10,24 @@ if(USE_GLOO)
     "${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
     "${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
   )
 
-  set(Caffe2_CONTRIB_GLOO_GPU_SRC
-    "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
-    "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
-    "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
-  )
-
   set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
-  set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
+
+  if(USE_CUDA)
+    set(Caffe2_CONTRIB_GLOO_GPU_SRC
+      "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
+      "${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
+      "${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
+    )
+    set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
+  endif(USE_CUDA)
+
+  if(USE_ROCM)
+    set(Caffe2_CONTRIB_GLOO_HIP_SRC
+      "${CMAKE_CURRENT_SOURCE_DIR}/hip/allreduce_ops_gpu.cc"
+      "${CMAKE_CURRENT_SOURCE_DIR}/hip/broadcast_ops_gpu.cc"
+      "${CMAKE_CURRENT_SOURCE_DIR}/hip/common_world_ops_gpu.cc"
+    )
+    set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_GLOO_HIP_SRC} PARENT_SCOPE)
+    set(Caffe2_HIP_INCLUDE ${GLOO_HIP_INCLUDE} ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
+  endif(USE_ROCM)
 endif()
@@ -1,4 +1,4 @@
-#include "allreduce_ops.h"
+#include "caffe2/contrib/gloo/allreduce_ops.h"
 
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/logging.h"
@@ -1,4 +1,4 @@
-#include "broadcast_ops.h"
+#include "caffe2/contrib/gloo/broadcast_ops.h"
 
 #include "caffe2/core/context_gpu.h"
 
@@ -17,6 +17,7 @@ void CompareDivMod(int32_t v, int32_t divisor) {
   int fixed_q = fixed.Div(v);
   int fixed_r = fixed.Mod(v);
 
+#ifndef __HIP_PLATFORM_HCC__
   EXPECT_EQ(native_q, fixed_q)
       << v << " / " << divisor << " magic " << fixed.magic() << " shift "
       << fixed.shift() << " quot " << fixed_q << " " << native_q;
@@ -24,6 +25,7 @@ void CompareDivMod(int32_t v, int32_t divisor) {
   EXPECT_EQ(native_r, fixed_r)
       << v << " / " << divisor << " magic " << fixed.magic() << " shift "
       << fixed.shift() << " rem " << fixed_r << " " << native_r;
+#endif
 }
 
 } // namespace
@@ -950,6 +950,8 @@ if(USE_GLOO)
     list(APPEND Caffe2_DEPENDENCY_LIBS gloo)
     if(USE_CUDA)
       list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS gloo_cuda)
+    elseif(USE_ROCM)
+      list(APPEND Caffe2_HIP_DEPENDENCY_LIBS gloo_hip)
     endif()
     add_compile_options(-DCAFFE2_USE_GLOO)
   endif()
Submodule third_party/gloo updated: 670b4d4aa4...1a0f0cd72b
@@ -69,6 +69,7 @@ includes = [
     "caffe2/core/*",
     "caffe2/db/*",
     "caffe2/utils/*",
+    "caffe2/contrib/gloo/*",
     "c10/cuda/*",
     "c10/cuda/test/CMakeLists.txt",
     "modules/*",
@@ -126,7 +127,7 @@ if not args.out_of_place_only:
     paths = ("torch", "tools")
     for root, _directories, files in chain.from_iterable(os.walk(path) for path in paths):
         for filename in files:
-            if filename.endswith(".cpp") or filename.endswith(".h"):
+            if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"):
                 source = os.path.join(root, filename)
                 # Disabled files
                 if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
@@ -2258,6 +2258,15 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict([
     ("c10/cuda/CUDAGuard.h", ("ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h", API_PYTORCH)),
     ("c10/cuda/CUDACachingAllocator.h", ("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH)),
     ("c10/cuda/CUDAStream.h", ("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH)),
+    ("gloo/cuda.h", ("gloo/hip.h", API_PYTORCH)),
+    ("gloo/cuda_allreduce_halving_doubling.h", ("gloo/hip_allreduce_halving_doubling.h", API_PYTORCH)),
+    ("gloo/cuda_allreduce_halving_doubling_pipelined.h", ("gloo/hip_allreduce_halving_doubling_pipelined.h", API_PYTORCH)),
+    ("gloo/cuda_allreduce_ring.h", ("gloo/hip_allreduce_ring.h", API_PYTORCH)),
+    ("gloo/cuda_broadcast_one_to_all.h", ("gloo/hip_broadcast_one_to_all.h", API_PYTORCH)),
+    ("gloo::CudaAllreduceHalvingDoublingPipelined", ("gloo::HipAllreduceHalvingDoublingPipelined", API_PYTORCH)),
+    ("gloo::CudaBroadcastOneToAll", ("gloo::HipBroadcastOneToAll", API_PYTORCH)),
+    ("gloo::CudaHostWorkspace", ("gloo::HipHostWorkspace", API_PYTORCH)),
+    ("gloo::CudaDeviceWorkspace", ("gloo::HipDeviceWorkspace", API_PYTORCH)),
 ])
 
 CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict([
@@ -2310,6 +2319,7 @@ CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict([
     ("cuda::CUDAStreamGuard", ("hip::HIPStreamGuard", API_CAFFE2)),
     ("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuard", API_CAFFE2)),
     ("c10/cuda/CUDAGuard.h", ("c10/hip/HIPGuard.h", API_CAFFE2)),
+    ("gloo/cuda", ("gloo/hip", API_CAFFE2)),
 ])
 
 # We must tread very carefully here. Blanket conversions like are done
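The mapping tables above are consumed by the hipify pass as plain CUDA-to-HIP string substitutions. As an illustration only (EXAMPLE_MAPPINGS and apply_mappings are hypothetical names, and the real script also handles file selection, exclusions, and API annotations), a stripped-down sketch of applying such a table could look like this:

import collections

# Hypothetical, trimmed-down table in the spirit of the Gloo entries added above.
EXAMPLE_MAPPINGS = collections.OrderedDict([
    ("gloo/cuda.h", "gloo/hip.h"),
    ("gloo::CudaHostWorkspace", "gloo::HipHostWorkspace"),
    ("gloo::CudaDeviceWorkspace", "gloo::HipDeviceWorkspace"),
])

def apply_mappings(source_text, mappings=EXAMPLE_MAPPINGS):
    # Naive ordered substring replacement; hipify proper is considerably smarter.
    for cuda_token, hip_token in mappings.items():
        source_text = source_text.replace(cuda_token, hip_token)
    return source_text

snippet = '#include "gloo/cuda.h"\ntypedef gloo::CudaHostWorkspace<float> W;\n'
print(apply_mappings(snippet))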
@@ -5,7 +5,7 @@ import glob
 from .env import IS_CONDA, IS_WINDOWS, CONDA_DIR, check_env_flag, check_negative_env_flag, gather_paths
 
 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS
 USE_GLOO_IBVERBS = False
 
 IB_DEVINFO_CMD = "ibv_devinfo"
@@ -694,7 +694,7 @@ if (BUILD_PYTHON)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp)
       list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
       list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-      if (USE_CUDA)
+      if (USE_CUDA OR USE_ROCM)
         list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
       endif()
     endif()
@@ -44,9 +44,7 @@ FIND_PACKAGE(MPI)
 
 INCLUDE_DIRECTORIES(${CAFFE2_INCLUDE_DIR})
 
-IF(NOT USE_CUDA)
-  MESSAGE(STATUS "ignoring CUDA")
-ELSE()
+IF(USE_CUDA)
   FIND_PACKAGE(CUDA 7.5)
   IF(CUDA_FOUND)
     INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
@@ -54,6 +52,14 @@ ELSE()
 
     ADD_DEFINITIONS(-DUSE_CUDA=1)
   ENDIF()
+ELSEIF(USE_ROCM)
+  INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDE})
+  INCLUDE_DIRECTORIES(${GLOO_HIP_INCLUDE})
+  ADD_DEFINITIONS(-DUSE_ROCM=1)
+  ADD_DEFINITIONS(-D__HIP_PLATFORM_HCC__=1)
+  ADD_DEFINITIONS(-DHIP_VERSION=${HIP_VERSION_MAJOR})
+ELSE()
+  MESSAGE(STATUS "ignoring GPU")
 ENDIF()
 
 IF(MPI_FOUND)
@@ -61,7 +67,7 @@ IF(MPI_FOUND)
   MESSAGE(STATUS "MPI_LIBRARIES: ${MPI_LIBRARIES}")
 ENDIF()
 
-IF(USE_GLOO AND USE_CUDA)
+IF(USE_GLOO AND (USE_CUDA OR USE_ROCM))
   ADD_DEFINITIONS(-DWITH_GLOO=1)
   IF(USE_GLOO_IBVERBS)
     MESSAGE(STATUS "Building the gloo backend with both TCP and infiniband support")
@@ -89,7 +95,7 @@ IF(NOT MPI_FOUND)
   LIST(REMOVE_ITEM test_cpp "${CMAKE_CURRENT_SOURCE_DIR}/test/data_channel_mpi_smoke.cpp")
 ENDIF()
 
-IF(NOT USE_GLOO OR NOT USE_CUDA)
+IF(NOT (USE_GLOO AND (USE_CUDA OR USE_ROCM)))
   LIST(REMOVE_ITEM base_cpp "${CMAKE_CURRENT_SOURCE_DIR}/base/data_channels/DataChannelGloo.cpp")
   LIST(REMOVE_ITEM base_cpp "${CMAKE_CURRENT_SOURCE_DIR}/base/data_channels/Store.cpp")
   LIST(REMOVE_ITEM test_cpp "${CMAKE_CURRENT_SOURCE_DIR}/test/data_channel_gloo_store.cpp")
@@ -135,8 +141,13 @@ IF(MPI_FOUND)
 ENDIF()
 
-# TODO we shouldn't need the USE_CUDA condition here. See https://github.com/pytorch/pytorch/issues/13101
-IF(USE_GLOO AND USE_CUDA)
+IF(USE_GLOO)
   ADD_DEPENDENCIES(THD gloo)
+  IF(USE_CUDA)
     ADD_DEPENDENCIES(THD gloo_cuda)
+  ELSEIF(USE_ROCM)
+    ADD_DEPENDENCIES(THD gloo_hip)
+  ENDIF()
 ENDIF()
 
 IF(USE_NCCL)
@@ -16,6 +16,13 @@ if(USE_CUDA)
     set(C10D_USE_CUDA false)
     message(STATUS "CUDA not found, building C10D without CUDA support")
   endif()
+elseif(USE_ROCM)
+  add_definitions(-DUSE_ROCM=1)
+  add_definitions(-D__HIP_PLATFORM_HCC__=1)
+  add_definitions(-DHIP_VERSION=${HIP_VERSION_MAJOR})
+
+  set(C10D_USE_CUDA false)
+  set(C10D_USE_ROCM true)
 else()
   set(C10D_USE_CUDA false)
   message(STATUS "Building C10D without CUDA support")
@@ -55,6 +62,10 @@ if(C10D_USE_CUDA)
   set(C10D_LIBS
     caffe2_gpu
   )
+elseif(C10D_USE_ROCM)
+  set(C10D_LIBS
+    caffe2_hip
+  )
 else()
   set(C10D_LIBS
     caffe2