NCCL: Re-enable parallel builds (#83696)

Since #83173 was merged, I have noticed some CI jobs being slowed down by
the nccl build step. For example, if there are no C++ changes, sccache
compiles everything else very quickly and nccl becomes the limiting
factor.

This re-enables parallel builds with some safeguards to protect
against oversubscription. When `make` is the parent build system, we
can use `$(MAKE)` and the `make` jobserver will coordinate job
allocation with the sub-process. For other build systems, this calls
`make` with the `-l` flag, which should prevent it from launching jobs when
the system load average is already too high.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83696
Approved by: https://github.com/malfet
This commit is contained in:
Peter Bell
2022-08-25 00:57:57 +01:00
committed by PyTorch MergeBot
parent d5af2a70ba
commit 2000eba454

View File

@ -15,21 +15,37 @@ if(NOT __NCCL_INCLUDED)
# this second replacement is needed when there are multiple archs:
# every ";-gencode" inside NVCC_GENCODE is rewritten to " -gencode", so the
# additional -gencode flags end up separated by spaces instead of ";"
string(REPLACE ";-gencode" " -gencode" NVCC_GENCODE "${NVCC_GENCODE}")
# Select the command used to drive NCCL's Makefile-based build and store it,
# as a list, in MAKE_COMMAND.
#
#  * Makefile generators: "$(MAKE)" is expanded by the parent make, so the
#    nested build shares the parent's jobserver for parallelism.
#  * Any other generator: plain `make` with an explicit job count (-j) and a
#    load-average ceiling (-l), both set to MAX_JOBS.
#
# MAX_JOBS comes from the MAX_JOBS environment variable when that holds a
# positive integer; otherwise it is derived from the processor count and is
# never allowed to drop below 1.
if("${CMAKE_GENERATOR}" MATCHES "Make")
  # Recursive make with jobserver for parallelism
  set(MAKE_COMMAND "$(MAKE)")
else()
  if("$ENV{MAX_JOBS}" MATCHES "^0*[1-9][0-9]*$")
    set(MAX_JOBS "$ENV{MAX_JOBS}")
  else()
    if(DEFINED ENV{MAX_JOBS})
      # An empty or non-numeric value would turn into a bare "-j"/"-l"
      # (unlimited jobs, no load limit) and "0" is rejected by make outright.
      message(WARNING
        "Ignoring MAX_JOBS='$ENV{MAX_JOBS}' for the NCCL build: "
        "expected a positive integer")
    endif()
    include(ProcessorCount)
    ProcessorCount(NUM_HARDWARE_THREADS)
    # Assume 2 hardware threads per cpu core
    math(EXPR MAX_JOBS "${NUM_HARDWARE_THREADS} / 2")
    # ProcessorCount reports 0 when it cannot determine the count, and the
    # integer division maps 1 to 0 as well; make needs a job count >= 1.
    if(MAX_JOBS LESS 1)
      set(MAX_JOBS 1)
    endif()
  endif()
  # Parallel build with CPU load limit to avoid oversubscription
  set(MAKE_COMMAND "make" "-j${MAX_JOBS}" "-l${MAX_JOBS}")
endif()
# Build NCCL from the bundled third_party sources using NCCL's own Makefile.
# The build runs inside the source directory (BUILD_IN_SOURCE), while the
# Makefile's BUILDDIR variable points at __NCCL_BUILD_DIR. There is no
# configure step and no install step.
#
# MAKE_COMMAND already carries the make executable and its parallelism flags,
# so it must be the only make invocation on the command line: an extra literal
# `make` in front of it would cause the following word to be treated as a
# target to build.
set(__NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
ExternalProject_Add(nccl_external
  SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl/nccl
  BUILD_IN_SOURCE 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND
    env
    ${MAKE_COMMAND}
    "CXX=${CMAKE_CXX_COMPILER}"
    "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}"
    "NVCC=${CUDA_NVCC_EXECUTABLE}"
    "NVCC_GENCODE=${NVCC_GENCODE}"
    "BUILDDIR=${__NCCL_BUILD_DIR}"
    "VERBOSE=0"
  BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
  INSTALL_COMMAND ""
)