mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Fix intermittent link errors in NCCL build (#84245)
Should fix #13362 and fix #83790.

I think I've discovered the root cause of the intermittent NCCL link failures. If we look at the variable name in the redefinition error:

```
_02021d91_11_sendrecv_cu_0bc7b9c8_11152
```

this is the name of the file being compiled plus some form of unique ID. As part of NCCL's build process, the same file is compiled multiple times with different macro definitions depending on which operator and dtype are being compiled, e.g.

```
nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -dc sendrecv.cu -o sendrecv_sum_i8.o
```

Since the filename parts are the same, if the unique IDs also happen to collide then the entire identifier collides and the link fails.

So the fix here is to generate a unique `.cu` file for each object file. I've implemented this as a `.patch` file that gets applied from our CMake code, but it would be cleaner if we instead forked NCCL.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84245
Approved by: https://github.com/janeyx99, https://github.com/malfet
This commit is contained in:
committed by
PyTorch MergeBot
parent
74d0c64708
commit
fa86874bbd
@ -318,6 +318,7 @@ exclude_patterns = [
|
||||
'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
|
||||
'test/cpp/jit/upgrader_models/*.ptl',
|
||||
'test/cpp/jit/upgrader_models/*.ptl.ff',
|
||||
'cmake/External/nccl.patch',
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
@ -347,6 +348,7 @@ exclude_patterns = [
|
||||
'test/cpp/jit/upgrader_models/*.ptl',
|
||||
'test/cpp/jit/upgrader_models/*.ptl.ff',
|
||||
'.lintrunner.toml',
|
||||
'cmake/External/nccl.patch',
|
||||
]
|
||||
command = [
|
||||
'python3',
|
||||
|
12
cmake/External/apply_nccl_patch.sh
vendored
Executable file
12
cmake/External/apply_nccl_patch.sh
vendored
Executable file
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This patch is required to fix intermittent link errors when building
|
||||
# NCCL. See https://github.com/pytorch/pytorch/issues/83790
|
||||
|
||||
TORCH_DIR=$1
|
||||
|
||||
# Only apply patch if "git status" is empty to avoid failing when the
|
||||
# patch has already been applied
|
||||
if [[ `git status --porcelain` == "" ]]; then
|
||||
git apply "${TORCH_DIR}/cmake/External/nccl.patch"
|
||||
fi
|
1
cmake/External/nccl.cmake
vendored
1
cmake/External/nccl.cmake
vendored
@ -50,6 +50,7 @@ if(NOT __NCCL_INCLUDED)
|
||||
"BUILDDIR=${__NCCL_BUILD_DIR}"
|
||||
"VERBOSE=0"
|
||||
BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
|
||||
PATCH_COMMAND "${CMAKE_CURRENT_LIST_DIR}/apply_nccl_patch.sh" "${PROJECT_SOURCE_DIR}"
|
||||
INSTALL_COMMAND ""
|
||||
)
|
||||
|
||||
|
46
cmake/External/nccl.patch
vendored
Normal file
46
cmake/External/nccl.patch
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
|
||||
index 04bce8e..a2498a0 100644
|
||||
--- a/src/collectives/device/Makefile
|
||||
+++ b/src/collectives/device/Makefile
|
||||
@@ -29,7 +29,7 @@ all: $(STATICLIB)
|
||||
all_deps: $(DEPENDFILES)
|
||||
|
||||
# Auto-generating the rules per op/reduction/datatype/algorithm
|
||||
-$(RULESFILE) :
|
||||
+$(RULESFILE) : gen_rules.sh
|
||||
@printf "Generating %-35s > %s\n" rules $@
|
||||
@mkdir -p $(OBJDIR)
|
||||
@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
|
||||
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
|
||||
index aaf3685..73359af 100755
|
||||
--- a/src/collectives/device/gen_rules.sh
|
||||
+++ b/src/collectives/device/gen_rules.sh
|
||||
@@ -13,6 +13,9 @@ then
|
||||
datatypes+=" bf16"
|
||||
fi
|
||||
|
||||
+echo "CURDIR := \$(dir \$(realpath \$(word \$(words \$(math\$(MAKEFILE_LIST))-1), \$(MAKEFILE_LIST))))"
|
||||
+echo ""
|
||||
+
|
||||
targets="GENOBJS := \\\\\n"
|
||||
|
||||
for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
@@ -21,10 +24,16 @@ for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
|
||||
dtn=0
|
||||
# Order must match that of the ncclDataType_t enum
|
||||
for dt in ${datatypes}; do
|
||||
- echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
|
||||
+ # Generate a unique filename for each compilation unit,
|
||||
+ # otherwise the __nv_module_id may conflict at link time
|
||||
+ echo "${dir}/${base}_${opn}_${dtn}.cu :"
|
||||
+ echo " echo \"#include \\\"\$(CURDIR)${base}.cu\\\"\" > \$@"
|
||||
+ echo ""
|
||||
+ # Compile the file
|
||||
+ echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${opn}_${dtn}.cu ${base}.cu ${dir}/${base}.dep"
|
||||
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
|
||||
echo " mkdir -p ${dir}"
|
||||
- echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
|
||||
+ echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
|
||||
echo ""
|
||||
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
|
||||
dtn=$(($dtn + 1))
|
Reference in New Issue
Block a user