Fix intermittent link errors in NCCL build (#84245)

Should fix #13362 and fix #83790

I think I've discovered the root cause of the intermittent nccl link
failures. If we look at the variable name in the redefinition error:
```
_02021d91_11_sendrecv_cu_0bc7b9c8_11152
```

this is the name of the file being compiled + some form of unique ID.
As part of NCCL's build process, the same file is compiled multiple
times with different macro definitions depending on which operator and
dtype are being compiled, e.g.
```
nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -dc sendrecv.cu -o sendrecv_sum_i8.o
```

Since the filename parts are the same, if the unique IDs also happen
to collide, the entire identifier collides and the link fails. So the
fix here is to generate a unique `.cu` file for each object file. I've
implemented this as a `.patch` file that gets applied from our cmake
code, although forking nccl would be a cleaner long-term solution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84245
Approved by: https://github.com/janeyx99, https://github.com/malfet
This commit is contained in:
Peter Bell
2022-09-13 17:13:37 +00:00
committed by PyTorch MergeBot
parent 74d0c64708
commit fa86874bbd
4 changed files with 61 additions and 0 deletions

View File

@ -318,6 +318,7 @@ exclude_patterns = [
'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
'test/cpp/jit/upgrader_models/*.ptl',
'test/cpp/jit/upgrader_models/*.ptl.ff',
'cmake/External/nccl.patch',
]
command = [
'python3',
@ -347,6 +348,7 @@ exclude_patterns = [
'test/cpp/jit/upgrader_models/*.ptl',
'test/cpp/jit/upgrader_models/*.ptl.ff',
'.lintrunner.toml',
'cmake/External/nccl.patch',
]
command = [
'python3',

12
cmake/External/apply_nccl_patch.sh vendored Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
# Apply PyTorch's local NCCL patch to the git checkout in the current
# working directory (expected to be the NCCL source tree).
#
# This patch is required to fix intermittent link errors when building
# NCCL. See https://github.com/pytorch/pytorch/issues/83790
#
# Usage: apply_nccl_patch.sh <torch-source-dir>
#   <torch-source-dir>  root of the PyTorch checkout; the patch is read from
#                       <torch-source-dir>/cmake/External/nccl.patch

# Abort with a usage message when the argument is missing or empty instead
# of silently building the bogus path "/cmake/External/nccl.patch".
TORCH_DIR="${1:?usage: $0 <torch-source-dir>}"

# Only apply patch if "git status" is empty to avoid failing when the
# patch has already been applied
if [[ -z "$(git status --porcelain)" ]]; then
    git apply "${TORCH_DIR}/cmake/External/nccl.patch"
fi

View File

@ -50,6 +50,7 @@ if(NOT __NCCL_INCLUDED)
"BUILDDIR=${__NCCL_BUILD_DIR}"
"VERBOSE=0"
BUILD_BYPRODUCTS "${__NCCL_BUILD_DIR}/lib/libnccl_static.a"
PATCH_COMMAND "${CMAKE_CURRENT_LIST_DIR}/apply_nccl_patch.sh" "${PROJECT_SOURCE_DIR}"
INSTALL_COMMAND ""
)

46
cmake/External/nccl.patch vendored Normal file
View File

@ -0,0 +1,46 @@
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 04bce8e..a2498a0 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -29,7 +29,7 @@ all: $(STATICLIB)
all_deps: $(DEPENDFILES)
# Auto-generating the rules per op/reduction/datatype/algorithm
-$(RULESFILE) :
+$(RULESFILE) : gen_rules.sh
@printf "Generating %-35s > %s\n" rules $@
@mkdir -p $(OBJDIR)
@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index aaf3685..73359af 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -13,6 +13,9 @@ then
datatypes+=" bf16"
fi
+echo "CURDIR := \$(dir \$(realpath \$(word \$(words \$(math\$(MAKEFILE_LIST))-1), \$(MAKEFILE_LIST))))"
+echo ""
+
targets="GENOBJS := \\\\\n"
for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
@@ -21,10 +24,16 @@ for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
dtn=0
# Order must match that of the ncclDataType_t enum
for dt in ${datatypes}; do
- echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
+ # Generate a unique filename for each compilation unit,
+ # otherwise the __nv_module_id may conflict at link time
+ echo "${dir}/${base}_${opn}_${dtn}.cu :"
+ echo " echo \"#include \\\"\$(CURDIR)${base}.cu\\\"\" > \$@"
+ echo ""
+ # Compile the file
+ echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${opn}_${dtn}.cu ${base}.cu ${dir}/${base}.dep"
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
echo " mkdir -p ${dir}"
- echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
+ echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
echo ""
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
dtn=$(($dtn + 1))