mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Fixes some `d_qk` != `d_v` cases on Hopper that are broken by cuDNN 9.11-9.12 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162268 Approved by: https://github.com/drisspg, https://github.com/Skylion007
186 lines
4.8 KiB
Bash
186 lines
4.8 KiB
Bash
#!/bin/bash

# Abort on the first failing command (-e) and trace every command (-x).
set -ex

# arch_path is the architecture component used in the download URLs and
# archive names below: 'x86_64' when the target is amd64/x86_64, 'sbsa' for
# anything else. TARGETARCH, when set and non-empty, overrides `uname -m`.
arch_path=''
targetarch=${TARGETARCH:-$(uname -m)}
if [[ "${targetarch}" == 'amd64' || "${targetarch}" == 'x86_64' ]]; then
  arch_path='x86_64'
else
  arch_path='sbsa'
fi

# NVSHMEM release passed to install_nvshmem by the install_* functions.
NVSHMEM_VERSION=3.3.24
|
|
|
|
#######################################
# Download an NVIDIA CUDA runfile installer, run it, and point
# /usr/local/cuda at the versioned install directory.
# Globals:   arch_path (read) - 'sbsa' selects the "_sbsa" runfile variant
# Arguments: $1 - full CUDA version, e.g. 12.4.1
#            $2 - runfile base name without the arch suffix or ".run",
#                 e.g. cuda_12.4.1_550.54.15_linux
# Returns:   non-zero as soon as any step fails
#######################################
install_cuda() {
  local version=${1:?install_cuda: CUDA version required}
  local runfile=${2:?install_cuda: runfile base name required}
  # 12.4.1 -> 12.4; names the /usr/local/cuda-<major.minor> directory that is
  # wiped here and symlinked at the end.
  local major_minor=${version%.*}

  rm -rf -- "/usr/local/cuda-${major_minor}" /usr/local/cuda || return 1

  if [[ "${arch_path}" == 'sbsa' ]]; then
    runfile="${runfile}_sbsa"
  fi
  runfile="${runfile}.run"

  wget -q "https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile}" -O "${runfile}" || return 1
  chmod +x "${runfile}" || return 1
  "./${runfile}" --toolkit --silent || return 1
  rm -f -- "${runfile}" || return 1

  # Two separately checked statements: in `rm ... && ln ...` a failing rm
  # would skip ln without tripping `set -e`, leaving no /usr/local/cuda link.
  rm -f /usr/local/cuda || return 1
  ln -s "/usr/local/cuda-${major_minor}" /usr/local/cuda || return 1
}
|
|
|
|
#######################################
# Download a cuDNN redistributable archive and copy its headers and
# libraries into /usr/local/cuda.
# Globals:   arch_path (read) - architecture component of the archive name
# Arguments: $1 - CUDA major version, e.g. 12
#            $2 - cuDNN version, e.g. 9.10.2.21
# Returns:   0 on success; otherwise the status of the first failing step.
#            The caller's working directory is never changed.
#######################################
install_cudnn() {
  local cuda_major_version=${1:?install_cudnn: CUDA major version required}
  local cudnn_version=${2:?install_cudnn: cuDNN version required}
  local workdir='tmp_cudnn'
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  local filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive"
  local status=0

  # Always start from a fresh scratch directory. With `mkdir X && cd X` a
  # failing mkdir (e.g. a leftover directory) does not trip `set -e`, so the
  # remaining steps would run in the caller's directory and `cd ..` would
  # then walk out of it.
  rm -rf -- "${workdir}" || return 1
  mkdir -- "${workdir}" || return 1

  # The subshell confines the directory change; the && chain stops at the
  # first failing step.
  (
    cd -- "${workdir}" \
      && wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz" \
      && tar xf "${filepath}.tar.xz" \
      && cp -a "${filepath}/include/"* /usr/local/cuda/include/ \
      && cp -a "${filepath}/lib/"* /usr/local/cuda/lib64/
  ) || status=$?

  rm -rf -- "${workdir}"
  return "${status}"
}
|
|
|
|
#######################################
# Download an NVSHMEM redistributable archive and copy its headers and
# libraries into /usr/local/cuda.
# Globals:   arch_path (read) - architecture component of the URL and
#            archive name
# Arguments: $1 - CUDA major version, e.g. "12"
#            $2 - NVSHMEM version, e.g. "3.3.9"
# Outputs:   a one-line confirmation on stdout after a successful install
# Returns:   0 on success; otherwise the status of the first failing step.
#            The caller's working directory is never changed.
#######################################
install_nvshmem() {
  local cuda_major_version=${1:?install_nvshmem: CUDA major version required}
  local nvshmem_version=${2:?install_nvshmem: NVSHMEM version required}
  local tmpdir='tmp_nvshmem'
  local suffix='.tar.xz'
  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshmem-ver
  local filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
  local url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"
  local status=0

  # No arch_path -> "aarch64"/"x64" mapping is done here: nothing read the
  # mapped value, and only ${arch_path} appears in the URL and file name.

  # Always start from a fresh scratch directory. With `mkdir -p X && cd X` a
  # failing mkdir does not trip `set -e`, so the remaining steps would run in
  # the caller's directory and `cd ..` would then walk out of it.
  rm -rf -- "${tmpdir}" || return 1
  mkdir -- "${tmpdir}" || return 1

  # download, unpack, install; the subshell confines the directory change and
  # the && chain stops at the first failing step
  (
    cd -- "${tmpdir}" \
      && wget -q "${url}" \
      && tar xf "${filename}${suffix}" \
      && cp -a "${filename}/include/"* /usr/local/cuda/include/ \
      && cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
  ) || status=$?

  # cleanup
  rm -rf -- "${tmpdir}"
  if [[ "${status}" -ne 0 ]]; then
    return "${status}"
  fi

  echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
}
|
|
|
|
#######################################
# Install CUDA 12.4.1 with cuDNN, NCCL and cuSparseLt.
# Globals:   CUDNN_VERSION (written)
# Arguments: none
# Outputs:   a one-line banner on stdout
# Notes:     the helper calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them;
#            install_nccl.sh and install_cusparselt.sh are looked up relative
#            to the current directory.
#######################################
install_124() {
  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
  install_cuda 12.4.1 cuda_12.4.1_550.54.15_linux

  install_cudnn 12 "${CUDNN_VERSION}"

  CUDA_VERSION=12.4 bash install_nccl.sh

  CUDA_VERSION=12.4 bash install_cusparselt.sh

  ldconfig
}
|
|
|
|
#######################################
# Install CUDA 12.6.3 with cuDNN, NVSHMEM, NCCL and cuSparseLt.
# Globals:   CUDNN_VERSION (written), NVSHMEM_VERSION (read)
# Arguments: none
# Outputs:   a one-line banner on stdout
# Notes:     the helper calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them;
#            install_nccl.sh and install_cusparselt.sh are looked up relative
#            to the current directory.
#######################################
install_126() {
  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux

  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.6 bash install_nccl.sh

  CUDA_VERSION=12.6 bash install_cusparselt.sh

  ldconfig
}
|
|
|
|
#######################################
# Install CUDA 12.9.1 with cuDNN, NVSHMEM, NCCL and cuSparseLt.
# Globals:   CUDNN_VERSION (written), NVSHMEM_VERSION (read)
# Arguments: none
# Outputs:   a one-line banner on stdout
# Notes:     the helper calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them;
#            install_nccl.sh and install_cusparselt.sh are looked up relative
#            to the current directory.
#######################################
install_129() {
  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.9.1 in the same container
  install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.9 bash install_nccl.sh

  CUDA_VERSION=12.9 bash install_cusparselt.sh

  ldconfig
}
|
|
|
|
#######################################
# Install CUDA 12.8.1 with cuDNN, NVSHMEM, NCCL and cuSparseLt.
# Globals:   CUDNN_VERSION (written), NVSHMEM_VERSION (read)
# Arguments: none
# Outputs:   a one-line banner on stdout
# Notes:     the helper calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them;
#            install_nccl.sh and install_cusparselt.sh are looked up relative
#            to the current directory.
#######################################
install_128() {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.8.1 in the same container
  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.8 bash install_nccl.sh

  CUDA_VERSION=12.8 bash install_cusparselt.sh

  ldconfig
}
|
|
|
|
#######################################
# Install CUDA 13.0.0 with cuDNN, NVSHMEM, NCCL and cuSparseLt.
# Globals:   CUDNN_VERSION (written), NVSHMEM_VERSION (read)
# Arguments: none
# Outputs:   a one-line banner on stdout
# Notes:     the helper calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them;
#            install_nccl.sh and install_cusparselt.sh are looked up relative
#            to the current directory.
#######################################
install_130() {
  CUDNN_VERSION=9.13.0.50
  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0 in the same container
  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 13 "${CUDNN_VERSION}"

  install_nvshmem 13 "${NVSHMEM_VERSION}"

  CUDA_VERSION=13.0 bash install_nccl.sh

  CUDA_VERSION=13.0 bash install_cusparselt.sh

  ldconfig
}
|
|
|
|
# idiomatic parameter and option handling in sh

#######################################
# Run the matching install_* function for every CUDA version given on the
# command line, in order.
# Arguments: any number of 12.4, 12.6, 12.8, 12.9 or 13.0; each may carry a
#            patch suffix (e.g. 12.6.3)
# Outputs:   "bad argument <arg>" on stderr for an unrecognised argument
# Exits:     with status 1 at the first unrecognised argument
# Notes:     the install_* calls are intentionally bare (no `|| ...`) so that
#            the script-level `set -e` stays active inside them.
#######################################
main() {
  while (( $# > 0 )); do
    case "$1" in
      12.4 | 12.4.*)
        install_124
        ;;
      12.6 | 12.6.*)
        install_126
        ;;
      12.8 | 12.8.*)
        install_128
        ;;
      12.9 | 12.9.*)
        install_129
        ;;
      13.0 | 13.0.*)
        install_130
        ;;
      *)
        echo "bad argument $1" >&2
        exit 1
        ;;
    esac
    shift
  done
}

main "$@"
|