Add CUDA 12.1 CI workflows (#98832)

Adds CUDA 12.1 CI workflows, removes CUDA 11.7.
CC @malfet

Pull Request resolved: https://github.com/pytorch/pytorch/pull/98832
Approved by: https://github.com/atalman
This commit is contained in:
pbialecki
2023-05-01 16:25:53 +00:00
committed by PyTorch MergeBot
parent 3edff6b6ec
commit 73645a8412
10 changed files with 55 additions and 53 deletions

View File

@ -81,15 +81,15 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
_UCX_COMMIT=31e74cac7bee0ef66bef2af72e7d86d9c282e5ab
_UCC_COMMIT=1c7a7127186e7836f73aafbd7697bbc274a77eee
_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7)
CUDA_VERSION=12.1.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7

View File

@ -4,9 +4,9 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive"
if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive"
curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.8.1.3_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/c/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz

View File

@ -137,6 +137,7 @@ RUN rm install_cudnn.sh
# Delete /usr/local/cuda-11.X/cuda-11.X symlinks
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
USER jenkins
CMD ["bash"]

View File

@ -40,8 +40,9 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang,
# disable it for now, to be fixed later.
export USE_UCC=1
export USE_SYSTEM_UCC=1
# TODO: disable UCC temporarily to enable CUDA 12.1 in CI
export USE_UCC=0
export USE_SYSTEM_UCC=0
fi
fi

View File

@ -147,7 +147,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["11.7", "12.1"],
arches=["11.8", "12.1"],
python_versions=["3.8"],
gen_special_an_non_special_wheel=False,
),

View File

@ -33,7 +33,7 @@ jobs:
fail-fast: false
matrix:
include:
- docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
- docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
- docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
- docker-image-name: pytorch-linux-bionic-py3.8-clang9
- docker-image-name: pytorch-linux-bionic-py3.11-clang9

View File

@ -31,7 +31,7 @@ concurrency:
cancel-in-progress: true
jobs:
manywheel-py3_8-cuda11_7-with-pypi-cudnn-build:
manywheel-py3_8-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
@ -40,20 +40,19 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu117
GPU_ARCH_VERSION: 11.7
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_7-with-pypi-cudnn-test: # Testing
manywheel-py3_8-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-cuda11_7-with-pypi-cudnn-build
needs: manywheel-py3_8-cuda11_8-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
@ -61,12 +60,12 @@ jobs:
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu117
GPU_ARCH_VERSION: 11.7
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:

View File

@ -37,12 +37,12 @@ jobs:
docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }}
test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build:
name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@ -60,14 +60,14 @@ jobs:
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test:
name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-test:
name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build
needs: linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build
with:
build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
linux-bionic-cuda11_8-py3_9-gcc7-build:
name: linux-bionic-cuda11.8-py3.9-gcc7

View File

@ -17,12 +17,12 @@ concurrency:
cancel-in-progress: true
jobs:
linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build:
name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build:
name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
@ -31,22 +31,22 @@ jobs:
{ config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test:
name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-test:
name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build
needs: linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build
with:
build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
docker-image: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
timeout-minutes: 300
linux-bionic-cuda11_7-py3_10-gcc7-sm86-build:
name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
linux-bionic-cuda12_1-py3_10-gcc7-sm86-build:
name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -54,14 +54,14 @@ jobs:
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda11_7-py3_10-gcc7-sm86-test:
name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
linux-bionic-cuda12_1-py3_10-gcc7-sm86-test:
name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build
needs: linux-bionic-cuda12_1-py3_10-gcc7-sm86-build
with:
build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.test-matrix }}
linux-bionic-py3_8-clang9-build:
name: linux-bionic-py3.8-clang9

View File

@ -385,6 +385,7 @@ if dist.is_available():
"UCX_TLS": "tcp",
"UCC_TLS": "nccl,ucp",
"UCC_TL_UCP_TUNE": "cuda:0", # don't use UCP TL on CUDA as it is not well supported
"UCC_EC_CUDA_USE_COOPERATIVE_LAUNCH": "n", # CI nodes (M60) fail if it is on
}
# https://stackoverflow.com/questions/2549939/get-signal-names-from-numbers-in-python