Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[CI] Add basic CUDA 13.0 periodic test (#161013)
https://github.com/pytorch/pytorch/issues/159779
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161013
Approved by: https://github.com/atalman
Co-authored-by: Andrey Talman <atalman@fb.com>
Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Committed by: PyTorch MergeBot
Parent: f532f99822
Commit: 303f514d5b
@@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then
   DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
 fi

-_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
-_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
+_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
+_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
 if [[ "$image" == *rocm* ]]; then
   _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
   _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@@ -114,6 +114,16 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
+    CUDA_VERSION=13.0.0
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=11
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
@@ -44,8 +44,12 @@ function install_ucc() {
   ./autogen.sh

-  # We only run distributed tests on Tesla M60 and A10G
-  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
+  if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
+    NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
+  else
+    # We only run distributed tests on Tesla M60 and A10G
+    NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
+  fi

   if [[ -n "$ROCM_VERSION" ]]; then
     if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
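Background for the gencode change above: CUDA 13.0 removes offline compilation support for Maxwell (sm_5x), so passing -gencode=arch=compute_52 to a 13.x nvcc would fail outright; the branch keeps the Tesla M60 target only for older toolkits. The same toolkit version is visible to C++ code as the CUDA_VERSION macro (major * 1000 + minor * 10, so 13.0 becomes 13000), which is the encoding the source-level guards later in this commit rely on. A minimal sketch:

    // Minimal sketch of the version encoding used by the C++ guards in
    // this commit: CUDA_VERSION = major * 1000 + minor * 10, defined by
    // <cuda.h>, so any 13.x toolkit satisfies CUDA_VERSION >= 13000.
    #include <cstdio>
    #include <cuda.h>

    int main() {
      std::printf("toolkit %d.%d (CUDA_VERSION=%d)\n",
                  CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10, CUDA_VERSION);
    #if CUDA_VERSION >= 13000
      std::printf("sm_52 is no longer a valid nvcc target\n");
    #endif
      return 0;
    }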
@@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
 # (optional) Install UCC
 ARG UCX_COMMIT
 ARG UCC_COMMIT
+ARG CUDA_VERSION
 ENV UCX_COMMIT $UCX_COMMIT
 ENV UCC_COMMIT $UCC_COMMIT
 ENV UCX_HOME /usr
.github/workflows/docker-builds.yml (vendored, 1 line changed)
@@ -50,6 +50,7 @@ jobs:
       runner: [linux.12xlarge]
       docker-image-name: [
         pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
+        pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
         pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
.github/workflows/periodic.yml (vendored, 32 lines changed)
@@ -170,6 +170,38 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      cuda-arch-list: 7.5
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-rocm-py3_10-build:
     name: linux-jammy-rocm-py3.10
     uses: ./.github/workflows/_linux-build.yml
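A note on cuda-arch-list: 7.5 in the build job above: the linux.g4dn runners named in the matrix carry NVIDIA T4 GPUs (compute capability 7.5), so building that single architecture is presumably sufficient for this workflow. A standalone probe (illustrative file name check_sm.cu) to confirm what a runner actually exposes:

    // check_sm.cu (illustrative name): print the compute capability of
    // device 0. On a g4dn runner this is expected to report sm_75 (T4).
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop{};
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::printf("no visible CUDA device\n");
        return 1;
      }
      std::printf("%s: sm_%d%d\n", prop.name, prop.major, prop.minor);
      return 0;
    }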
@@ -216,7 +216,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention
 if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
   add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp})

-  target_include_directories(flash_attention PUBLIC
+  target_include_directories(flash_attention SYSTEM PUBLIC
     ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc
     ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include
     ${PROJECT_SOURCE_DIR}/third_party/cutlass/include
@@ -10,8 +10,13 @@ using namespace at::native::memory;

 constexpr int buffer_size = 1024;

+#if defined(CUDA_VERSION) && CUDA_VERSION < 13000
 __managed__ double4 buffer1[buffer_size];
 __managed__ double4 buffer2[buffer_size];
+#else
+__managed__ double4_16a buffer1[buffer_size];
+__managed__ double4_16a buffer2[buffer_size];
+#endif

 void reset_buffers() {
   for (int i = 0; i < buffer_size; i++) {
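Context for the buffer guard above: CUDA 13.0 raises the alignment of the 32-byte vector types such as double4 and adds explicitly aligned variants; double4_16a is the one that preserves the pre-13 16-byte layout this test's vectorized loads assume. A hedged sketch of that invariant, assuming double4_16a is available only in 13.x headers:

    // Hedged sketch: assert the 16-byte alignment the test relies on
    // under both toolkit generations. double4_16a is assumed to exist
    // only on CUDA 13+ toolkits, per the #if guard in the diff above.
    #include <cuda.h>
    #include <cuda_runtime.h>  // vector types such as double4

    #if defined(CUDA_VERSION) && CUDA_VERSION < 13000
    static_assert(alignof(double4) == 16, "pre-13 double4 is 16-byte aligned");
    #else
    static_assert(alignof(double4_16a) == 16, "_16a keeps the 16-byte layout");
    #endif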
@@ -61,11 +61,14 @@ void* get_symbol(const char* name, int version) {
 }
 #endif

+  // As of CUDA 13, this API is deprecated.
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000)
   // This falls back to the old API to try getting the symbol again.
   if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres);
       st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
     return out;
   }
+#endif

   // If the symbol cannot be resolved, report and return nullptr;
   // the caller is responsible for checking the pointer.
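The #if above compiles the deprecated cudaGetDriverEntryPoint fallback only for pre-13 toolkits. For reference, a sketch of the replacement path, assuming the cudaGetDriverEntryPointByVersion API the CUDA runtime has shipped since 12.5; names other than the API call are illustrative:

    // Sketch of the non-deprecated path: resolve a driver symbol against
    // an explicit driver version instead of the runtime's default.
    #include <cuda_runtime.h>

    void* get_symbol_by_version(const char* name, unsigned int version) {
      void* out = nullptr;
      cudaDriverEntryPointQueryResult qres{};
      if (auto st = cudaGetDriverEntryPointByVersion(
              name, &out, version, cudaEnableDefault, &qres);
          st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) {
        return out;
      }
      return nullptr;  // caller checks the pointer, as in the diff above
    }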
@@ -1063,7 +1063,7 @@ elseif(USE_CUDA)
       UNFUSE_FMA # Addressing issue #121558
     )
     target_sources(torch_cuda PRIVATE $<TARGET_OBJECTS:flash_attention>)
-    target_include_directories(torch_cuda PUBLIC
+    target_include_directories(torch_cuda SYSTEM PUBLIC
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc>
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/flash-attention/include>
       $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/cutlass/include>