Compare commits

...

10 Commits

12 changed files with 202 additions and 6 deletions

View File

@@ -116,7 +116,7 @@ case "$tag" in
     INSTALL_MINGW=yes
     ;;
   pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
-    CUDA_VERSION=13.0.0
+    CUDA_VERSION=13.0.2
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
@@ -125,6 +125,16 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9)
+    CUDA_VERSION=13.0.2
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
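
Every image tag in this file encodes its toolchain: distro, CUDA, cuDNN, Python, and GCC versions, which is exactly what the case "$tag" dispatch above keys on. A minimal Python sketch of extracting those components, assuming the tags follow the pattern shown here (the parse_ci_image_tag helper is hypothetical, not part of the repo):

    import re

    def parse_ci_image_tag(tag):
        # Hypothetical helper: pull the version components out of a CI image tag.
        m = re.match(
            r"pytorch-linux-(?P<distro>[a-z]+)"
            r"-cuda(?P<cuda>[\d.]+)"
            r"-cudnn(?P<cudnn>\d+)"
            r"-py3(?:\.\d+)?"
            r"-gcc(?P<gcc>\d+)",
            tag,
        )
        return m.groupdict() if m else {}

    # {'distro': 'jammy', 'cuda': '13.0', 'cudnn': '9', 'gcc': '9'}
    print(parse_ci_image_tag("pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9"))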

View File

@@ -54,6 +54,7 @@ jobs:
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
+          pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9,
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang12,
           pytorch-linux-jammy-py3.11-clang12,

View File

@@ -204,6 +204,39 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc9-debug-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc9-debug
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9-debug
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc9-debug-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc9-debug
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc9-debug-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9-debug
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-debug-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
     name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
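
The shard/num_shards pairs above fan one test configuration out across seven identical runners. A sketch of the round-robin partitioning such a matrix conventionally implies (illustrative only; the real shard selection lives in PyTorch's test infrastructure, not in this workflow):

    def select_shard(tests, shard, num_shards):
        # shard is 1-indexed, matching the workflow matrix above
        return tests[shard - 1::num_shards]

    tests = [f"test_{i}" for i in range(20)]
    covered = sorted(t for s in range(1, 8) for t in select_shard(tests, s, 7))
    assert covered == sorted(tests)  # every test lands in exactly one of the 7 shards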

View File

@@ -268,6 +268,35 @@ jobs:
         ]}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc9-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc9
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc9-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc9
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-cuda13_0-py3_10-gcc9-build
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-cpu-py3_10-gcc11-bazel-test:
     name: linux-jammy-cpu-py3.10-gcc11-bazel-test
     uses: ./.github/workflows/_bazel-build-test.yml

View File

@@ -78,6 +78,35 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-sm86-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 2, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 3, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-sm86-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-sm86-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-sm86-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-py3_10-clang12-build:
     name: linux-jammy-py3.10-clang12
     uses: ./.github/workflows/_linux-build.yml

View File

@@ -63,6 +63,23 @@ jobs:
         ]}
     secrets: inherit

+  libtorch-linux-jammy-cuda13_0-py3_10-gcc11-debug-build:
+    name: libtorch-linux-jammy-cuda13.0-py3.10-gcc11-debug
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: libtorch-linux-jammy-cuda13.0-py3.10-gcc11
+      cuda-arch-list: '7.5 8.9'
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      build-generates-artifacts: false
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: "linux.c7i.4xlarge"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit
+
   linux-jammy-cuda12_8-py3_10-gcc11-build:
     name: linux-jammy-cuda12.8-py3.10-gcc11
     uses: ./.github/workflows/_linux-build.yml
@@ -99,6 +116,41 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      cuda-arch-list: '7.5 8.9'
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
   # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
   linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build:
@@ -115,6 +167,21 @@ jobs:
         ]}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-no-ops-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-no-ops
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-no-ops
+      cuda-arch-list: '7.5 8.9'
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit
+
   macos-py3-arm64-build:
     if: github.repository_owner == 'pytorch'
     name: macos-py3-arm64
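
The cuda-arch-list values in these jobs ('7.5 8.9' here, 8.6/8.9 elsewhere) keep build times down by compiling only the listed SM architectures. A sketch of the nvcc -gencode flags such a list conventionally expands to (illustrative; PyTorch's CMake performs the real expansion via TORCH_CUDA_ARCH_LIST):

    def gencode_flags(arch_list):
        # "7.5 8.9" -> flags targeting sm_75 and sm_89
        flags = []
        for arch in arch_list.split():
            sm = arch.replace(".", "")
            flags.append(f"-gencode=arch=compute_{sm},code=sm_{sm}")
        return flags

    # ['-gencode=arch=compute_75,code=sm_75', '-gencode=arch=compute_89,code=sm_89']
    print(gencode_flags("7.5 8.9"))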

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
     # https://github.com/pytorch/pytorch/pull/55292
     string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")

+    # Suppress cusparse warnings
+    string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
+
     message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
     string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
                                    " -D__CUDA_NO_HALF_OPERATORS__"

View File

@@ -383,7 +383,11 @@ function(torch_compile_options libname)
       -Wno-strict-aliasing
       )
   if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
+    list(APPEND private_compile_options -Wredundant-move)
+    # -Wno-interference-size only exists in GCC 12+
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
+      list(APPEND private_compile_options -Wno-interference-size)
+    endif()
   endif()
   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     list(APPEND private_compile_options -Wextra-semi -Wmove)

View File

@@ -50,6 +50,7 @@ from torch.testing._internal.common_cuda import (
     PLATFORM_SUPPORTS_CUDNN_ATTENTION,
     PLATFORM_SUPPORTS_FLASH_ATTENTION,
     PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+    TEST_CUDNN_VERSION,
     tf32_on_and_off,
     with_tf32_off,
 )
@@ -3927,6 +3928,12 @@ class TestVmapBatchedGradient(Namespace.TestVmapBase):
     def test_randomness(self, device, randomness, backend):
         if device == "cpu":
             raise unittest.SkipTest("This test is only for CUDA for now")
+
+        # xfail for cuDNN version between 9.10 and 9.13
+        if backend == SDPBackend.CUDNN_ATTENTION and randomness == "different":
+            if 91100 <= TEST_CUDNN_VERSION <= 91300:
+                raise unittest.SkipTest("xfail on cuDNN 9.10-9.13 with CUDNN backend and randomness='different'")
+
         backend_ctx = sdpa_kernel([backend])
         with backend_ctx:
             B = 4
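
For context on the bounds in that check: cuDNN reports its version as one integer, conventionally major * 10000 + minor * 100 + patch for the 9.x series, so the 91100-91300 window corresponds roughly to the 9.11.0-9.13.0 releases named in the comment. A small decoding sketch under that assumption:

    def decode_cudnn_version(v):
        # assumes cuDNN 9.x encoding: major * 10000 + minor * 100 + patch
        return v // 10000, (v // 100) % 100, v % 100

    print(decode_cudnn_version(91100))  # (9, 11, 0)
    print(decode_cudnn_version(91300))  # (9, 13, 0)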

View File

@@ -15,7 +15,7 @@ from torch._dispatch.python import enable_python_dispatcher
 from torch._export.utils import _is_cia_op
 from torch._ops import DispatchKey
 from torch.testing import make_tensor
-from torch.testing._internal.common_cuda import SM70OrLater, tf32_off
+from torch.testing._internal.common_cuda import SM70OrLater, tf32_off, _get_torch_cuda_version
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCPU,
@@ -599,6 +599,13 @@ class TestDecomp(TestCase):
     @suppress_warnings
     @ops(op_db)
     def test_comprehensive(self, device, dtype, op):
+        # Version-conditional xfail: skip torch._scaled_mm on CUDA 13.0+ with float8
+        if device == "cuda" and dtype == torch.float8_e4m3fn:
+            # Check both "torch._scaled_mm" and "_scaled_mm", since op.name could be either
+            if op.name in ("torch._scaled_mm", "_scaled_mm"):
+                if torch.version.cuda is not None:
+                    if _get_torch_cuda_version() >= (13, 0):
+                        self.skipTest("xfail on CUDA 13.0+ until nullptr issue is fixed")
         self.do_cross_ref(device, dtype, op, run_all=True)

     def test_uniform(self, device):
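
_get_torch_cuda_version() compares versions as integer tuples, which sidesteps the lexicographic trap where the string "13.0" sorts before "9.0". A sketch of the parse-and-compare pattern, assuming torch.version.cuda strings like "13.0" (the helper below is illustrative, not the test suite's implementation):

    import torch

    def cuda_version_tuple():
        # torch.version.cuda is a string like "13.0" on CUDA builds, None otherwise
        if torch.version.cuda is None:
            return (0,)
        return tuple(int(x) for x in torch.version.cuda.split("."))

    if cuda_version_tuple() >= (13, 0):
        print("CUDA 13.0+: the _scaled_mm case above is skipped")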

View File

@@ -7500,6 +7500,11 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
     @parametrize("use_transpose_a", [True, False])
     @parametrize("use_transpose_b", [True, False])
     def test__int_mm(self, device, k, n, use_transpose_a, use_transpose_b):
+        # Skip specific failing cases on CUDA 13.0
+        if (not TEST_WITH_ROCM) and _get_torch_cuda_version() >= (13, 0):
+            if not use_transpose_a and not use_transpose_b:
+                self.skipTest("xfail on CUDA 13 until cuBLAS adds the supported kernel")
+
         def genf_int_float(x, y, use_transpose):
             if use_transpose:
                 x, y = y, x
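
For context on the case being skipped: torch._int_mm performs an int8 x int8 -> int32 matmul, and the transpose flags control the operand layout handed to cuBLAS; only the no-transpose combination lacks a supported kernel on CUDA 13. A hedged usage sketch (shapes are illustrative, chosen to satisfy the size constraints the op enforces):

    import torch

    if torch.cuda.is_available():
        # int8 x int8 -> int32; cuBLAS imposes size/alignment constraints
        a = torch.randint(-128, 128, (32, 64), dtype=torch.int8, device="cuda")
        b = torch.randint(-128, 128, (64, 32), dtype=torch.int8, device="cuda")
        c = torch._int_mm(a, b)  # the non-transposed case skipped above on CUDA 13
        print(c.dtype)  # torch.int32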

View File

@@ -22,7 +22,7 @@ from torch.sparse._semi_structured_conversions import (
 )
 from torch.testing import make_tensor
-from torch.testing._internal.common_cuda import _get_torch_cuda_version, PLATFORM_SUPPORTS_FP8, xfailIfSM89
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, IS_SM89, PLATFORM_SUPPORTS_FP8, xfailIfSM89
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -1117,12 +1117,13 @@ class TestSparseSemiStructuredCUSPARSELT(TestCase):
         not PLATFORM_SUPPORTS_FP8,
         "FP8 is only supported on H100+, SM 8.9 and MI300+ devices",
     )
-    @xfailIfSM89
     @parametrize("dense_input_shape", [(256, 128)])
     def test_sparse_fp8fp8_mm(self, dense_input_shape, device):
         if torch.backends.cusparselt.version() < 602:
             self.skipTest("fp8 matmul requires cuSPARSELt v0.6.2+")
+        # CUDA 13 can correctly raise NotImplementedError, so passing this test is expected
+        if IS_SM89 and _get_torch_cuda_version() < (13, 0):
+            raise unittest.SkipTest("expected failure on SM 8.9 with CUDA < 13.0")
         A = rand_sparse_semi_structured_mask(256, 128, dtype=torch.float16)
         B = torch.rand(dense_input_shape, device=device).to(torch.float16).t()
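
IS_SM89 presumably keys off the device's compute capability, SM 8.9 being the Ada Lovelace generation (e.g. L4, L40, RTX 4090). A minimal sketch of such a check (the exact definition in common_cuda may differ):

    import torch

    def is_sm89():
        # True only on devices whose compute capability is exactly (8, 9)
        return torch.cuda.is_available() and torch.cuda.get_device_capability() == (8, 9)

    print(is_sm89())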