Compare commits

...

10 Commits

12 changed files with 202 additions and 6 deletions

View File

@@ -116,7 +116,7 @@ case "$tag" in
     INSTALL_MINGW=yes
     ;;
   pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
-    CUDA_VERSION=13.0.0
+    CUDA_VERSION=13.0.2
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
@@ -125,6 +125,16 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
+  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9)
+    CUDA_VERSION=13.0.2
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    ;;
   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
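
Every image tag in this file encodes its toolchain: distro, CUDA, cuDNN, Python, and GCC versions, which is exactly what the case "$tag" dispatch above keys on. A minimal Python sketch of extracting those components, assuming the tags follow the pattern shown here (the parse_ci_image_tag helper is hypothetical, not part of the repo):

    import re

    def parse_ci_image_tag(tag):
        # Hypothetical helper: pull the version components out of a CI image tag.
        m = re.match(
            r"pytorch-linux-(?P<distro>[a-z]+)"
            r"-cuda(?P<cuda>[\d.]+)"
            r"-cudnn(?P<cudnn>\d+)"
            r"-py3(?:\.\d+)?"
            r"-gcc(?P<gcc>\d+)",
            tag,
        )
        return m.groupdict() if m else {}

    # {'distro': 'jammy', 'cuda': '13.0', 'cudnn': '9', 'gcc': '9'}
    print(parse_ci_image_tag("pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9"))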

View File

@@ -54,6 +54,7 @@ jobs:
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
+          pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9,
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang12,
           pytorch-linux-jammy-py3.11-clang12,

View File

@@ -204,6 +204,39 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc9-debug-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc9-debug
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9-debug
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+          { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc9-debug-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc9-debug
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc9-debug-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9-debug
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-debug-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
     name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
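
The shard/num_shards pairs above fan one test configuration out across seven identical runners. A sketch of the round-robin partitioning such a matrix conventionally implies (illustrative only; the real shard selection lives in PyTorch's test infrastructure, not in this workflow):

    def select_shard(tests, shard, num_shards):
        # shard is 1-indexed, matching the workflow matrix above
        return tests[shard - 1::num_shards]

    tests = [f"test_{i}" for i in range(20)]
    covered = sorted(t for s in range(1, 8) for t in select_shard(tests, s, 7))
    assert covered == sorted(tests)  # every test lands in exactly one of the 7 shards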

View File

@@ -268,6 +268,35 @@ jobs:
         ]}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc9-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc9
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc9-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc9
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-cuda13_0-py3_10-gcc9-build
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc9
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-cpu-py3_10-gcc11-bazel-test:
     name: linux-jammy-cpu-py3.10-gcc11-bazel-test
     uses: ./.github/workflows/_bazel-build-test.yml

View File

@@ -78,6 +78,35 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-sm86-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 2, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 3, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-sm86-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-sm86-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-sm86
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-sm86-build.outputs.test-matrix }}
+    secrets: inherit
+
   linux-jammy-py3_10-clang12-build:
     name: linux-jammy-py3.10-clang12
     uses: ./.github/workflows/_linux-build.yml

View File

@@ -63,6 +63,23 @@ jobs:
         ]}
     secrets: inherit

+  libtorch-linux-jammy-cuda13_0-py3_10-gcc11-debug-build:
+    name: libtorch-linux-jammy-cuda13.0-py3.10-gcc11-debug
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: libtorch-linux-jammy-cuda13.0-py3.10-gcc11
+      cuda-arch-list: '7.5 8.9'
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      build-generates-artifacts: false
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: "linux.c7i.4xlarge"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit
+
   linux-jammy-cuda12_8-py3_10-gcc11-build:
     name: linux-jammy-cuda12.8-py3.10-gcc11
     uses: ./.github/workflows/_linux-build.yml
@@ -99,6 +116,41 @@ jobs:
     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      cuda-arch-list: '7.5 8.9'
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda13_0-py3_10-gcc11-test:
+    name: linux-jammy-cuda13.0-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda13_0-py3_10-gcc11-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit
+
   # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
   linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build:
@@ -115,6 +167,21 @@ jobs:
         ]}
     secrets: inherit

+  linux-jammy-cuda13_0-py3_10-gcc11-no-ops-build:
+    name: linux-jammy-cuda13.0-py3.10-gcc11-no-ops
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda13.0-py3.10-gcc11-no-ops
+      cuda-arch-list: '7.5 8.9'
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit
+
   macos-py3-arm64-build:
     if: github.repository_owner == 'pytorch'
     name: macos-py3-arm64
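
The cuda-arch-list values in these jobs ('7.5 8.9' here, 8.6/8.9 elsewhere) keep build times down by compiling only the listed SM architectures. A sketch of the nvcc -gencode flags such a list conventionally expands to (illustrative; PyTorch's CMake performs the real expansion via TORCH_CUDA_ARCH_LIST):

    def gencode_flags(arch_list):
        # "7.5 8.9" -> flags targeting sm_75 and sm_89
        flags = []
        for arch in arch_list.split():
            sm = arch.replace(".", "")
            flags.append(f"-gencode=arch=compute_{sm},code=sm_{sm}")
        return flags

    # ['-gencode=arch=compute_75,code=sm_75', '-gencode=arch=compute_89,code=sm_89']
    print(gencode_flags("7.5 8.9"))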

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
     # https://github.com/pytorch/pytorch/pull/55292
     string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")

+    # Suppress cusparse warnings
+    string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
+
     message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
     string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
                                    " -D__CUDA_NO_HALF_OPERATORS__"

View File

@@ -383,7 +383,11 @@ function(torch_compile_options libname)
       -Wno-strict-aliasing
       )
   if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
+    list(APPEND private_compile_options -Wredundant-move)
+    # -Wno-interference-size only exists in GCC 12+
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
+      list(APPEND private_compile_options -Wno-interference-size)
+    endif()
   endif()
   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     list(APPEND private_compile_options -Wextra-semi -Wmove)

View File

@@ -50,6 +50,7 @@ from torch.testing._internal.common_cuda import (
     PLATFORM_SUPPORTS_CUDNN_ATTENTION,
     PLATFORM_SUPPORTS_FLASH_ATTENTION,
     PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+    TEST_CUDNN_VERSION,
     tf32_on_and_off,
     with_tf32_off,
 )
@@ -3927,6 +3928,12 @@ class TestVmapBatchedGradient(Namespace.TestVmapBase):
     def test_randomness(self, device, randomness, backend):
         if device == "cpu":
             raise unittest.SkipTest("This test is only for CUDA for now")
+
+        # xfail for cuDNN version between 9.10 and 9.13
+        if backend == SDPBackend.CUDNN_ATTENTION and randomness == "different":
+            if 91100 <= TEST_CUDNN_VERSION <= 91300:
+                raise unittest.SkipTest("xfail on cuDNN 9.10-9.13 with CUDNN backend and randomness='different'")
+
         backend_ctx = sdpa_kernel([backend])
         with backend_ctx:
             B = 4
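
For context on the bounds in that check: cuDNN reports its version as one integer, conventionally major * 10000 + minor * 100 + patch for the 9.x series, so the 91100-91300 window corresponds roughly to the 9.11.0-9.13.0 releases named in the comment. A small decoding sketch under that assumption:

    def decode_cudnn_version(v):
        # assumes cuDNN 9.x encoding: major * 10000 + minor * 100 + patch
        return v // 10000, (v // 100) % 100, v % 100

    print(decode_cudnn_version(91100))  # (9, 11, 0)
    print(decode_cudnn_version(91300))  # (9, 13, 0)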

View File

@@ -15,7 +15,7 @@ from torch._dispatch.python import enable_python_dispatcher
 from torch._export.utils import _is_cia_op
 from torch._ops import DispatchKey
 from torch.testing import make_tensor
-from torch.testing._internal.common_cuda import SM70OrLater, tf32_off
+from torch.testing._internal.common_cuda import SM70OrLater, tf32_off, _get_torch_cuda_version
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCPU,
@@ -599,6 +599,13 @@ class TestDecomp(TestCase):
     @suppress_warnings
     @ops(op_db)
     def test_comprehensive(self, device, dtype, op):
+        # Version-conditional xfail: skip torch._scaled_mm on CUDA 13.0+ with float8
+        if device == "cuda" and dtype == torch.float8_e4m3fn:
+            # Check both "torch._scaled_mm" and "_scaled_mm", since op.name could be either
+            if op.name in ("torch._scaled_mm", "_scaled_mm"):
+                if torch.version.cuda is not None:
+                    if _get_torch_cuda_version() >= (13, 0):
+                        self.skipTest("xfail on CUDA 13.0+ until nullptr issue is fixed")
         self.do_cross_ref(device, dtype, op, run_all=True)

     def test_uniform(self, device):
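
_get_torch_cuda_version() compares versions as integer tuples, which sidesteps the lexicographic trap where the string "13.0" sorts before "9.0". A sketch of the parse-and-compare pattern, assuming torch.version.cuda strings like "13.0" (the helper below is illustrative, not the test suite's implementation):

    import torch

    def cuda_version_tuple():
        # torch.version.cuda is a string like "13.0" on CUDA builds, None otherwise
        if torch.version.cuda is None:
            return (0,)
        return tuple(int(x) for x in torch.version.cuda.split("."))

    if cuda_version_tuple() >= (13, 0):
        print("CUDA 13.0+: the _scaled_mm case above is skipped")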

View File

@@ -7500,6 +7500,11 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
     @parametrize("use_transpose_a", [True, False])
     @parametrize("use_transpose_b", [True, False])
     def test__int_mm(self, device, k, n, use_transpose_a, use_transpose_b):
+        # Skip specific failing cases on CUDA 13.0
+        if (not TEST_WITH_ROCM) and _get_torch_cuda_version() >= (13, 0):
+            if not use_transpose_a and not use_transpose_b:
+                self.skipTest("xfail on CUDA 13 until cuBLAS adds the supported kernel")
+
         def genf_int_float(x, y, use_transpose):
             if use_transpose:
                 x, y = y, x
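
For context on the case being skipped: torch._int_mm performs an int8 x int8 -> int32 matmul, and the transpose flags control the operand layout handed to cuBLAS; only the no-transpose combination lacks a supported kernel on CUDA 13. A hedged usage sketch (shapes are illustrative, chosen to satisfy the size constraints the op enforces):

    import torch

    if torch.cuda.is_available():
        # int8 x int8 -> int32; cuBLAS imposes size/alignment constraints
        a = torch.randint(-128, 128, (32, 64), dtype=torch.int8, device="cuda")
        b = torch.randint(-128, 128, (64, 32), dtype=torch.int8, device="cuda")
        c = torch._int_mm(a, b)  # the non-transposed case skipped above on CUDA 13
        print(c.dtype)  # torch.int32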

View File

@@ -22,7 +22,7 @@ from torch.sparse._semi_structured_conversions import (
 )
 from torch.testing import make_tensor
-from torch.testing._internal.common_cuda import _get_torch_cuda_version, PLATFORM_SUPPORTS_FP8, xfailIfSM89
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, IS_SM89, PLATFORM_SUPPORTS_FP8, xfailIfSM89
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -1117,12 +1117,13 @@ class TestSparseSemiStructuredCUSPARSELT(TestCase):
         not PLATFORM_SUPPORTS_FP8,
         "FP8 is only supported on H100+, SM 8.9 and MI300+ devices",
     )
-    @xfailIfSM89
     @parametrize("dense_input_shape", [(256, 128)])
     def test_sparse_fp8fp8_mm(self, dense_input_shape, device):
         if torch.backends.cusparselt.version() < 602:
             self.skipTest("fp8 matmul requires cuSPARSELt v0.6.2+")
+        # CUDA 13 can correctly raise NotImplementedError, so passing this test is expected
+        if IS_SM89 and _get_torch_cuda_version() < (13, 0):
+            raise unittest.SkipTest("expected failure on SM 8.9 with CUDA < 13.0")
         A = rand_sparse_semi_structured_mask(256, 128, dtype=torch.float16)
         B = torch.rand(dense_input_shape, device=device).to(torch.float16).t()
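
IS_SM89 presumably keys off the device's compute capability, SM 8.9 being the Ada Lovelace generation (e.g. L4, L40, RTX 4090). A minimal sketch of such a check (the exact definition in common_cuda may differ):

    import torch

    def is_sm89():
        # True only on devices whose compute capability is exactly (8, 9)
        return torch.cuda.is_available() and torch.cuda.get_device_capability() == (8, 9)

    print(is_sm89())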