Compare commits

...

21 Commits

SHA1 Message Date
17b9217fdd merge conflict resolve add back 2025-11-13 09:04:44 +00:00
016f58bfee revert the driver change and test-infra change to get latest signal other than driver issue 2025-11-13 09:04:44 +00:00
e55d249c85 test fix-ubuntu-distro change for driver installation 2025-11-13 09:04:44 +00:00
fe8b1853cf upgrade driver for aws.a100 runner 2025-11-13 09:04:44 +00:00
f834086934 Revert "use gcc11 and add eager tests as well to reduce turnaround" (reverts commit bf7ca1e0dd66de725f24905c4d192853bde41ac3) 2025-11-13 09:04:44 +00:00
f792123e7e use gcc11 and add eager tests as well to reduce turnaround 2025-11-13 09:04:44 +00:00
7ac589c981 fix lint 2025-11-13 09:04:44 +00:00
7feff0f415 skip torchrec_dlrm as fbgemm is required 2025-11-13 09:04:44 +00:00
4fce979d62 Do not skip torchrec 2025-11-13 09:04:44 +00:00
271dc5807d do not build fbgemm_gpu for test file in cuda13 too 2025-11-13 09:04:44 +00:00
730279bf38 Add disable option in CMakeLists.txt too 2025-11-13 09:04:44 +00:00
43fb73585e Disable fbgemm from build.sh 2025-11-13 09:04:44 +00:00
3d17510e76 fix typo 2025-11-13 09:04:44 +00:00
3fc1029c43 disable fbgemm for h100 nightly 2025-11-13 09:04:44 +00:00
44a8309719 add missing nightly and pull 2025-11-13 09:04:44 +00:00
de77cde1ea Add full coverage inductor tests 2025-11-13 09:04:44 +00:00
d47e117a5b Add CUDA 13 tests to torchbench workflow 2025-11-13 09:04:44 +00:00
55220d98ac Do not build fbgemm for inductor 13.0 2025-11-13 09:04:44 +00:00
dcc76ad961 suppress deprecation warning with -DDISABLE_CUSPARSE_DEPRECATED 2025-11-13 09:04:44 +00:00
3019f60bfb Try suppress the cusparse.h warning 2025-11-13 09:04:44 +00:00
28ac6692fe Inductor 13.0 test 2025-11-13 09:04:44 +00:00
14 changed files with 426 additions and 12 deletions

View File

@@ -136,6 +136,17 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12

View File

@@ -36,6 +36,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
nvcc --version
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
# Disable FBGEMM for CUDA 13 builds
export USE_FBGEMM=0
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang,
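Aside: the guard added above is plain Bash glob matching on BUILD_ENVIRONMENT, and PyTorch's build reads USE_FBGEMM from the environment to switch the component off. A minimal sketch of the mechanism, using a hypothetical environment value:

    #!/usr/bin/env bash
    # Hypothetical build-environment name; any value containing "cuda13" matches.
    BUILD_ENVIRONMENT="linux-jammy-cuda13.0-py3.10-gcc9-sm80"

    if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
      # Glob match, not a regex: *cuda13* matches the substring anywhere.
      export USE_FBGEMM=0
    fi

    echo "USE_FBGEMM=${USE_FBGEMM:-1}"   # prints USE_FBGEMM=0 for CUDA 13 builds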

View File

@@ -285,7 +285,10 @@ EOF
rm -rf fbgemm
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
# Skip fbgemm for CUDA 13 as it's not compatible yet
if [[ "$BUILD_ENVIRONMENT" != *cuda13* ]]; then
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi
fi
}
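Note: pip_build_and_install here wraps pip's VCS-URL form, where @<ref> pins a commit and #subdirectory= selects a package nested inside the repository. A standalone equivalent (the commit ref is a placeholder, not a value from this PR):

    # fbgemm_gpu lives in a subdirectory of the FBGEMM repo, hence #subdirectory=.
    pip install "git+https://github.com/pytorch/FBGEMM.git@<fbgemm_commit>#subdirectory=fbgemm_gpu"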

View File

@@ -855,8 +855,14 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
# Exclude torchrec_dlrm for CUDA 13 as FBGEMM is not compatible
local extra_args=()
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
extra_args=(--exclude-exact torchrec_dlrm)
fi
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
@@ -868,7 +874,7 @@
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "${extra_args[@]}" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
local dt="float32"
@@ -876,17 +882,17 @@
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "${extra_args[@]}" "$@"
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
fi
fi
}
@@ -1780,7 +1786,8 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
else
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
# Skip torchrec/fbgemm for cuda13 as they're not compatible yet
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* && "${BUILD_ENVIRONMENT}" != *cuda13* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
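Aside: the extra_args changes above use a common Bash idiom for optional flags: collect them in an array and splice it with "${extra_args[@]}" ahead of the caller's "$@". An empty array expands to zero words, so every non-CUDA-13 invocation is unchanged. A minimal sketch with a hypothetical run_benchmark stand-in for test_single_dynamo_benchmark:

    #!/usr/bin/env bash
    # Hypothetical stand-in that just echoes the arguments it receives.
    run_benchmark() { echo "run_benchmark called with: $*"; }

    extra_args=()
    if [[ "${BUILD_ENVIRONMENT:-}" == *cuda13* ]]; then
      extra_args=(--exclude-exact torchrec_dlrm)
    fi

    # "${extra_args[@]}" expands to nothing when the array is empty.
    run_benchmark --inference --bfloat16 "${extra_args[@]}" "$@"

One caveat: under set -u, expanding an empty array errors on Bash older than 4.4.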

View File

@@ -53,6 +53,7 @@ jobs:
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,

View File

@@ -55,3 +55,30 @@ jobs:
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@@ -59,3 +59,37 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
# keep the monitor enabled in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -178,3 +178,98 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-periodically-cuda13:
name: test-periodically-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: test-weekly-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
# The pull_request trigger is used by PRs that bump the transformers pin,
# which always need one round of benchmarks
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# keep the monitor enabled in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -164,3 +164,89 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-nightly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -81,6 +81,56 @@ jobs:
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-build-cuda13:
name: periodic-dynamo-benchmarks-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-test-cuda13:
name: periodic-dynamo-benchmarks-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
@@ -158,6 +208,33 @@ jobs:
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build-cuda13:
name: inductor-smoke-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-smoke-test-cuda13:
name: inductor-smoke-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build-cuda13.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -74,6 +74,36 @@ jobs:
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-cuda13:
name: inductor-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -342,8 +342,33 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
linux-jammy-cuda13_0-py3_10-gcc9-inductor-build:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-test:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda13_0-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:

View File

@@ -235,6 +235,16 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
# https://github.com/pytorch/pytorch/pull/55292
string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
# Suppress cusparse warnings
string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"