Compare commits

...

20 Commits

Author SHA1 Message Date
62a803dff5 revert the driver change and test-infra change to get latest signal other than driver issue 2025-11-05 00:04:57 -08:00
fadbaa1b9b test fix-ubuntu-distro change for driver installation 2025-11-04 00:50:28 -08:00
6ffd8b4e39 upgrade driver for aws.a100 runner 2025-11-02 01:09:03 -08:00
30d6f0abae Revert "use gcc11 and add eager tests as well to reduce turnaround"
This reverts commit bf7ca1e0dd66de725f24905c4d192853bde41ac3.
2025-10-30 23:28:28 -07:00
bf7ca1e0dd use gcc11 and add eager tests as well to reduce turnaround 2025-10-30 15:46:07 -07:00
b9faec53a6 fix lint 2025-10-30 15:45:00 -07:00
9f5612b293 skip torchrec_dlrm as fbgemm is required 2025-10-30 15:45:00 -07:00
0f2087f456 Do not skip torchrec 2025-10-30 15:45:00 -07:00
ea4093a93b do not build fbgemm_gpu for test file in cuda13 too 2025-10-30 15:45:00 -07:00
f530350318 Add disable option in CMakeLists.txt too 2025-10-30 15:45:00 -07:00
1c2f02e604 Disable fbgemm from build.sh 2025-10-30 15:45:00 -07:00
f71550edee fix typo 2025-10-30 15:45:00 -07:00
8ac7f060ed disable fbgemm for h100 nightly 2025-10-30 15:45:00 -07:00
0efca5ce66 add missing nightly and pull 2025-10-30 15:45:00 -07:00
1247c93b6f Add full coverage inductor tests 2025-10-30 15:45:00 -07:00
c58f06e2c9 Add CUDA 13 tests to torchbench workflow 2025-10-30 15:45:00 -07:00
295455ab14 Do not build fbgemm for inductor 13.0 2025-10-30 15:45:00 -07:00
f41dfb84d5 suppress deprecation warning with -DDISABLE_CUSPARSE_DEPRECATED 2025-10-30 15:45:00 -07:00
19c5808d1a Try suppress the cusparse.h warning 2025-10-30 15:45:00 -07:00
81a01260d0 Inductor 13.0 test 2025-10-30 15:45:00 -07:00
14 changed files with 422 additions and 9 deletions

View File

@@ -136,6 +136,17 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12

View File

@@ -36,6 +36,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
nvcc --version
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
# Disable FBGEMM for CUDA 13 builds
export USE_FBGEMM=0
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang,

View File

@@ -286,7 +286,10 @@ EOF
rm -rf fbgemm
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
# Skip fbgemm for CUDA 13 as it's not compatible yet
if [[ "$BUILD_ENVIRONMENT" != *cuda13* ]]; then
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi
fi
}

View File

@@ -834,8 +834,14 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
# Exclude torchrec_dlrm for CUDA 13 as FBGEMM is not compatible
local extra_args=()
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
extra_args=(--exclude-exact torchrec_dlrm)
fi
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
@@ -847,7 +853,7 @@ test_dynamo_benchmark() {
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "${extra_args[@]}" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
local dt="float32"
@@ -855,17 +861,17 @@ test_dynamo_benchmark() {
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "${extra_args[@]}" "$@"
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
fi
fi
}

View File

@@ -53,6 +53,7 @@ jobs:
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,

View File

@@ -55,3 +55,30 @@ jobs:
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@@ -59,3 +59,37 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
# monitor stays enabled (disable-monitor: false) in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -178,3 +178,98 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-periodically-cuda13:
name: test-periodically-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: test-weekly-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor stays enabled (disable-monitor: false) in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -164,3 +164,89 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-nightly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -81,6 +81,56 @@ jobs:
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-build-cuda13:
name: periodic-dynamo-benchmarks-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-test-cuda13:
name: periodic-dynamo-benchmarks-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
@@ -158,6 +208,33 @@ jobs:
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build-cuda13:
name: inductor-smoke-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-smoke-test-cuda13:
name: inductor-smoke-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build-cuda13.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -74,6 +74,36 @@ jobs:
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-cuda13:
name: inductor-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -342,6 +342,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-build:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-test:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda13_0-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@@ -234,6 +234,16 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
# https://github.com/pytorch/pytorch/pull/55292
string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
# Suppress cuSPARSE deprecation warnings via -DDISABLE_CUSPARSE_DEPRECATED
string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"