Compare commits

...

10 Commits

12 changed files with 404 additions and 0 deletions

View File

@@ -136,6 +136,17 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12

View File

@@ -46,6 +46,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
# Disable fbgemm for CUDA 13 builds as it's not compatible yet
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
export USE_FBGEMM=0
fi
if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi

View File

@@ -53,6 +53,7 @@ jobs:
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,

View File

@@ -55,3 +55,30 @@ jobs:
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@@ -59,3 +59,37 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
      # monitor stays enabled (disable-monitor: false) in perf tests for more investigation
      disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -178,3 +178,98 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-periodically-cuda13:
name: test-periodically-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
      # monitor stays enabled (disable-monitor: false) in perf tests while it is evaluated
      disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: test-weekly-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
      # monitor stays enabled (disable-monitor: false) in perf tests while it is evaluated
      disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
      # monitor stays enabled (disable-monitor: false) in perf tests for more investigation
      disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -164,3 +164,89 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-nightly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
      # monitor stays enabled (disable-monitor: false) in perf tests while it is evaluated
      disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -81,6 +81,56 @@ jobs:
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-build-cuda13:
name: periodic-dynamo-benchmarks-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-test-cuda13:
name: periodic-dynamo-benchmarks-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
@@ -159,6 +209,33 @@ jobs:
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build-cuda13:
name: inductor-smoke-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-smoke-test-cuda13:
name: inductor-smoke-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build-cuda13.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -74,6 +74,36 @@ jobs:
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-cuda13:
name: inductor-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -342,6 +342,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-build:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-test:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda13_0-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@@ -234,6 +234,16 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
# https://github.com/pytorch/pytorch/pull/55292
string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
# Suppress cusparse warnings
string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"