Compare commits

...

20 Commits

Author SHA1 Message Date
62a803dff5 revert the driver change and test-infra change to get latest signal other than driver issue 2025-11-05 00:04:57 -08:00
fadbaa1b9b test fix-ubuntu-distro change for driver installation 2025-11-04 00:50:28 -08:00
6ffd8b4e39 upgrade driver for aws.a100 runner 2025-11-02 01:09:03 -08:00
30d6f0abae Revert "use gcc11 and add eager tests as well to reduce turnaround"
This reverts commit bf7ca1e0dd66de725f24905c4d192853bde41ac3.
2025-10-30 23:28:28 -07:00
bf7ca1e0dd use gcc11 and add eager tests as well to reduce turnaround 2025-10-30 15:46:07 -07:00
b9faec53a6 fix lint 2025-10-30 15:45:00 -07:00
9f5612b293 skip torchrec_dlrm as fbgemm is required 2025-10-30 15:45:00 -07:00
0f2087f456 Do not skip torchrec 2025-10-30 15:45:00 -07:00
ea4093a93b do not build fbgemm_gpu for test file in cuda13 too 2025-10-30 15:45:00 -07:00
f530350318 Add disable option in CMakeLists.txt too 2025-10-30 15:45:00 -07:00
1c2f02e604 Disable fbgemm from build.sh 2025-10-30 15:45:00 -07:00
f71550edee fix typo 2025-10-30 15:45:00 -07:00
8ac7f060ed disable fbgemm for h100 nightly 2025-10-30 15:45:00 -07:00
0efca5ce66 add missing nightly and pull 2025-10-30 15:45:00 -07:00
1247c93b6f Add full coverage inductor tests 2025-10-30 15:45:00 -07:00
c58f06e2c9 Add CUDA 13 tests to torchbench workflow 2025-10-30 15:45:00 -07:00
295455ab14 Do not build fbgemm for inductor 13.0 2025-10-30 15:45:00 -07:00
f41dfb84d5 suppress deprecation warning with -DDISABLE_CUSPARSE_DEPRECATED 2025-10-30 15:45:00 -07:00
19c5808d1a Try suppress the cusparse.h warning 2025-10-30 15:45:00 -07:00
81a01260d0 Inductor 13.0 test 2025-10-30 15:45:00 -07:00
14 changed files with 422 additions and 9 deletions

View File

@@ -136,6 +136,17 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12

View File

@@ -36,6 +36,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
nvcc --version
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
# Disable FBGEMM for CUDA 13 builds
export USE_FBGEMM=0
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang,

View File

@@ -286,7 +286,10 @@ EOF
rm -rf fbgemm
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
# Skip fbgemm for CUDA 13 as it's not compatible yet
if [[ "$BUILD_ENVIRONMENT" != *cuda13* ]]; then
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi
fi
}

View File

@@ -834,8 +834,14 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
# Exclude torchrec_dlrm for CUDA 13 as FBGEMM is not compatible
local extra_args=()
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
extra_args=(--exclude-exact torchrec_dlrm)
fi
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
@@ -847,7 +853,7 @@ test_dynamo_benchmark() {
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "${extra_args[@]}" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
local dt="float32"
@@ -855,17 +861,17 @@ test_dynamo_benchmark() {
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "${extra_args[@]}" "$@"
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
fi
fi
}

View File

@@ -53,6 +53,7 @@ jobs:
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,

View File

@@ -55,3 +55,30 @@ jobs:
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@@ -59,3 +59,37 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
# monitor stays enabled (disable-monitor: false) in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -178,3 +178,98 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-periodically-cuda13:
name: test-periodically-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: test-weekly-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
# The pull_request trigger is used in PR to bump transformers pin which always
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor stays enabled (disable-monitor: false) in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -164,3 +164,89 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-nightly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor stays enabled (disable-monitor: false) in perf tests while its overhead is investigated
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -81,6 +81,56 @@ jobs:
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-build-cuda13:
name: periodic-dynamo-benchmarks-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-test-cuda13:
name: periodic-dynamo-benchmarks-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
@@ -158,6 +208,33 @@ jobs:
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build-cuda13:
name: inductor-smoke-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-smoke-test-cuda13:
name: inductor-smoke-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build-cuda13.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -74,6 +74,36 @@ jobs:
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-cuda13:
name: inductor-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -342,6 +342,31 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-build:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-test:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda13_0-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml

View File

@@ -234,6 +234,16 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
# https://github.com/pytorch/pytorch/pull/55292
string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
# Suppress cuSPARSE deprecation warnings via -DDISABLE_CUSPARSE_DEPRECATED
string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"