Compare commits

...

21 Commits

SHA1 Message Date
17b9217fdd merge conflict resolve add back 2025-11-13 09:04:44 +00:00
016f58bfee revert the driver change and test-infra change to get latest signal other than driver issue 2025-11-13 09:04:44 +00:00
e55d249c85 test fix-ubuntu-distro change for driver installation 2025-11-13 09:04:44 +00:00
fe8b1853cf upgrade driver for aws.a100 runner 2025-11-13 09:04:44 +00:00
f834086934 Revert "use gcc11 and add eager tests as well to reduce turnaround" (reverts commit bf7ca1e0dd66de725f24905c4d192853bde41ac3) 2025-11-13 09:04:44 +00:00
f792123e7e use gcc11 and add eager tests as well to reduce turnaround 2025-11-13 09:04:44 +00:00
7ac589c981 fix lint 2025-11-13 09:04:44 +00:00
7feff0f415 skip torchrec_dlrm as fbgemm is required 2025-11-13 09:04:44 +00:00
4fce979d62 Do not skip torchrec 2025-11-13 09:04:44 +00:00
271dc5807d do not build fbgemm_gpu for test file in cuda13 too 2025-11-13 09:04:44 +00:00
730279bf38 Add disable option in CMakeLists.txt too 2025-11-13 09:04:44 +00:00
43fb73585e Disable fbgemm from build.sh 2025-11-13 09:04:44 +00:00
3d17510e76 fix typo 2025-11-13 09:04:44 +00:00
3fc1029c43 disable fbgemm for h100 nightly 2025-11-13 09:04:44 +00:00
44a8309719 add missing nightly and pull 2025-11-13 09:04:44 +00:00
de77cde1ea Add full coverage inductor tests 2025-11-13 09:04:44 +00:00
d47e117a5b Add CUDA 13 tests to torchbench workflow 2025-11-13 09:04:44 +00:00
55220d98ac Do not build fbgemm for inductor 13.0 2025-11-13 09:04:44 +00:00
dcc76ad961 suppress deprecation warning with -DDISABLE_CUSPARSE_DEPRECATED 2025-11-13 09:04:44 +00:00
3019f60bfb Try suppress the cusparse.h warning 2025-11-13 09:04:44 +00:00
28ac6692fe Inductor 13.0 test 2025-11-13 09:04:44 +00:00
14 changed files with 426 additions and 12 deletions

View File

@@ -136,6 +136,17 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=13.0.0
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12

View File

@@ -36,6 +36,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
nvcc --version
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
# Disable FBGEMM for CUDA 13 builds
export USE_FBGEMM=0
fi
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang,
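Aside: the guard added above is plain Bash glob matching on BUILD_ENVIRONMENT, and PyTorch's build reads USE_FBGEMM from the environment to switch the component off. A minimal sketch of the mechanism, using a hypothetical environment value:

    #!/usr/bin/env bash
    # Hypothetical build-environment name; any value containing "cuda13" matches.
    BUILD_ENVIRONMENT="linux-jammy-cuda13.0-py3.10-gcc9-sm80"

    if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
      # Glob match, not a regex: *cuda13* matches the substring anywhere.
      export USE_FBGEMM=0
    fi

    echo "USE_FBGEMM=${USE_FBGEMM:-1}"   # prints USE_FBGEMM=0 for CUDA 13 builds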

View File

@@ -285,7 +285,10 @@ EOF
rm -rf fbgemm
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
# Skip fbgemm for CUDA 13 as it's not compatible yet
if [[ "$BUILD_ENVIRONMENT" != *cuda13* ]]; then
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi
fi
}
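Note: pip_build_and_install here wraps pip's VCS-URL form, where @<ref> pins a commit and #subdirectory= selects a package nested inside the repository. A standalone equivalent (the commit ref is a placeholder, not a value from this PR):

    # fbgemm_gpu lives in a subdirectory of the FBGEMM repo, hence #subdirectory=.
    pip install "git+https://github.com/pytorch/FBGEMM.git@<fbgemm_commit>#subdirectory=fbgemm_gpu"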

View File

@@ -855,8 +855,14 @@ test_dynamo_benchmark() {
local shard_id="$1"
shift
# Exclude torchrec_dlrm for CUDA 13 as FBGEMM is not compatible
local extra_args=()
if [[ "$BUILD_ENVIRONMENT" == *cuda13* ]]; then
extra_args=(--exclude-exact torchrec_dlrm)
fi
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
@@ -868,7 +874,7 @@
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "${extra_args[@]}" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
local dt="float32"
@@ -876,17 +882,17 @@
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "${extra_args[@]}" "$@"
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "${extra_args[@]}" "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "${extra_args[@]}" "$@"
fi
fi
}
@@ -1780,7 +1786,8 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
else
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
# Skip torchrec/fbgemm for cuda13 as they're not compatible yet
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* && "${BUILD_ENVIRONMENT}" != *cuda13* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
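Aside: the extra_args changes above use a common Bash idiom for optional flags: collect them in an array and splice it with "${extra_args[@]}" ahead of the caller's "$@". An empty array expands to zero words, so every non-CUDA-13 invocation is unchanged. A minimal sketch with a hypothetical run_benchmark stand-in for test_single_dynamo_benchmark:

    #!/usr/bin/env bash
    # Hypothetical stand-in that just echoes the arguments it receives.
    run_benchmark() { echo "run_benchmark called with: $*"; }

    extra_args=()
    if [[ "${BUILD_ENVIRONMENT:-}" == *cuda13* ]]; then
      extra_args=(--exclude-exact torchrec_dlrm)
    fi

    # "${extra_args[@]}" expands to nothing when the array is empty.
    run_benchmark --inference --bfloat16 "${extra_args[@]}" "$@"

One caveat: under set -u, expanding an empty array errors on Bash older than 4.4.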

View File

@@ -53,6 +53,7 @@ jobs:
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,

View File

@@ -55,3 +55,30 @@ jobs:
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
secrets: inherit

View File

@@ -59,3 +59,37 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
# keep the monitor enabled in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -178,3 +178,98 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA_ARCH 9.0 is only built for H100
# or newer GPUs, so it doesn't benefit much from existing compiler cache
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_h100", shard: 1, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 2, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" },
{ config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-periodically-cuda13:
name: test-periodically-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '15 0,12 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: test-weekly-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
# The pull_request trigger is used by PRs that bump the transformers pin,
# which always need one round of benchmarks
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
# keep the monitor enabled in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -164,3 +164,89 @@ jobs:
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
build-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf", shard: 1, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 2, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 3, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 4, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_huggingface_perf", shard: 5, num_shards: 5, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_timm_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 1, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 2, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 3, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 4, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 5, num_shards: 6, runner: "linux.aws.a100" },
{ config: "inductor_torchbench_perf", shard: 6, num_shards: 6, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 1, num_shards: 2, runner: "linux.aws.a100" },
{ config: "cachebench", shard: 2, num_shards: 2, runner: "linux.aws.a100" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio torchao"
secrets: inherit
test-nightly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 1440
# monitor is enabled in perf tests
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-cuda13:
name: cuda13.0-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build-cuda13
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.build-cuda13.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@@ -81,6 +81,56 @@ jobs:
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-build-cuda13:
name: periodic-dynamo-benchmarks-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
periodic-dynamo-benchmarks-test-cuda13:
name: periodic-dynamo-benchmarks-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build-cuda13.outputs.test-matrix }}
secrets: inherit
rocm-periodic-dynamo-benchmarks-build:
if: github.repository_owner == 'pytorch'
name: rocm-periodic-dynamo-benchmarks-build
@@ -158,6 +208,33 @@ jobs:
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit
inductor-smoke-build-cuda13:
name: inductor-smoke-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-smoke-test-cuda13:
name: inductor-smoke-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build-cuda13.outputs.test-matrix }}
secrets: inherit
periodic-dynamo-benchmarks-cpu-build:
name: periodic-dynamo-benchmarks-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -74,6 +74,36 @@ jobs:
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit
inductor-test-cuda13:
name: inductor-test-cuda13
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build-cuda13
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build-cuda13.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build-cuda13.outputs.test-matrix }}
secrets: inherit
inductor-cpu-build:
name: inductor-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@@ -342,8 +342,33 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
linux-jammy-cuda13_0-py3_10-gcc9-inductor-build:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc9-inductor-test:
name: cuda13.0-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda13_0-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda13.0-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:

View File

@@ -235,6 +235,16 @@ jobs:
cuda-arch-list: '8.0'
secrets: inherit
inductor-build-cuda13:
name: inductor-build-cuda13
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda13.0-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test

View File

@@ -1394,6 +1394,9 @@ if(NOT INTERN_BUILD_MOBILE)
# https://github.com/pytorch/pytorch/pull/55292
string(APPEND CMAKE_CUDA_FLAGS " -DCUB_WRAPPED_NAMESPACE=at_cuda_detail")
# Suppress cusparse warnings
string(APPEND CMAKE_CUDA_FLAGS " -DDISABLE_CUSPARSE_DEPRECATED")
message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor")
string(APPEND CMAKE_CUDA_FLAGS " -DCUDA_HAS_FP16=1"
" -D__CUDA_NO_HALF_OPERATORS__"