Enable manywheel build and smoke test on main branch for ROCm (#153287)

Fixes issue of not discovering breakage of ROCm wheel builds until the nightly job runs e.g. https://github.com/pytorch/pytorch/pull/153253

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153287
Approved by: https://github.com/jeffdaily
This commit is contained in:
Jithun Nair
2025-06-14 19:14:31 +00:00
committed by PyTorch MergeBot
parent 5285d10243
commit 794ef6c9b8
6 changed files with 109 additions and 16 deletions

View File

@ -49,6 +49,7 @@ self-hosted-runner:
# Organization-wide AMD-hosted runners
# MI2xx runners
- linux.rocm.gpu
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# MI300 runners

View File

@ -152,7 +152,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.6", "12.8", "12.9"],
arches=["12.6", "12.8", "12.9", "6.4"],
python_versions=["3.9"],
),
branches="main",

View File

@ -171,7 +171,7 @@ jobs:
- name: Teardown XPU
uses: ./.github/actions/teardown-xpu
{%- else %}
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
steps:

View File

@ -274,7 +274,7 @@ jobs:
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -388,7 +388,7 @@ jobs:
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch

View File

@ -182,3 +182,95 @@ jobs:
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-rocm6_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: 6.4
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-rocm6_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: 6.4
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: manywheel-py3_9-rocm6_4
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: manylinux2_28-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm

View File

@ -345,7 +345,7 @@ jobs:
needs:
- manywheel-py3_9-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -459,7 +459,7 @@ jobs:
needs:
- manywheel-py3_9-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -958,7 +958,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1072,7 +1072,7 @@ jobs:
needs:
- manywheel-py3_10-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1639,7 +1639,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -1753,7 +1753,7 @@ jobs:
needs:
- manywheel-py3_11-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2252,7 +2252,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2366,7 +2366,7 @@ jobs:
needs:
- manywheel-py3_12-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2865,7 +2865,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -2979,7 +2979,7 @@ jobs:
needs:
- manywheel-py3_13-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3478,7 +3478,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_3-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
@ -3592,7 +3592,7 @@ jobs:
needs:
- manywheel-py3_13t-rocm6_4-build
- get-label-type
runs-on: linux.rocm.gpu
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch