Compare commits

...

31 Commits

Author SHA1 Message Date
1885533f68 linting 2025-11-11 17:03:15 +00:00
9abc2cbc8a Merge remote-tracking branch 'upstream/main' into rocm_yml_stress_test 2025-11-11 16:28:26 +00:00
31f63788a5 6 shards of inductor-rocm & pull image first 2025-11-11 16:05:12 +00:00
2875d47e69 9 shards of periodic & pull image first 2025-11-11 16:00:49 +00:00
154038ad9e cancel previous workflows 2025-11-11 15:53:04 +00:00
92428c1752 pull image from registry 2025-11-11 15:51:26 +00:00
39decc5e8f Retry registry with pull-docker-image action 2025-11-10 16:01:33 -05:00
a20053aa5d pull image from registry 2025-11-10 15:52:57 -05:00
18c32b8134 Set fetch-depth for all workflows 2025-11-10 12:58:47 -05:00
ccacb9bd22 Set fetch-depth=1 2025-11-10 12:53:02 -05:00
a2ee8620cd Tag and push image to local registry 2025-11-10 12:41:02 -05:00
0bdbc3d20b fix the mount paths 2025-11-08 11:28:06 +00:00
b71abd02b9 test image caching 2025-11-08 02:39:50 +00:00
3b1874ad45 1 shard 2025-11-07 19:34:31 -05:00
5313cafd73 Checkout pytorch 2025-11-07 18:33:47 -05:00
ef840c9c62 Create _pull-image.yml 2025-11-07 17:55:45 -05:00
9a07df4a39 Update rocm.yml 2025-11-07 17:52:29 -05:00
cb5af99ea2 Add pull image 2025-11-07 17:50:52 -05:00
74dbc7f1d2 check values file change 2025-11-07 16:22:39 +00:00
91b9b08ba1 test docker mirror cache 2025-11-07 14:16:36 +00:00
cced934d98 test mtu & other changes 2025-11-07 02:14:34 +00:00
99b81b4f93 test again wo k8s in parallel 2025-11-05 16:32:44 +00:00
34995954e5 test network robustness 2025-11-04 22:49:06 +00:00
fba6df090e test lower mtu value 2025-10-28 16:25:39 +00:00
4d597ea95a redo testing of 16 core runners 2025-10-23 16:41:02 +00:00
0edb367528 redo testing of 16 core runners 2025-10-23 15:01:51 +00:00
f46983ba46 test new values file with 16 cores for each runner 2025-10-22 17:26:51 +00:00
bdbf8f3e6a test if workflow runs are concurrent or serial 2025-10-22 15:17:14 +00:00
9ec9b9f96e Comment out concurrency 2025-10-22 11:08:26 -04:00
ae3ca09339 check if rocm label will be removed after unapproved author commits to approved authors PR 2025-10-21 16:08:09 +00:00
fd076a784a Increase shards and disable concurrency 2025-10-21 15:58:38 +00:00
5 changed files with 215 additions and 18 deletions

116
.github/workflows/_pull-image.yml vendored Normal file
View File

@ -0,0 +1,116 @@
# Reusable workflow that pulls the CI docker image on a ROCm runner and then
# re-tags and pushes it to the local registry at
# docker-registry.docker-registry.svc.cluster.local:5000.
#
# Only `test-matrix`, `docker-image` and `timeout-minutes` are read by the job
# below; the remaining inputs and the secret are declared but not referenced
# by any step in this file.
name: test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 300
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
      tests-to-include:
        required: false
        type: string
        default: ""
        description: |
          List of tests to include (empty string implies default list)
      dashboard-tag:
        required: false
        type: string
        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
          Currently, by default we disable the monitor job and only look for specific tests,
          since we are investigating the behaviour of the monitor script with different tests.
        required: false
        type: boolean
        default: true
      monitor-log-interval:
        description: |
          Set the interval for the monitor script to log utilization.
        required: false
        type: number
        default: 5
      monitor-data-collect-interval:
        description: |
          Set the interval for the monitor script to collect data.
        required: false
        type: number
        default: 1
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

permissions:
  id-token: write
  contents: read

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      # One job per entry of the `include` list in the caller-supplied matrix.
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    # Matrix entries flagged mem_leak_check get 600 minutes; everything else
    # uses the caller-supplied timeout.
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          fetch-depth: 1
          no-sudo: true

      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Tag & Push image for local registry
        run: |
          # Tag from the exact reference the previous step pulled; the name
          # under the local registry stays keyed on the caller-supplied image.
          docker tag "$PULLED_IMAGE" "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
          docker push "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
          echo "Image pushed successfully to local registry"
        env:
          IMAGE: ${{ inputs.docker-image }}
          PULLED_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}

      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm

View File

@ -88,6 +88,7 @@ jobs:
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
fetch-depth: 1
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
@ -109,9 +110,10 @@ jobs:
docker-image-name: ${{ inputs.docker-image }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
run: |
docker pull "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
env:
IMAGE: ${{ inputs.docker-image }}
- name: Get workflow job id
id: get-job-id

View File

@ -40,8 +40,29 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
{ config: "inductor", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "inductor", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "inductor", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "inductor", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "inductor", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "inductor", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }
]}
secrets: inherit
# Calls the reusable _pull-image.yml workflow on a single mi250 runner.
# The inductor test job lists this job in its `needs`.
linux-jammy-rocm-py3_10-pull-image:
  permissions:
    id-token: write
    contents: read
  name: linux-jammy-rocm-py3.10-pull-image
  uses: ./.github/workflows/_pull-image.yml
  needs:
    - linux-jammy-rocm-py3_10-inductor-build
  with:
    build-environment: linux-jammy-rocm-py3.10
    # Must name a job listed under `needs` above; a job that is not a direct
    # dependency is absent from the `needs` context and yields an empty value.
    docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
    # Passed directly to fromJSON() by _pull-image.yml, so keep it strict JSON
    # (quoted keys, no trailing comma) rather than relying on lenient parsing.
    test-matrix: |
      { "include": [
        { "config": "default", "shard": 1, "num_shards": 1, "runner": "linux.rocm.gpu.mi250.1" }
      ]}
  secrets: inherit
@ -51,7 +72,9 @@ jobs:
contents: read
name: rocm-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-jammy-rocm-py3_10-inductor-build
needs:
- linux-jammy-rocm-py3_10-inductor-build
- linux-jammy-rocm-py3_10-pull-image
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}

View File

@ -61,9 +61,33 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }
]}
secrets: inherit
# Calls the reusable _pull-image.yml workflow on a single mi250 runner.
# The test job that follows lists this job in its `needs`.
linux-jammy-rocm-py3_10-pull-image:
  permissions:
    id-token: write
    contents: read
  # Display name uses "py3.10" to match the build-environment label.
  name: linux-jammy-rocm-py3.10-pull-image
  uses: ./.github/workflows/_pull-image.yml
  needs:
    - linux-jammy-rocm-py3_10-build
    - target-determination
  with:
    build-environment: linux-jammy-rocm-py3.10
    docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
    # Passed directly to fromJSON() by _pull-image.yml, so keep it strict JSON
    # (quoted keys, no trailing comma) rather than relying on lenient parsing.
    test-matrix: |
      { "include": [
        { "config": "default", "shard": 1, "num_shards": 1, "runner": "linux.rocm.gpu.mi250.1" }
      ]}
  secrets: inherit
@ -75,9 +99,10 @@ jobs:
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- linux-jammy-rocm-py3_10-pull-image
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
secrets: inherit

View File

@ -12,9 +12,9 @@ on:
- cron: 0 */3 * * *
concurrency:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
cancel-in-progress: true
permissions: read-all
@ -49,15 +49,45 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 1, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 2, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 3, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 4, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 5, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 6, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 7, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 8, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 9, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 10, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 11, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 12, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 13, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 14, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 15, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 16, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 17, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 18, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
]}
secrets: inherit
# Calls the reusable _pull-image.yml workflow on a single mi250 runner.
# The linux-jammy-rocm-py3_10-test job lists this job in its `needs`.
linux-jammy-rocm-py3_10-pull-image:
  permissions:
    id-token: write
    contents: read
  name: linux-jammy-rocm-py3.10-pull-image
  uses: ./.github/workflows/_pull-image.yml
  needs:
    - linux-jammy-rocm-py3_10-build
    - target-determination
  with:
    build-environment: linux-jammy-rocm-py3.10
    docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
    # Passed directly to fromJSON() by _pull-image.yml, so keep it strict JSON
    # (quoted keys, no trailing comma) rather than relying on lenient parsing.
    test-matrix: |
      { "include": [
        { "config": "default", "shard": 1, "num_shards": 1, "runner": "linux.rocm.gpu.mi250.1" }
      ]}
  secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
@ -66,6 +96,7 @@ jobs:
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- linux-jammy-rocm-py3_10-pull-image
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10