linting

Merge remote-tracking branch 'upstream/main' into rocm_yml_stress_test
6 shards of inductor-rocm & pull image first
2025-11-17 16:46:31 +08:00 · 2025-11-11 17:03:15 +00:00 · 2025-11-11 16:28:26 +00:00 · 2025-11-11 16:05:12 +00:00 · 2025-11-11 16:00:49 +00:00 · 2025-11-11 15:53:04 +00:00
5 changed files with 215 additions and 18 deletions
--- a/.github/workflows/_pull-image.yml
+++ b/.github/workflows/_pull-image.yml
@ -0,0 +1,116 @@
+# Reusable workflow: pulls the CI docker image on the runner(s) named in the
+# test matrix and pushes it to the cluster-local docker registry.
+# TODO: the checkout/setup steps mirror _rocm-test; consider sharing them.
+
+name: pull-image
+
+# Only `test-matrix`, `docker-image` and `timeout-minutes` are read by the job
+# in this file; the remaining inputs are declared but not referenced here.
+on:
+  workflow_call:
+    inputs:
+      build-environment:
+        description: Top-level label for what's being built/tested.
+        type: string
+        required: true
+      test-matrix:
+        description: JSON description of what test configs to run.
+        type: string
+        required: true
+      docker-image:
+        description: Docker image to run in.
+        type: string
+        required: true
+      sync-tag:
+        description: |
+          If this is set, our linter will use this to make sure that every other
+          job with the same `sync-tag` is identical.
+        type: string
+        required: false
+        default: ""
+      timeout-minutes:
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
+        type: number
+        required: false
+        default: 300
+      tests-to-include:
+        description: |
+          List of tests to include (empty string implies default list)
+        type: string
+        required: false
+        default: ""
+      dashboard-tag:
+        type: string
+        required: false
+        default: ""
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        type: boolean
+        required: false
+        default: true
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        type: number
+        required: false
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        type: number
+        required: false
+        default: 1
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
+        required: false
+env:
+  # Workflow-level variable, set from the triggering repository's default branch.
+  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+
+# Token scopes requested for the jobs in this workflow.
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  # Pulls the CI image once per matrix entry and republishes it to the
+  # cluster-local registry under the name passed in `inputs.docker-image`.
+  test:
+    # Don't run on forked repos or empty test matrix
+    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
+    strategy:
+      matrix: ${{ fromJSON(inputs.test-matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          fetch-depth: 1
+          no-sudo: true
+
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ inputs.docker-image }}
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Tag & Push image for local registry
+        run: |
+          # Tag the image that the previous step actually pulled, so this step
+          # works even if the calculated name differs from the raw input.
+          # The pushed name still uses the raw input.
+          docker tag "$PULLED_IMAGE" "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
+          docker push "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
+          echo "Image pushed successfully to local registry"
+        env:
+          IMAGE: ${{ inputs.docker-image }}
+          PULLED_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Teardown ROCm
+        # Run the teardown even when an earlier step failed.
+        if: always()
+        uses: ./.github/actions/teardown-rocm
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -88,6 +88,7 @@ jobs:
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true
+          fetch-depth: 1

      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
@ -109,9 +110,10 @@ jobs:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+        run: |  # NOTE(review): image is stored only under the local-registry name; confirm later steps reference that name (or retag to "$IMAGE")
+          docker pull "docker-registry.docker-registry.svc.cluster.local:5000/$IMAGE"
+        env:
+          IMAGE: ${{ inputs.docker-image }}

      - name: Get workflow job id
        id: get-job-id
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -40,8 +40,29 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
+          { config: "inductor", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "inductor", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "inductor", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "inductor", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "inductor", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "inductor", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }
+        ]}
+    secrets: inherit
+
+  # Pulls the CI image and publishes it to the local registry before the test job starts.
+  linux-jammy-rocm-py3_10-pull-image:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10-pull-image
+    uses: ./.github/workflows/_pull-image.yml
+    needs:
+      - linux-jammy-rocm-py3_10-inductor-build
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      # The `needs` context only holds jobs listed under `needs:` above, so the
+      # image must come from the inductor build job.
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi250.1" },
        ]}
    secrets: inherit

@ -51,7 +72,9 @@ jobs:
      contents: read
    name: rocm-py3.10-inductor
    uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-jammy-rocm-py3_10-inductor-build
+    needs:
+      - linux-jammy-rocm-py3_10-inductor-build
+      - linux-jammy-rocm-py3_10-pull-image
    with:
      build-environment: linux-jammy-rocm-py3.10
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -61,9 +61,33 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }
+        ]}
+    secrets: inherit
+
+  # Pulls the CI image and publishes it to the local registry before the test job starts.
+  linux-jammy-rocm-py3_10-pull-image:
+    permissions:
+      id-token: write
+      contents: read
+    # Display name uses "py3.10", matching the build-environment value below.
+    name: linux-jammy-rocm-py3.10-pull-image
+    uses: ./.github/workflows/_pull-image.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi250.1" },
        ]}
    secrets: inherit

@ -75,9 +99,10 @@ jobs:
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-jammy-rocm-py3_10-build
+      - linux-jammy-rocm-py3_10-pull-image
      - target-determination
    with:
      build-environment: linux-jammy-rocm-py3.10
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
+    secrets: inherit
--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -12,9 +12,9 @@ on:
    - cron: 0 */3 * * *


+# Runs that resolve to the same group key are subject to cancel-in-progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

 permissions: read-all

@ -49,15 +49,45 @@ jobs:
      sync-tag: rocm-build
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
+          { config: "default", shard: 1, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 2, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 3, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 4, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 5, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 6, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 7, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 8, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 9, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 10, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 11, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 12, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 13, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 14, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 15, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 16, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 17, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
+          { config: "default", shard: 18, num_shards: 18, runner: "linux.rocm.gpu.mi250.1" },
        ]}
    secrets: inherit

+  # Pulls the CI image and publishes it to the local registry before the test job starts.
+  linux-jammy-rocm-py3_10-pull-image:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10-pull-image
+    uses: ./.github/workflows/_pull-image.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi250.1" },
+        ]}
+    secrets: inherit
+
  linux-jammy-rocm-py3_10-test:
    permissions:
      id-token: write
@ -66,6 +96,7 @@ jobs:
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-jammy-rocm-py3_10-build
+      - linux-jammy-rocm-py3_10-pull-image
      - target-determination
    with:
      build-environment: linux-jammy-rocm-py3.10
Author	SHA1	Message	Date
amdfaa	1885533f68	linting	2025-11-11 17:03:15 +00:00
amdfaa	9abc2cbc8a	Merge remote-tracking branch 'upstream/main' into rocm_yml_stress_test	2025-11-11 16:28:26 +00:00
amdfaa	31f63788a5	6 shards of inductor-rocm & pull image first	2025-11-11 16:05:12 +00:00
amdfaa	2875d47e69	9 shards of periodic & pull image first	2025-11-11 16:00:49 +00:00
amdfaa	154038ad9e	cancel previous workflows	2025-11-11 15:53:04 +00:00
amdfaa	92428c1752	pull image from registry	2025-11-11 15:51:26 +00:00
amdfaa	39decc5e8f	Retry registry with pull-docker-image action	2025-11-10 16:01:33 -05:00
amdfaa	a20053aa5d	pull image from registry	2025-11-10 15:52:57 -05:00
amdfaa	18c32b8134	Set fetch-depth for all workflows	2025-11-10 12:58:47 -05:00
amdfaa	ccacb9bd22	Set fetch-depth=1	2025-11-10 12:53:02 -05:00
amdfaa	a2ee8620cd	Tag and push image to local registry	2025-11-10 12:41:02 -05:00
amdfaa	0bdbc3d20b	fix the mount paths	2025-11-08 11:28:06 +00:00
amdfaa	b71abd02b9	test image caching	2025-11-08 02:39:50 +00:00
amdfaa	3b1874ad45	1 shard	2025-11-07 19:34:31 -05:00
amdfaa	5313cafd73	Checkout pytorch	2025-11-07 18:33:47 -05:00
amdfaa	ef840c9c62	Create _pull-image.yml	2025-11-07 17:55:45 -05:00
amdfaa	9a07df4a39	Update rocm.yml	2025-11-07 17:52:29 -05:00
amdfaa	cb5af99ea2	Add pull image	2025-11-07 17:50:52 -05:00
amdfaa	74dbc7f1d2	check values file change	2025-11-07 16:22:39 +00:00
amdfaa	91b9b08ba1	test docker mirror cache	2025-11-07 14:16:36 +00:00
amdfaa	cced934d98	test mtu & other changes	2025-11-07 02:14:34 +00:00
amdfaa	99b81b4f93	test again wo k8s in parallel	2025-11-05 16:32:44 +00:00
amdfaa	34995954e5	test network robustness	2025-11-04 22:49:06 +00:00
amdfaa	fba6df090e	test lower mtu value	2025-10-28 16:25:39 +00:00
amdfaa	4d597ea95a	redo testing of 16 core runners	2025-10-23 16:41:02 +00:00
amdfaa	0edb367528	redo testing of 16 core runners	2025-10-23 15:01:51 +00:00
amdfaa	f46983ba46	test new values file with 16 cores for each runner	2025-10-22 17:26:51 +00:00
amdfaa	bdbf8f3e6a	test if workflow runs are concurrent or serial	2025-10-22 15:17:14 +00:00
amdfaa	9ec9b9f96e	Comment out concurrency	2025-10-22 11:08:26 -04:00
amdfaa	ae3ca09339	check if rocm label will be removed after unapproved author commits to approved authors PR	2025-10-21 16:08:09 +00:00
Jithun Nair	fd076a784a	Increase shards and disable concurrency	2025-10-21 15:58:38 +00:00