mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
ci: Removing ROCm tests from trunk. (#164585)
Had a conversation with the AMD team today and I think we are all in agreement that the current state of queueing for AMD is beyond where we'd like to be for there to be blocking CI for ROCm. Moving the representative testing jobs for this into the ciflow/rocm workflow. I'd love for these to be back in trunk if we can get to a state where our queueing metrics are below an hour for ROCm infrastructure. Dashboards: * ROCm Queueing (>60mins) ([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A06%3A45.025Z&endDate=2025-10-03T16%3A06%3A45.025Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=linux.rocm.gpu.2&machineTypes=linux.rocm.gpu.4&machineTypes=linux.rocm.gpu.mi250&machineTypes=linux.rocm.gpu.gfx942.1&machineTypes=linux.rocm.gpu.mi250.4&machineTypes=linux.rocm.gpu.gfx942.4&machineTypes=linux.rocm.gpu.mi355.2&machineTypes=linux.rocm.gpu.gfx942.4.test&machineTypes=linux.rocm.gpu.mi250.1&machineTypes=linux.rocm.gpu.gfx942.1.test&machineTypes=linux.rocm.gpu.gfx90a.1&machineTypes=linux.rocm.gpu.gfx90a.4&items=linux.rocm.gpu.2&items=linux.rocm.gpu.4&items=linux.rocm.gpu.mi250&items=linux.rocm.gpu.gfx942.1&items=linux.rocm.gpu.mi250.4&items=linux.rocm.gpu.gfx942.4&items=linux.rocm.gpu.mi355.2&items=linux.rocm.gpu.gfx942.4.test&items=linux.rocm.gpu.mi250.1&items=linux.rocm.gpu.gfx942.1.test&items=linux.rocm.gpu.gfx90a.1&items=linux.rocm.gpu.gfx90a.4)) * NVIDIA queueing (<5mins) ([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A05%3A08.000Z&endDate=2025-10-03T16%3A05%3A08.000Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=lf.linux.g4dn.4xlarge.nvidia.gpu&machineTypes=linux.g4dn.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.metal.nvidia.gpu&machineTypes=linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g4dn.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.4xlarge.nvidia.gpu&machineTypes=linux.g5.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.4xlarge.nvidia.gpu&machineTypes=lf.linux.4xlarge.nvidia.gpu&machineTypes=linux.g6.12xlarge.nvidia.gpu&items=lf.linux.g4dn.4xlarge.nvidia.gpu&items=linux.g4dn.12xlarge.nvidia.gpu&items=linux.g4dn.metal.nvidia.gpu&items=linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g4dn.12xlarge.nvidia.gpu&items=lf.linux.g5.12xlarge.nvidia.gpu&items=lf.linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.4xlarge.nvidia.gpu&items=linux.g5.12xlarge.nvidia.gpu&items=linux.g4dn.4xlarge.nvidia.gpu&items=lf.linux.4xlarge.nvidia.gpu&items=linux.g6.12xlarge.nvidia.gpu)) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164585 Approved by: https://github.com/malfet, https://github.com/yangw-dev, https://github.com/atalman, https://github.com/jeffdaily
This commit is contained in:
committed by
PyTorch MergeBot
parent
f39789cdab
commit
7eb1eb4313
35
.github/workflows/trunk.yml
vendored
35
.github/workflows/trunk.yml
vendored
@ -189,41 +189,6 @@ jobs:
|
||||
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
|
||||
secrets: inherit
|
||||
|
||||
inductor-build:
|
||||
name: inductor-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
Reference in New Issue
Block a user