mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Enable MI355X PyTorch CI testing. (#158889)
This PR consists of all the changes required to enable PyTorch ROCm CI on MI355X nodes. - Rework the aotriton CMake configuration to rely on `HIP_VERSION` instead of `ROCM_VERSION`, as aotriton depends on HIP. HIP loosely tracks the ROCm major version, but the two are not actually synchronized, as observed in the ROCm 7 alpha build. - Bump the composable-kernel submodule to [df6023e305f389bbf7249b0c4414e649f3ad6598](df6023e305) for MI350 compatibility. - Extend the "Change permissions" docker step to the MI355X runners as well. This step applies the required permission change to the test folder so that artifacts can be uploaded successfully in k8s docker. - Create a new rocm-mi355 workflow to trigger core PyTorch tests on a nightly basis at about 2:30 am PDT (09:30 UTC). - Successfully tested running the test suites listed in rocm-mi355.yml on MI355 runners by temporarily hacking rocm-mi300.yml: ca7d5fae11 (rocm-mi300) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158889 Approved by: https://github.com/jeffdaily
This commit is contained in:
committed by
PyTorch MergeBot
parent
d8425e9c75
commit
5619bf9971
4
.github/workflows/_rocm-test.yml
vendored
4
.github/workflows/_rocm-test.yml
vendored
@ -269,8 +269,8 @@ jobs:
|
|||||||
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
|
# copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
|
||||||
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
|
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
|
||||||
|
|
||||||
- name: Change permissions (only needed for MI300 runners for now)
|
- name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now)
|
||||||
if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
|
if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }}
|
||||||
run: |
|
run: |
|
||||||
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
|
docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
|
||||||
|
|
||||||
|
|||||||
68
.github/workflows/rocm-mi355.yml
vendored
Normal file
68
.github/workflows/rocm-mi355.yml
vendored
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
name: rocm-mi355
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
schedule:
|
||||||
|
- cron: 30 9 * * * # about 2:30am PDT
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions: read-all
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
target-determination:
|
||||||
|
if: github.repository_owner == 'pytorch'
|
||||||
|
name: before-test
|
||||||
|
uses: ./.github/workflows/target_determination.yml
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
get-label-type:
|
||||||
|
name: get-label-type
|
||||||
|
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||||
|
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||||
|
with:
|
||||||
|
triggering_actor: ${{ github.triggering_actor }}
|
||||||
|
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||||
|
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||||
|
curr_ref_type: ${{ github.ref_type }}
|
||||||
|
|
||||||
|
linux-noble-rocm-py3_12-build:
|
||||||
|
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||||
|
name: linux-noble-rocm-py3.12-mi355
|
||||||
|
uses: ./.github/workflows/_linux-build.yml
|
||||||
|
needs: get-label-type
|
||||||
|
with:
|
||||||
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
build-environment: linux-noble-rocm-py3.12-mi355
|
||||||
|
docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
|
||||||
|
sync-tag: rocm-build
|
||||||
|
test-matrix: |
|
||||||
|
{ include: [
|
||||||
|
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
|
||||||
|
]}
|
||||||
|
secrets: inherit
|
||||||
|
|
||||||
|
linux-noble-rocm-py3_12-test:
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
contents: read
|
||||||
|
name: linux-noble-rocm-py3.12-mi355
|
||||||
|
uses: ./.github/workflows/_rocm-test.yml
|
||||||
|
needs:
|
||||||
|
- linux-noble-rocm-py3_12-build
|
||||||
|
- target-determination
|
||||||
|
with:
|
||||||
|
build-environment: linux-noble-rocm-py3.12-mi355
|
||||||
|
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
|
||||||
|
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
|
||||||
|
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||||
|
secrets: inherit
|
||||||
1
.github/workflows/upload-test-stats.yml
vendored
1
.github/workflows/upload-test-stats.yml
vendored
@ -14,6 +14,7 @@ on:
|
|||||||
- inductor-periodic
|
- inductor-periodic
|
||||||
- rocm
|
- rocm
|
||||||
- rocm-mi300
|
- rocm-mi300
|
||||||
|
- rocm-mi355
|
||||||
- inductor-micro-benchmark
|
- inductor-micro-benchmark
|
||||||
- inductor-micro-benchmark-x86
|
- inductor-micro-benchmark-x86
|
||||||
- inductor-cu124
|
- inductor-cu124
|
||||||
|
|||||||
5
cmake/External/aotriton.cmake
vendored
5
cmake/External/aotriton.cmake
vendored
@ -13,17 +13,20 @@ if(NOT __AOTRITON_INCLUDED)
|
|||||||
set(__AOTRITON_MANYLINUX_LIST
|
set(__AOTRITON_MANYLINUX_LIST
|
||||||
"manylinux_2_28" # rocm6.3
|
"manylinux_2_28" # rocm6.3
|
||||||
"manylinux_2_28" # rocm6.4
|
"manylinux_2_28" # rocm6.4
|
||||||
|
"manylinux_2_28" # rocm6.5
|
||||||
"manylinux_2_28" # rocm7.0
|
"manylinux_2_28" # rocm7.0
|
||||||
)
|
)
|
||||||
set(__AOTRITON_ROCM_LIST
|
set(__AOTRITON_ROCM_LIST
|
||||||
"rocm6.3"
|
"rocm6.3"
|
||||||
"rocm6.4"
|
"rocm6.4"
|
||||||
|
"rocm6.5"
|
||||||
"rocm7.0"
|
"rocm7.0"
|
||||||
)
|
)
|
||||||
set(__AOTRITON_CI_COMMIT "6fca155f4deeb8d9529326f7b69f350aeeb93477")
|
set(__AOTRITON_CI_COMMIT "6fca155f4deeb8d9529326f7b69f350aeeb93477")
|
||||||
set(__AOTRITON_SHA256_LIST
|
set(__AOTRITON_SHA256_LIST
|
||||||
"861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe" # rocm6.3
|
"861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe" # rocm6.3
|
||||||
"acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78" # rocm6.4
|
"acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78" # rocm6.4
|
||||||
|
"7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838" # rocm6.5
|
||||||
"1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b" # rocm7.0
|
"1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b" # rocm7.0
|
||||||
)
|
)
|
||||||
set(__AOTRITON_Z "gz")
|
set(__AOTRITON_Z "gz")
|
||||||
@ -59,7 +62,7 @@ if(NOT __AOTRITON_INCLUDED)
|
|||||||
add_dependencies(__caffe2_aotriton aotriton_external)
|
add_dependencies(__caffe2_aotriton aotriton_external)
|
||||||
message(STATUS "Using AOTriton compiled from source directory ${__AOTRITON_EXTERN_PREFIX}")
|
message(STATUS "Using AOTriton compiled from source directory ${__AOTRITON_EXTERN_PREFIX}")
|
||||||
else()
|
else()
|
||||||
set(__AOTRITON_SYSTEM_ROCM "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}")
|
set(__AOTRITON_SYSTEM_ROCM "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}")
|
||||||
list(GET __AOTRITON_ROCM_LIST 0 __AOTRITON_ROCM_DEFAULT_STR)
|
list(GET __AOTRITON_ROCM_LIST 0 __AOTRITON_ROCM_DEFAULT_STR)
|
||||||
# Initialize __AOTRITON_ROCM to lowest version, in case all builds > system's ROCM
|
# Initialize __AOTRITON_ROCM to lowest version, in case all builds > system's ROCM
|
||||||
string(SUBSTRING ${__AOTRITON_ROCM_DEFAULT_STR} 4 -1 __AOTRITON_ROCM)
|
string(SUBSTRING ${__AOTRITON_ROCM_DEFAULT_STR} 4 -1 __AOTRITON_ROCM)
|
||||||
|
|||||||
2
third_party/composable_kernel
vendored
2
third_party/composable_kernel
vendored
Submodule third_party/composable_kernel updated: 434d19f696...df6023e305
Reference in New Issue
Block a user