mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-20 21:14:14 +08:00 
			
		
		
		
	Enable MI355X PyTorch CI testing. (#158889)
This PR consists of all the changes required to enable PyTorch ROCm CI on MI355X nodes. - Rework the aotriton cmake configuration to rely on `HIP_VERSION` instead of `ROCM_VERSION`, as aotriton depends on HIP. HIP loosely tracks the ROCm major version, but the two are not actually synchronized, as observed in the ROCm 7 alpha build. - Bump the composable-kernel submodule to [df6023e305f389bbf7249b0c4414e649f3ad6598](df6023e305) for MI350 compatibility. - Extend the "Change permissions" docker step to the MI355X runners as well. This step applies the required permission change to the test folder so that artifacts can be uploaded successfully in k8s docker. - Create a new rocm-mi355 workflow to trigger core PyTorch tests on a nightly basis at about 2:30 am PDT (cron `30 9 * * *`, i.e. 09:30 UTC). - Successfully tested running the test suites listed in rocm-mi355.yml on MI355 runners by temporarily hacking rocm-mi300.yml: ca7d5fae11 (rocm-mi300) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158889 Approved by: https://github.com/jeffdaily
This commit is contained in:
		
				
					committed by
					
						 PyTorch MergeBot
						PyTorch MergeBot
					
				
			
			
				
	
			
			
			
						parent
						
							d8425e9c75
						
					
				
				
					commit
					5619bf9971
				
			
							
								
								
									
										4
									
								
								.github/workflows/_rocm-test.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/_rocm-test.yml
									
									
									
									
										vendored
									
									
								
							| @ -269,8 +269,8 @@ jobs: | |||||||
|           # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct |           # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct | ||||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" |           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" | ||||||
|  |  | ||||||
|       - name: Change permissions (only needed for MI300 runners for now) |       - name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now) | ||||||
|         if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }} |         if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }} | ||||||
|         run: | |         run: | | ||||||
|           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" |           docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" | ||||||
|  |  | ||||||
|  | |||||||
							
								
								
									
										68
									
								
								.github/workflows/rocm-mi355.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										68
									
								
								.github/workflows/rocm-mi355.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,68 @@ | |||||||
|  | name: rocm-mi355 | ||||||
|  |  | ||||||
|  | on: | ||||||
|  |   workflow_dispatch: | ||||||
|  |   schedule: | ||||||
|  |     - cron: 30 9 * * *  # about 2:30am PDT | ||||||
|  |  | ||||||
|  | concurrency: | ||||||
|  |   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} | ||||||
|  |   cancel-in-progress: true | ||||||
|  |  | ||||||
|  | permissions: read-all | ||||||
|  |  | ||||||
|  | jobs: | ||||||
|  |   target-determination: | ||||||
|  |     if: github.repository_owner == 'pytorch' | ||||||
|  |     name: before-test | ||||||
|  |     uses: ./.github/workflows/target_determination.yml | ||||||
|  |     permissions: | ||||||
|  |       id-token: write | ||||||
|  |       contents: read | ||||||
|  |  | ||||||
|  |   get-label-type: | ||||||
|  |     name: get-label-type | ||||||
|  |     uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main | ||||||
|  |     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||||
|  |     with: | ||||||
|  |       triggering_actor: ${{ github.triggering_actor }} | ||||||
|  |       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||||
|  |       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||||
|  |       curr_ref_type: ${{ github.ref_type }} | ||||||
|  |  | ||||||
|  |   linux-noble-rocm-py3_12-build: | ||||||
|  |     if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} | ||||||
|  |     name: linux-noble-rocm-py3.12-mi355 | ||||||
|  |     uses: ./.github/workflows/_linux-build.yml | ||||||
|  |     needs: get-label-type | ||||||
|  |     with: | ||||||
|  |       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||||
|  |       build-environment: linux-noble-rocm-py3.12-mi355 | ||||||
|  |       docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3 | ||||||
|  |       sync-tag: rocm-build | ||||||
|  |       test-matrix: | | ||||||
|  |         { include: [ | ||||||
|  |           { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |           { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |           { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |           { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |           { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |           { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, | ||||||
|  |         ]} | ||||||
|  |     secrets: inherit | ||||||
|  |  | ||||||
|  |   linux-noble-rocm-py3_12-test: | ||||||
|  |     permissions: | ||||||
|  |       id-token: write | ||||||
|  |       contents: read | ||||||
|  |     name: linux-noble-rocm-py3.12-mi355 | ||||||
|  |     uses: ./.github/workflows/_rocm-test.yml | ||||||
|  |     needs: | ||||||
|  |       - linux-noble-rocm-py3_12-build | ||||||
|  |       - target-determination | ||||||
|  |     with: | ||||||
|  |       build-environment: linux-noble-rocm-py3.12-mi355 | ||||||
|  |       docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} | ||||||
|  |       test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} | ||||||
|  |       tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" | ||||||
|  |     secrets: inherit | ||||||
							
								
								
									
										1
									
								
								.github/workflows/upload-test-stats.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.github/workflows/upload-test-stats.yml
									
									
									
									
										vendored
									
									
								
							| @ -14,6 +14,7 @@ on: | |||||||
|       - inductor-periodic |       - inductor-periodic | ||||||
|       - rocm |       - rocm | ||||||
|       - rocm-mi300 |       - rocm-mi300 | ||||||
|  |       - rocm-mi355 | ||||||
|       - inductor-micro-benchmark |       - inductor-micro-benchmark | ||||||
|       - inductor-micro-benchmark-x86 |       - inductor-micro-benchmark-x86 | ||||||
|       - inductor-cu124 |       - inductor-cu124 | ||||||
|  | |||||||
							
								
								
									
										5
									
								
								cmake/External/aotriton.cmake
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								cmake/External/aotriton.cmake
									
									
									
									
										vendored
									
									
								
							| @ -13,17 +13,20 @@ if(NOT __AOTRITON_INCLUDED) | |||||||
|   set(__AOTRITON_MANYLINUX_LIST |   set(__AOTRITON_MANYLINUX_LIST | ||||||
|       "manylinux_2_28"  # rocm6.3 |       "manylinux_2_28"  # rocm6.3 | ||||||
|       "manylinux_2_28"  # rocm6.4 |       "manylinux_2_28"  # rocm6.4 | ||||||
|  |       "manylinux_2_28"  # rocm6.5 | ||||||
|       "manylinux_2_28"  # rocm7.0 |       "manylinux_2_28"  # rocm7.0 | ||||||
|       ) |       ) | ||||||
|   set(__AOTRITON_ROCM_LIST |   set(__AOTRITON_ROCM_LIST | ||||||
|       "rocm6.3" |       "rocm6.3" | ||||||
|       "rocm6.4" |       "rocm6.4" | ||||||
|  |       "rocm6.5" | ||||||
|       "rocm7.0" |       "rocm7.0" | ||||||
|       ) |       ) | ||||||
|   set(__AOTRITON_CI_COMMIT "6fca155f4deeb8d9529326f7b69f350aeeb93477") |   set(__AOTRITON_CI_COMMIT "6fca155f4deeb8d9529326f7b69f350aeeb93477") | ||||||
|   set(__AOTRITON_SHA256_LIST |   set(__AOTRITON_SHA256_LIST | ||||||
|       "861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe"  # rocm6.3 |       "861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe"  # rocm6.3 | ||||||
|       "acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78"  # rocm6.4 |       "acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78"  # rocm6.4 | ||||||
|  |       "7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838"  # rocm6.5 | ||||||
|       "1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b"  # rocm7.0 |       "1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b"  # rocm7.0 | ||||||
|       ) |       ) | ||||||
|   set(__AOTRITON_Z "gz") |   set(__AOTRITON_Z "gz") | ||||||
| @ -59,7 +62,7 @@ if(NOT __AOTRITON_INCLUDED) | |||||||
|     add_dependencies(__caffe2_aotriton aotriton_external) |     add_dependencies(__caffe2_aotriton aotriton_external) | ||||||
|     message(STATUS "Using AOTriton compiled from source directory ${__AOTRITON_EXTERN_PREFIX}") |     message(STATUS "Using AOTriton compiled from source directory ${__AOTRITON_EXTERN_PREFIX}") | ||||||
|   else() |   else() | ||||||
|     set(__AOTRITON_SYSTEM_ROCM "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}") |     set(__AOTRITON_SYSTEM_ROCM "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}") | ||||||
|     list(GET __AOTRITON_ROCM_LIST 0 __AOTRITON_ROCM_DEFAULT_STR) |     list(GET __AOTRITON_ROCM_LIST 0 __AOTRITON_ROCM_DEFAULT_STR) | ||||||
|     # Initialize __AOTRITON_ROCM to lowest version, in case all builds > system's ROCM |     # Initialize __AOTRITON_ROCM to lowest version, in case all builds > system's ROCM | ||||||
|     string(SUBSTRING ${__AOTRITON_ROCM_DEFAULT_STR} 4 -1 __AOTRITON_ROCM) |     string(SUBSTRING ${__AOTRITON_ROCM_DEFAULT_STR} 4 -1 __AOTRITON_ROCM) | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								third_party/composable_kernel
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
							
						
						
									
										2
									
								
								third_party/composable_kernel
									
									
									
									
										vendored
									
									
								
							 Submodule third_party/composable_kernel updated: 434d19f696...df6023e305
									
								
							
		Reference in New Issue
	
	Block a user