mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[ROCm] Add diskspace check for rocm CI nodes (#93032)
Fixes #92822 Pull Request resolved: https://github.com/pytorch/pytorch/pull/93032 Approved by: https://github.com/malfet, https://github.com/huydhn
This commit is contained in:
31
.github/actions/diskspace-cleanup/action.yml
vendored
Normal file
31
.github/actions/diskspace-cleanup/action.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
name: Cleans up diskspace

description: >-
  Runs docker system prune when used space on the root filesystem reaches the
  diskspace-cutoff percentage, and fails the step if usage is still above the
  cutoff afterwards.

inputs:
  diskspace-cutoff:
    description: >-
      Used-space percentage of the root filesystem at or above which
      docker system prune is run.
    required: true
    default: 70

runs:
  using: composite
  steps:
    - name: Cleans up diskspace
      shell: bash
      run: |
        # Prints the used-space percentage of / as a bare integer.
        # df prints a header line and then a padded value such as " 45%";
        # keep the second line and strip the padding and the percent sign.
        used_percent() {
          df -H / --output=pcent | sed -n 2p | tr -d ' %'
        }

        diskspace_cutoff=${{ inputs.diskspace-cutoff }}
        diskspace=$(used_percent)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
          # Best effort: the prune may fail, e.g. when another runner on the
          # same host is already pruning. Do not let that end the step here;
          # the re-check below decides whether the runner is healthy.
          docker system prune -af || echo "Warning: docker system prune failed; re-checking diskspace anyway."
          diskspace_new=$(used_percent)
          # Fail only when usage is still strictly above the cutoff.
          if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
            echo "Error: Used diskspace is $diskspace_new percent, which is still above the $diskspace_cutoff percent cutoff after docker prune. Not enough diskspace."
            echo "$msg"
            exit 1
          else
            difference=$((diskspace - diskspace_new))
            echo "Diskspace saved: $difference percent"
          fi
        fi
|
4
.github/actions/setup-rocm/action.yml
vendored
4
.github/actions/setup-rocm/action.yml
vendored
@ -57,6 +57,10 @@ runs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Runner diskspace health check
|
||||
uses: ./.github/actions/diskspace-cleanup
|
||||
if: always()
|
||||
|
||||
- name: Runner health check disconnect on failure
|
||||
if: ${{ failure() }}
|
||||
shell: bash
|
||||
|
13
.github/actions/teardown-rocm/action.yml
vendored
13
.github/actions/teardown-rocm/action.yml
vendored
@ -14,13 +14,6 @@ runs:
|
||||
docker stop $(docker ps -q) || true
|
||||
# Prune all stopped containers.
|
||||
docker container prune -f
|
||||
# Prune everything docker if there are more than 10 images (~200GB).
|
||||
# This is easier than using a time filter, e.g., "until=24h".
|
||||
# Might fail if a prune is already in progress by another runner.
|
||||
image_count=$(docker images | wc -l)
|
||||
if [[ ${image_count} -gt 10 ]]; then
|
||||
echo "Purging all docker caches"
|
||||
docker system prune -af || true
|
||||
else
|
||||
echo "Will not purge docker, only ${image_count} images found"
|
||||
fi
|
||||
- name: Runner diskspace health check
|
||||
uses: ./.github/actions/diskspace-cleanup
|
||||
if: always()
|
||||
|
Reference in New Issue
Block a user