[ROCm] Add diskspace check for rocm CI nodes (#93032)

Fixes #92822

Pull Request resolved: https://github.com/pytorch/pytorch/pull/93032
Approved by: https://github.com/malfet, https://github.com/huydhn
This commit is contained in:
amdfaa
2023-02-03 22:38:57 +00:00
committed by PyTorch MergeBot
parent ef156f9136
commit 6d597c532e
3 changed files with 38 additions and 10 deletions

View File

@ -0,0 +1,31 @@
name: Cleans up diskspace
description: >-
  Runs "docker system prune" when the used space on the root filesystem
  reaches the diskspace-cutoff percentage, and fails the step if usage is
  still above the cutoff afterwards.
inputs:
  diskspace-cutoff:
    description: The percent amount after which docker prune is run.
    # Optional in practice: callers that omit it get the default below.
    required: false
    default: "70"
runs:
  using: composite
  steps:
    - name: Cleans up diskspace
      shell: bash
      env:
        # Pass the input through the environment instead of splicing it into
        # the script text, so its value can never be parsed as shell code.
        DISKSPACE_CUTOFF: ${{ inputs.diskspace-cutoff }}
      run: |
        diskspace_cutoff="${DISKSPACE_CUTOFF}"
        # [[ -ge ]] evaluates its operands arithmetically, so reject anything
        # that is not a plain non-negative integer before comparing.
        if ! [[ "$diskspace_cutoff" =~ ^[0-9]+$ ]] ; then
          echo "Error: diskspace-cutoff must be a whole number of percent, got '$diskspace_cutoff'."
          exit 1
        fi

        # Prints the used-space percentage of / as a bare integer:
        # second line of df output (first is the header), blanks and % removed.
        used_percent() {
          df --output=pcent / | sed -n 2p | tr -d ' %'
        }

        diskspace=$(used_percent)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
          # The prune might fail if one is already in progress by another
          # runner; do not abort here, the re-check below decides the outcome.
          docker system prune -af || echo "Warning: docker system prune failed; re-checking diskspace anyway."
          diskspace_new=$(used_percent)
          if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
            echo "Error: Used diskspace is still above $diskspace_cutoff percent after cleanup. Not enough diskspace."
            echo "$msg"
            exit 1
          else
            difference=$((diskspace - diskspace_new))
            echo "Diskspace saved: $difference percent"
          fi
        fi

View File

@ -57,6 +57,10 @@ runs:
exit 1
fi
# Invokes the repository-local diskspace-cleanup action.
# always() keeps this step running even when an earlier step has failed.
- name: Runner diskspace health check
uses: ./.github/actions/diskspace-cleanup
if: always()
- name: Runner health check disconnect on failure
if: ${{ failure() }}
shell: bash

View File

@ -14,13 +14,6 @@ runs:
docker stop $(docker ps -q) || true
# Prune all stopped containers.
docker container prune -f
# Prune everything docker if there are more than 10 images (~200GB).
# This is easier than using a time filter, e.g., "until=24h".
# Might fail if a prune is already in progress by another runner.
image_count=$(docker images | wc -l)
if [[ ${image_count} -gt 10 ]]; then
echo "Purging all docker caches"
docker system prune -af || true
else
echo "Will not purge docker, only ${image_count} images found"
fi
# Invokes the repository-local diskspace-cleanup action after the container cleanup above.
# always() keeps this step running even when an earlier step has failed.
- name: Runner diskspace health check
uses: ./.github/actions/diskspace-cleanup
if: always()