mirror of
https://github.com/huggingface/peft.git
synced 2025-10-20 23:43:47 +08:00
This change introduces CI caching for datasets and hub artifacts across runner operating systems, with the goal of minimizing the number of test runs that fail because of network faults. As an additional bonus, it might make the CI a bit faster. The following artifacts are cached: ${HF_HOME}/hub/** Note that we're avoiding .lock files as well as *.pyc files. We're not simply caching $HF_HOME since there are also the datasets and modules, where the former was acting up when testing (no details, just dropped; we may explore this later, but we're not using that many datasets) and the latter is just code, which is probably not a good idea to cache anyway. There is a post-process step for the cache action which uploads new data to the cache - only one runner can access the cache for uploading. This is done because GitHub Actions locks cache creation, so if there is a concurrent cache creation, both may fail. This runner is currently set to ubuntu in the Python 3.10 run. If this modification turns out to be ineffective, we can move to forbidding access to the hub in general (HF_HUB_OFFLINE=1) and updating the cache once per day, but let's first try out whether this is already enough to decrease the failure rate.
68 lines
2.3 KiB
Python
68 lines
2.3 KiB
Python
"""
|
|
Utility to clean cache files that exceed a specific time in days according to their
|
|
last access time recorded in the cache.
|
|
|
|
Exit code:
|
|
- 1 if no candidates are found
|
|
- 0 if candidates are found
|
|
|
|
Deletion can be enabled by passing `-d` parameter, otherwise it will only list the candidates.
|
|
"""
|
|
|
|
import sys
|
|
from datetime import datetime as dt
|
|
|
|
from huggingface_hub import scan_cache_dir
|
|
|
|
|
|
def find_old_revisions(scan_results, max_age_days=30):
    """Collect the commit hashes of cached revisions that have not been accessed recently.

    Every repo in `scan_results.repos` is checked: its age is the number of whole days between
    the current time and the repo's recorded last access time. When that age is strictly greater
    than `max_age_days`, the commit hashes of all revisions of that repo are collected.

    Returns a list of commit hashes, which is empty if no repo is old enough.
    """
    reference_time = dt.now()
    stale_hashes = []

    for repo in scan_results.repos:
        # `last_accessed` is a timestamp; `.days` truncates the elapsed time to whole days.
        idle_days = (reference_time - dt.fromtimestamp(repo.last_accessed)).days
        if idle_days > max_age_days:
            stale_hashes.extend(revision.commit_hash for revision in repo.revisions)

    return stale_hashes
|
|
|
|
|
|
def delete_old_revisions(scan_results, delete_candidates, do_delete=False):
    """Report what removing `delete_candidates` would free and, if requested, remove them.

    The deletion is prepared via `scan_results.delete_revisions` with the candidate commit
    hashes. The expected freed size and the candidates are always printed. The prepared
    operation is only executed when `do_delete` is true; otherwise a hint about the `-d` flag
    is printed and nothing is removed.
    """
    strategy = scan_results.delete_revisions(*delete_candidates)
    print(f"Would free {strategy.expected_freed_size_str}")
    print(f"Candidates: {delete_candidates}")

    # Dry run: listing only.
    if not do_delete:
        print("Not deleting, pass the -d flag.")
        return

    print("Deleting now.")
    strategy.execute()
|
|
|
|
|
|
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command line interface: `-a/--max-age` sets the age threshold in days, `-d/--delete`
    # switches from listing the candidates to actually deleting them.
    cli = ArgumentParser()
    cli.add_argument("-a", "--max-age", type=int, default=30, help="Max. age in days items in the cache may have.")
    cli.add_argument(
        "-d",
        "--delete",
        action="store_true",
        help=(
            "Delete mode; Really delete items if there are candidates. Exit code = 0 when we found something to delete, 1 "
            "otherwise."
        ),
    )
    options = cli.parse_args()

    cache_info = scan_cache_dir()
    stale_hashes = find_old_revisions(cache_info, options.max_age)

    if stale_hashes:
        # Candidates found: report them (and delete if requested); the script then ends normally.
        delete_old_revisions(cache_info, stale_hashes, do_delete=options.delete)
    else:
        # Nothing to clean up is signalled to the caller through exit code 1.
        print("No delete candidates found, not deleting anything.")
        sys.exit(1)
|