[td] Consistent pytest cache (#113804)

Move the pytest cache download into the build step and store the result in the additional CI files folder so that it stays consistent across shards.
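A minimal sketch of the resulting flow, assuming the helper and constant names from the diff below (`copy_pytest_cache`, `get_previous_failures`, `ADDITIONAL_CI_FILES_FOLDER`, `TD_HEURISTIC_PREVIOUSLY_FAILED`): the build job downloads the merged cache once and snapshots the `lastfailed` file, and every test shard later reads that same snapshot.

```python
# Sketch only: simplified versions of the helpers added in
# tools/stats/import_test_stats.py and previously_failed_in_pr.py.
# REPO_ROOT handling and the folder name here are assumptions.
import json
import pathlib
import shutil
from typing import Set

REPO_ROOT = pathlib.Path(__file__).resolve().parent
ADDITIONAL_CI_FILES_FOLDER = pathlib.Path("additional_ci_files")  # assumed value
TD_HEURISTIC_PREVIOUSLY_FAILED = "previous_failures.json"


def copy_pytest_cache() -> None:
    # Build time, right after the pytest cache download: snapshot the
    # lastfailed file so it ships alongside the other additional CI files.
    original = REPO_ROOT / ".pytest_cache/v/cache/lastfailed"
    if not original.exists():
        return
    (REPO_ROOT / ADDITIONAL_CI_FILES_FOLDER).mkdir(parents=True, exist_ok=True)
    shutil.copyfile(
        original,
        REPO_ROOT / ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PREVIOUSLY_FAILED,
    )


def get_previous_failures() -> Set[str]:
    # Test time, on every shard: read the snapshot instead of a freshly
    # downloaded cache, so all shards rank tests from identical data.
    # (Simplified: the real helper maps failed test ids to test file names.)
    path = REPO_ROOT / ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PREVIOUSLY_FAILED
    if not path.exists():
        return set()
    with open(path) as f:
        return set(json.load(f).keys())
```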

Only the build environment is taken into account now instead of also the test config, since we might not have the test config at build time. This makes the key less specific, but I also think this might be better, since tests are likely to fail the same way across test configs. (I also think it might be worth not even looking at the build environment, but that's a different topic.)
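As a rough illustration of the key layout, the sketch below mirrors `_get_s3_key_prefix` from the diff; the `pytest_cache` prefix value and the example values in the comments are assumptions, and sanitization of the identifiers is omitted.

```python
# Downloads (build step) key only on workflow + build environment, so one
# prefix covers every sha/test config/shard uploaded for the PR; uploads
# (test step) append sha, test config, and shard to keep objects distinct.
def get_s3_key_prefix(
    owner: str,
    name: str,
    pr_identifier: str,
    job_identifier: str,
    sha: str = "",
    test_config: str = "",
    shard: str = "",
) -> str:
    prefix = f"pytest_cache/{owner}/{name}/{pr_identifier}/{job_identifier}"
    for part in (sha, test_config, shard):
        if part:
            prefix += f"/{part}"
    return prefix


# Download prefix (broad), with hypothetical values:
#   pytest_cache/pytorch/pytorch/<pr>/pull_linux-focal-py3_8-gcc7
# Upload key (specific), before the ".zip" extension is appended:
#   pytest_cache/pytorch/pytorch/<pr>/pull_linux-focal-py3_8-gcc7/abc1234/default/1
```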

Each cache upload should only include information from the current run. Do not merge the current cache with the downloaded cache during upload (this shouldn't matter anyway, since the downloaded cache won't exist at that point).
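A hedged sketch of the simplified upload path: `shutil.make_archive` and `boto3` stand in here for the repo's own `zip_folder` and `upload_file_to_s3` helpers, which are what the diff actually uses.

```python
import os
import shutil
from pathlib import Path

import boto3  # stand-in for the repo's S3 upload helper


def upload_current_run_cache(
    cache_dir: Path, temp_dir: Path, bucket: str, obj_key_prefix: str
) -> None:
    # Zip only what this run wrote into cache_dir -- no merge with any
    # previously downloaded cache. The sha/test_config/shard suffix in
    # obj_key_prefix keeps the upload from overwriting other runs' objects.
    zip_base = temp_dir / "zip-upload" / obj_key_prefix
    zip_base.parent.mkdir(parents=True, exist_ok=True)
    zip_file_path = shutil.make_archive(str(zip_base), "zip", root_dir=str(cache_dir))
    obj_key = f"{obj_key_prefix}{os.path.splitext(zip_file_path)[1]}"  # keeps ".zip"
    boto3.client("s3").upload_file(zip_file_path, bucket, obj_key)
```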

From what I can tell of the S3 retention policy, pytest cache files will be deleted after 30 days (cc @ZainRizvi to confirm), so we never have to worry about space or about pulling old versions.
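For reference, that expiration would live in an S3 lifecycle rule rather than in this PR; a 30-day expiration on the cache prefix would look roughly like the following (bucket name and rule are illustrative, not taken from the actual infra).

```python
import boto3

# Illustrative only: expire pytest cache objects after 30 days.
# The real bucket and its policy are managed outside this PR.
boto3.client("s3").put_bucket_lifecycle_configuration(
    Bucket="example-ci-cache-bucket",  # hypothetical bucket name
    LifecycleConfiguration={
        "Rules": [
            {
                "ID": "expire-pytest-cache",
                "Filter": {"Prefix": "pytest_cache/"},
                "Status": "Enabled",
                "Expiration": {"Days": 30},
            }
        ]
    },
)
```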

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113804
Approved by: https://github.com/ZainRizvi
Catherine Lee
2023-11-17 23:45:43 +00:00
committed by PyTorch MergeBot
parent 033d7b670a
commit dab272eed8
11 changed files with 98 additions and 83 deletions

View File

@ -10,6 +10,13 @@ inputs:
description: Shard number for the current job
required: false
default: "0"
sha:
description: SHA for the commit
required: true
test_config:
description: Name of the test config
required: false
default: "default"
job_identifier:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
@ -33,6 +40,8 @@ runs:
env:
CACHE_DIR: ${{ inputs.cache_dir }}
JOB_IDENTIFIER: ${{ inputs.job_identifier }}
SHA: ${{ inputs.sha }}
TEST_CONFIG: ${{ inputs.test_config }}
SHARD: ${{ inputs.shard }}
REPO: ${{ github.repository }}
run: |
@ -41,6 +50,8 @@ runs:
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
--pr_identifier $GITHUB_REF \
--job_identifier $JOB_IDENTIFIER \
--sha $SHA \
--test_config $TEST_CONFIG \
--shard $SHARD \
--repo $REPO \
--temp_dir $RUNNER_TEMP \

View File

@ -38,6 +38,12 @@ def parse_args() -> argparse.Namespace:
required=True,
help="A unique job identifier that should be the same for all runs of job",
)
parser.add_argument(
"--sha", required="--upload" in sys.argv, help="SHA of the commit"
) # Only required for upload
parser.add_argument(
"--test_config", required="--upload" in sys.argv, help="The test config"
) # Only required for upload
parser.add_argument(
"--shard", required="--upload" in sys.argv, help="The shard id"
) # Only required for upload
@ -84,6 +90,8 @@ def main() -> None:
pr_identifier=pr_identifier,
repo=repo,
job_identifier=args.job_identifier,
sha=args.sha,
test_config=args.test_config,
shard=args.shard,
cache_dir=cache_dir,
bucket=args.bucket,

View File

@ -56,6 +56,8 @@ def upload_pytest_cache(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str,
test_config: str,
shard: str,
cache_dir: Path,
temp_dir: Path,
@ -79,25 +81,11 @@ def upload_pytest_cache(
if not bucket:
bucket = BUCKET
# Merge the current cache with any caches from previous runs before uploading
# We only need to merge it with the cache for the same shard (which will have already been downloaded if it exists)
# since the other shards will handle themselves
shard_cache_path = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
)
if shard_cache_path.is_dir():
_merge_pytest_caches(shard_cache_path, cache_dir)
#
# Upload the cache
#
obj_key_prefix = _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
# This doesn't include the zip file extension. That'll get added later
zip_file_path = temp_dir / ZIP_UPLOAD / obj_key_prefix
zip_file_path = zip_folder(cache_dir, zip_file_path)
obj_key_prefix = _get_s3_key_prefix(
pr_identifier, repo, job_identifier, sha, test_config, shard
)
zip_file_path = zip_folder(cache_dir, temp_dir / ZIP_UPLOAD / obj_key_prefix)
obj_key = f"{obj_key_prefix}{os.path.splitext(zip_file_path)[1]}" # Keep the new file extension
upload_file_to_s3(zip_file_path, bucket, obj_key)
@ -136,38 +124,22 @@ def download_pytest_cache(
)
for downloaded_zip in downloads:
# the file name of the zip is the shard id
shard = os.path.splitext(os.path.basename(downloaded_zip))[0]
cache_dir_for_shard = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
# Unzip into random folder, then merge with the current cache
cache_dir_for_shard = (
temp_dir / UNZIPPED_CACHES / os.urandom(16).hex() / PYTEST_CACHE_DIR_NAME
)
unzip_folder(downloaded_zip, cache_dir_for_shard)
print(
f"Merging cache for job_identifier `{job_identifier}`, shard `{shard}` into `{dest_cache_dir}`"
)
print(f"Merging cache from {downloaded_zip}")
_merge_pytest_caches(cache_dir_for_shard, dest_cache_dir)
def _get_temp_cache_dir_path(
temp_dir: Path,
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
shard: str,
) -> Path:
return (
temp_dir
/ UNZIPPED_CACHES
/ _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
/ PYTEST_CACHE_DIR_NAME
)
def _get_s3_key_prefix(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str = "",
test_config: str = "",
shard: str = "",
) -> str:
"""
@ -176,6 +148,10 @@ def _get_s3_key_prefix(
"""
prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{repo.owner}/{repo.name}/{pr_identifier}/{sanitize_for_s3(job_identifier)}"
if sha:
prefix += f"/{sha}"
if test_config:
prefix += f"/{sanitize_for_s3(test_config)}"
if shard:
prefix += f"/{shard}"

View File

@ -118,6 +118,13 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
id: build

View File

@ -130,13 +130,6 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}
- name: Set Test step time
id: test-timeout
shell: bash
@ -256,7 +249,9 @@ jobs:
with:
cache_dir: .pytest_cache
shard: ${{ matrix.shard }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}
sha: ${{ github.event.pull_request.head.sha || github.sha }}
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Print remaining test logs
shell: bash

View File

@ -107,6 +107,13 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
id: build

View File

@ -132,13 +132,6 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}
- name: Test
id: test
shell: bash
@ -190,7 +183,9 @@ jobs:
with:
cache_dir: .pytest_cache
shard: ${{ matrix.shard }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}_${{ github.job }}_${{ matrix.config }}
sha: ${{ github.event.pull_request.head.sha || github.sha }}
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Print remaining test logs
shell: bash

View File

@ -4,6 +4,7 @@ import sys
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
sys.path.append(str(REPO_ROOT))
from tools.stats.import_test_stats import (
copy_pytest_cache,
get_td_heuristic_historial_edited_files_json,
get_td_heuristic_profiling_json,
get_test_class_ratings,
@ -21,6 +22,7 @@ def main() -> None:
get_test_class_ratings()
get_td_heuristic_historial_edited_files_json()
get_td_heuristic_profiling_json()
copy_pytest_cache()
if __name__ == "__main__":

View File

@ -4,6 +4,7 @@ import datetime
import json
import os
import pathlib
import shutil
from typing import Any, Callable, cast, Dict, List, Optional, Union
from urllib.request import urlopen
@ -26,6 +27,7 @@ TEST_FILE_RATINGS_FILE = "test-file-ratings.json"
TEST_CLASS_RATINGS_FILE = "test-class-ratings.json"
TD_HEURISTIC_PROFILING_FILE = "td_heuristic_profiling.json"
TD_HEURISTIC_HISTORICAL_EDITED_FILES = "td_heuristic_historical_edited_files.json"
TD_HEURISTIC_PREVIOUSLY_FAILED = "previous_failures.json"
FILE_CACHE_LIFESPAN_SECONDS = datetime.timedelta(hours=3).seconds
@ -153,6 +155,16 @@ def get_td_heuristic_profiling_json() -> Dict[str, Any]:
)
def copy_pytest_cache() -> None:
original_path = REPO_ROOT / ".pytest_cache/v/cache/lastfailed"
if not original_path.exists():
return
shutil.copyfile(
original_path,
REPO_ROOT / ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PREVIOUSLY_FAILED,
)
def get_from_test_infra_generated_stats(
from_file: str, to_file: str, failure_explanation: str
) -> Dict[str, Any]:

View File

@ -6,10 +6,10 @@ import unittest
from typing import Any, Dict, Set
from unittest import mock
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
try:
# using tools/ to optimize test run.
sys.path.append(str(REPO_ROOT))
sys.path.insert(0, str(REPO_ROOT))
from tools.test.heuristics.heuristics_test_mixin import HeuristicsTestMixin
from tools.testing.target_determination.determinator import (
@ -19,10 +19,11 @@ try:
)
from tools.testing.target_determination.heuristics import HEURISTICS
from tools.testing.target_determination.heuristics.previously_failed_in_pr import (
_get_previously_failing_tests,
get_previous_failures,
)
from tools.testing.test_run import TestRun, TestRuns
sys.path.remove(str(REPO_ROOT))
except ModuleNotFoundError:
print("Can't import required modules, exiting")
sys.exit(1)
@ -36,20 +37,20 @@ def mocked_file(contents: Dict[Any, Any]) -> io.IOBase:
class TestParsePrevTests(HeuristicsTestMixin):
@mock.patch("pathlib.Path.exists", return_value=False)
@mock.patch("os.path.exists", return_value=False)
def test_cache_does_not_exist(self, mock_exists: Any) -> None:
expected_failing_test_files: Set[str] = set()
found_tests = _get_previously_failing_tests()
found_tests = get_previous_failures()
self.assertSetEqual(expected_failing_test_files, found_tests)
@mock.patch("pathlib.Path.exists", return_value=True)
@mock.patch("os.path.exists", return_value=True)
@mock.patch("builtins.open", return_value=mocked_file({"": True}))
def test_empty_cache(self, mock_exists: Any, mock_open: Any) -> None:
expected_failing_test_files: Set[str] = set()
found_tests = _get_previously_failing_tests()
found_tests = get_previous_failures()
self.assertSetEqual(expected_failing_test_files, found_tests)
mock_open.assert_called()
@ -61,19 +62,19 @@ class TestParsePrevTests(HeuristicsTestMixin):
"test/test_bar.py::TestBar::test_fun_copy[25]": True,
}
@mock.patch("pathlib.Path.exists", return_value=True)
@mock.patch("os.path.exists", return_value=True)
@mock.patch(
"builtins.open",
return_value=mocked_file(lastfailed_with_multiple_tests_per_file),
)
def test_dedupes_failing_test_files(self, mock_exists: Any, mock_open: Any) -> None:
expected_failing_test_files = {"test_car", "test_bar", "test_far"}
found_tests = _get_previously_failing_tests()
found_tests = get_previous_failures()
self.assertSetEqual(expected_failing_test_files, found_tests)
@mock.patch(
"tools.testing.target_determination.heuristics.previously_failed_in_pr._get_previously_failing_tests",
"tools.testing.target_determination.heuristics.previously_failed_in_pr.get_previous_failures",
return_value={"test4"},
)
@mock.patch(

View File

@ -1,7 +1,12 @@
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Set
from warnings import warn
from tools.stats.import_test_stats import (
ADDITIONAL_CI_FILES_FOLDER,
TD_HEURISTIC_PREVIOUSLY_FAILED,
)
from tools.testing.target_determination.heuristics.interface import (
HeuristicInterface,
@ -11,6 +16,8 @@ from tools.testing.target_determination.heuristics.utils import (
python_test_file_to_test_name,
)
REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent.parent
class PreviouslyFailedInPR(HeuristicInterface):
def __init__(self, **kwargs: Dict[str, Any]):
@ -19,7 +26,7 @@ class PreviouslyFailedInPR(HeuristicInterface):
def get_test_priorities(self, tests: List[str]) -> TestPrioritizations:
# Tests must always be returned in a deterministic order.
# Otherwise it breaks our test sharding logic
critical_tests = sorted(_get_previously_failing_tests())
critical_tests = sorted(get_previous_failures())
test_rankings = TestPrioritizations(
tests_being_ranked=tests, high_relevance=critical_tests
)
@ -27,25 +34,19 @@ class PreviouslyFailedInPR(HeuristicInterface):
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
critical_tests = _get_previously_failing_tests()
critical_tests = get_previous_failures()
return {test: 1 for test in critical_tests if test in tests}
def _get_previously_failing_tests() -> Set[str]:
PYTEST_FAILED_TESTS_CACHE_FILE_PATH = Path(".pytest_cache/v/cache/lastfailed")
if not PYTEST_FAILED_TESTS_CACHE_FILE_PATH.exists():
warn(
f"No pytorch cache found at {PYTEST_FAILED_TESTS_CACHE_FILE_PATH.absolute()}"
)
def get_previous_failures() -> Set[str]:
path = REPO_ROOT / ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PREVIOUSLY_FAILED
if not os.path.exists(path):
print(f"could not find path {path}")
return set()
with open(PYTEST_FAILED_TESTS_CACHE_FILE_PATH) as f:
last_failed_tests = json.load(f)
prioritized_tests = _parse_prev_failing_test_files(last_failed_tests)
return python_test_file_to_test_name(prioritized_tests)
with open(path) as f:
return python_test_file_to_test_name(
_parse_prev_failing_test_files(json.load(f))
)
def _parse_prev_failing_test_files(last_failed_tests: Dict[str, bool]) -> Set[str]: