[experiment][TD] Rating number system (#112676)

Emits an excessive amount of heuristic info, but that just means I can do more with it later?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112676
Approved by: https://github.com/ZainRizvi
Author: Catherine Lee
Date: 2023-11-07 19:40:07 +00:00
Committed by: PyTorch MergeBot
Parent: 82875e69fe
Commit: 0c448526a4
10 changed files with 114 additions and 11 deletions

View File

@@ -47,6 +47,7 @@ from tools.stats.import_test_stats import (
from tools.stats.upload_metrics import add_global_metric, emit_metric
from tools.testing.target_determination.determinator import (
AggregatedHeuristics,
get_prediction_confidences,
get_test_prioritizations,
)
@@ -1806,7 +1807,17 @@ def main():
test_stats["num_total_tests"] = num_tests
print_to_stderr("Emiting td_test_failure_stats")
emit_metric("td_test_failure_stats", test_stats)
emit_metric(
"td_test_failure_stats",
{
**test_stats,
"confidence_ratings": get_prediction_confidences(
selected_tests
),
"failure": str(test),
"tests": selected_tests,
},
)
if len(all_failures):
for _, err in all_failures:
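
For reference, a rough sketch of the shape the expanded td_test_failure_stats payload takes after this hunk. The heuristic and test names below are made up, and the full contents of test_stats come from code outside this diff:

# Hypothetical payload passed to emit_metric("td_test_failure_stats", ...)
{
    "num_total_tests": 1234,  # from test_stats (other test_stats keys omitted)
    "confidence_ratings": {
        "EditedByPR": {"test/test_foo.py": 1},
        "Profiling": {"test/test_foo.py": 0.4, "test/test_bar.py": 1.0},
    },
    "failure": "test/test_foo.py",  # str(test), presumably the failing test
    "tests": ["test/test_foo.py", "test/test_bar.py"],  # selected_tests
}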

View File

@@ -167,4 +167,14 @@ def emit_metric(
def _convert_float_values_to_decimals(data: Dict[str, Any]) -> Dict[str, Any]:
return {k: Decimal(str(v)) if isinstance(v, float) else v for k, v in data.items()}
# Attempt to recurse
def _helper(o: Any) -> Any:
if isinstance(o, float):
return Decimal(str(o))
if isinstance(o, list):
return [_helper(v) for v in o]
if isinstance(o, dict):
return {_helper(k): _helper(v) for k, v in o.items()}
return o
return {k: _helper(v) for k, v in data.items()}
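
A quick illustration of how the recursive helper above behaves; this REPL session is an example, not part of the diff:

>>> _convert_float_values_to_decimals({"a": 1.5, "b": [0.25, {"c": 2.0}], "d": "text"})
{'a': Decimal('1.5'), 'b': [Decimal('0.25'), {'c': Decimal('2.0')}], 'd': 'text'}

Floats are converted wherever they appear, including inside nested lists and dicts, while non-float values pass through unchanged.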

View File

@@ -1,4 +1,4 @@
from typing import List
from typing import Dict, List
from tools.testing.target_determination.heuristics import (
AggregatedHeuristics as AggregatedHeuristics,
@@ -27,3 +27,11 @@ def get_test_prioritizations(tests: List[str]) -> AggregatedHeuristics:
new_rankings.print_info()
return aggregated_results
def get_prediction_confidences(tests: List[str]) -> Dict[str, Dict[str, float]]:
# heuristic name -> test -> rating/confidence
rankings: Dict[str, Dict[str, float]] = {}
for heuristic in HEURISTICS:
rankings[heuristic.name] = heuristic.get_prediction_confidence(tests)
return rankings
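
The return value maps heuristic name -> test -> confidence. Assuming heuristic.name resolves to the class name, a run over two tests could produce something like the following (test names and numbers are illustrative):

# Hypothetical result of get_prediction_confidences(["test/test_foo.py", "test/test_bar.py"])
{
    "EditedByPR": {"test/test_foo.py": 1},
    "CorrelatedWithHistoricalFailures": {"test/test_foo.py": 1.0, "test/test_bar.py": 0.3},
    "PreviouslyFailedInPR": {},
}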

View File

@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
class CorrelatedWithHistoricalFailures(HeuristicInterface):
@@ -27,3 +31,10 @@ class CorrelatedWithHistoricalFailures(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TEST_FILE_RATINGS_FILE
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)
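
The same three steps (load the ratings file, keep only the requested tests, normalize so the highest rating becomes 1) are reused by HistorialEditedFiles and Profiling below. Using normalize_ratings as added later in this commit, made-up ratings of 0.5 and 2.0 would come out as:

>>> normalize_ratings({"test/test_a.py": 0.5, "test/test_b.py": 2.0}, 1)
{'test/test_a.py': 0.25, 'test/test_b.py': 1.0}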

View File

@@ -25,6 +25,10 @@ class EditedByPR(HeuristicInterface):
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
critical_tests = _get_modified_tests()
return {test: 1 for test in critical_tests if test in tests}
def _get_modified_tests() -> Set[str]:
try:

View File

@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, Dict, List
from tools.stats.import_test_stats import (
ADDITIONAL_CI_FILES_FOLDER,
@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
# This heuristic assumes that changed files in previous commits are good sources
@@ -32,3 +36,10 @@ class HistorialEditedFiles(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_HISTORICAL_EDITED_FILES
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)

View File

@@ -458,3 +458,13 @@ class HeuristicInterface:
def __str__(self) -> str:
return self.name
@abstractmethod
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
"""
Like get_test_priorities, but instead returns a float ranking ranging
from -1 to 1, where negative means skip, positive means run, 0 means no
idea, and magnitude = how confident the heuristic is. Used by
AggregatedHeuristicsRankings.
"""
pass
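
AggregatedHeuristicsRankings itself is not part of this diff, so the following is only an assumed sketch of how per-heuristic confidences in [-1, 1] might be combined (here by averaging per test); it is not the actual aggregation logic:

from collections import defaultdict
from typing import Dict

def average_confidences(per_heuristic: Dict[str, Dict[str, float]]) -> Dict[str, float]:
    # per_heuristic: heuristic name -> test -> confidence in [-1, 1]
    totals: Dict[str, float] = defaultdict(float)
    counts: Dict[str, int] = defaultdict(int)
    for ratings in per_heuristic.values():
        for test, confidence in ratings.items():
            totals[test] += confidence
            counts[test] += 1
    # Mean confidence per test: sign suggests run vs. skip, magnitude is certainty.
    return {test: totals[test] / counts[test] for test in totals}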

View File

@@ -26,6 +26,10 @@ class PreviouslyFailedInPR(HeuristicInterface):
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
critical_tests = _get_previously_failing_tests()
return {test: 1 for test in critical_tests if test in tests}
def _get_previously_failing_tests() -> Set[str]:
PYTEST_FAILED_TESTS_CACHE_FILE_PATH = Path(".pytest_cache/v/cache/lastfailed")

View File

@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, Dict, List
from tools.stats.import_test_stats import (
ADDITIONAL_CI_FILES_FOLDER,
@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
# Profilers were used to gather simple python code coverage information for each
@@ -30,3 +34,10 @@ class Profiling(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PROFILING_FILE
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)

View File

@@ -44,21 +44,44 @@ def query_changed_files() -> List[str]:
return lines
def get_correlated_tests(file: Union[str, Path]) -> List[str]:
def normalize_ratings(ratings: Dict[str, float], max_value: float) -> Dict[str, float]:
# Takes the ratings, makes the max value into max_value, and proportionally
# distributes the rest of the ratings.
# Ex: [1, 2, 3, 4] with max_value 8 is converted to [2, 4, 6, 8]
# Assumes all rankings are >= 0
# Don't modify in place
if len(ratings) == 0:
return ratings
min_rating = min(ratings.values())
assert min_rating > 0
max_rating = max(ratings.values())
assert max_rating > 0
normalized_ratings = {}
for tf, rank in ratings.items():
normalized_ratings[tf] = rank / max_rating * max_value
return normalized_ratings
def get_ratings_for_tests(file: Union[str, Path]) -> Dict[str, float]:
path = REPO_ROOT / file
if not os.path.exists(path):
print(f"could not find path {path}")
return []
return {}
with open(path) as f:
test_file_ratings = cast(Dict[str, Dict[str, float]], json.load(f))
try:
changed_files = query_changed_files()
except Exception as e:
warn(f"Can't query changed test files due to {e}")
return []
return {}
ratings: Dict[str, float] = defaultdict(float)
for file in changed_files:
for test_file, score in test_file_ratings.get(file, {}).items():
ratings[test_file] += score
return ratings
def get_correlated_tests(file: Union[str, Path]) -> List[str]:
ratings = get_ratings_for_tests(file)
prioritize = sorted(ratings, key=lambda x: -ratings[x])
return prioritize
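
To illustrate how get_ratings_for_tests combines the ratings file with the PR's changed files (all file names and scores below are made up), scores are summed per test across every changed file; the heuristics above then pass the result through normalize_ratings:

# Hypothetical ratings-file contents: changed source file -> test file -> score
test_file_ratings = {
    "a.py": {"test/test_x.py": 0.6},
    "b.py": {"test/test_x.py": 0.4, "test/test_y.py": 1.0},
}
# If query_changed_files() returned ["a.py", "b.py"], the summed ratings would be:
#   {"test/test_x.py": 1.0, "test/test_y.py": 1.0}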