[experiment][TD] Rating number system (#112676)

Emits an excessive amount of heuristic info, but that just means I can do more with it later?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112676
Approved by: https://github.com/ZainRizvi
Author: Catherine Lee
Date: 2023-11-07 19:40:07 +00:00
Committed by: PyTorch MergeBot
Parent: 82875e69fe
Commit: 0c448526a4
10 changed files with 114 additions and 11 deletions

View File

@@ -47,6 +47,7 @@ from tools.stats.import_test_stats import (
from tools.stats.upload_metrics import add_global_metric, emit_metric
from tools.testing.target_determination.determinator import (
AggregatedHeuristics,
get_prediction_confidences,
get_test_prioritizations,
)
@@ -1806,7 +1807,17 @@ def main():
test_stats["num_total_tests"] = num_tests
print_to_stderr("Emiting td_test_failure_stats")
emit_metric("td_test_failure_stats", test_stats)
emit_metric(
"td_test_failure_stats",
{
**test_stats,
"confidence_ratings": get_prediction_confidences(
selected_tests
),
"failure": str(test),
"tests": selected_tests,
},
)
if len(all_failures):
for _, err in all_failures:
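
For reference, a rough sketch of the shape the expanded td_test_failure_stats payload takes after this hunk. The heuristic and test names below are made up, and the full contents of test_stats come from code outside this diff:

# Hypothetical payload passed to emit_metric("td_test_failure_stats", ...)
{
    "num_total_tests": 1234,  # from test_stats (other test_stats keys omitted)
    "confidence_ratings": {
        "EditedByPR": {"test/test_foo.py": 1},
        "Profiling": {"test/test_foo.py": 0.4, "test/test_bar.py": 1.0},
    },
    "failure": "test/test_foo.py",  # str(test), presumably the failing test
    "tests": ["test/test_foo.py", "test/test_bar.py"],  # selected_tests
}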

View File

@@ -167,4 +167,14 @@ def emit_metric(
def _convert_float_values_to_decimals(data: Dict[str, Any]) -> Dict[str, Any]:
return {k: Decimal(str(v)) if isinstance(v, float) else v for k, v in data.items()}
# Attempt to recurse
def _helper(o: Any) -> Any:
if isinstance(o, float):
return Decimal(str(o))
if isinstance(o, list):
return [_helper(v) for v in o]
if isinstance(o, dict):
return {_helper(k): _helper(v) for k, v in o.items()}
return o
return {k: _helper(v) for k, v in data.items()}
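
A quick illustration of how the recursive helper above behaves; this REPL session is an example, not part of the diff:

>>> _convert_float_values_to_decimals({"a": 1.5, "b": [0.25, {"c": 2.0}], "d": "text"})
{'a': Decimal('1.5'), 'b': [Decimal('0.25'), {'c': Decimal('2.0')}], 'd': 'text'}

Floats are converted wherever they appear, including inside nested lists and dicts, while non-float values pass through unchanged.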

View File

@@ -1,4 +1,4 @@
from typing import List
from typing import Dict, List
from tools.testing.target_determination.heuristics import (
AggregatedHeuristics as AggregatedHeuristics,
@@ -27,3 +27,11 @@ def get_test_prioritizations(tests: List[str]) -> AggregatedHeuristics:
new_rankings.print_info()
return aggregated_results
def get_prediction_confidences(tests: List[str]) -> Dict[str, Dict[str, float]]:
# heuristic name -> test -> rating/confidence
rankings: Dict[str, Dict[str, float]] = {}
for heuristic in HEURISTICS:
rankings[heuristic.name] = heuristic.get_prediction_confidence(tests)
return rankings
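
The return value maps heuristic name -> test -> confidence. Assuming heuristic.name resolves to the class name, a run over two tests could produce something like the following (test names and numbers are illustrative):

# Hypothetical result of get_prediction_confidences(["test/test_foo.py", "test/test_bar.py"])
{
    "EditedByPR": {"test/test_foo.py": 1},
    "CorrelatedWithHistoricalFailures": {"test/test_foo.py": 1.0, "test/test_bar.py": 0.3},
    "PreviouslyFailedInPR": {},
}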

View File

@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
class CorrelatedWithHistoricalFailures(HeuristicInterface):
@@ -27,3 +31,10 @@ class CorrelatedWithHistoricalFailures(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TEST_FILE_RATINGS_FILE
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)
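
The same three steps (load the ratings file, keep only the requested tests, normalize so the highest rating becomes 1) are reused by HistorialEditedFiles and Profiling below. Using normalize_ratings as added later in this commit, made-up ratings of 0.5 and 2.0 would come out as:

>>> normalize_ratings({"test/test_a.py": 0.5, "test/test_b.py": 2.0}, 1)
{'test/test_a.py': 0.25, 'test/test_b.py': 1.0}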

View File

@@ -25,6 +25,10 @@ class EditedByPR(HeuristicInterface):
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
critical_tests = _get_modified_tests()
return {test: 1 for test in critical_tests if test in tests}
def _get_modified_tests() -> Set[str]:
try:

View File

@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, Dict, List
from tools.stats.import_test_stats import (
ADDITIONAL_CI_FILES_FOLDER,
@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
# This heuristic assumes that changed files in previous commits are good sources
@@ -32,3 +36,10 @@ class HistorialEditedFiles(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_HISTORICAL_EDITED_FILES
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)

View File

@@ -458,3 +458,13 @@ class HeuristicInterface:
def __str__(self) -> str:
return self.name
@abstractmethod
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
"""
Like get_test_priorities, but instead returns a float ranking ranging
from -1 to 1, where negative means skip, positive means run, 0 means no
idea, and magnitude = how confident the heuristic is. Used by
AggregatedHeuristicsRankings.
"""
pass
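
AggregatedHeuristicsRankings itself is not part of this diff, so the following is only an assumed sketch of how per-heuristic confidences in [-1, 1] might be combined (here by averaging per test); it is not the actual aggregation logic:

from collections import defaultdict
from typing import Dict

def average_confidences(per_heuristic: Dict[str, Dict[str, float]]) -> Dict[str, float]:
    # per_heuristic: heuristic name -> test -> confidence in [-1, 1]
    totals: Dict[str, float] = defaultdict(float)
    counts: Dict[str, int] = defaultdict(int)
    for ratings in per_heuristic.values():
        for test, confidence in ratings.items():
            totals[test] += confidence
            counts[test] += 1
    # Mean confidence per test: sign suggests run vs. skip, magnitude is certainty.
    return {test: totals[test] / counts[test] for test in totals}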

View File

@@ -26,6 +26,10 @@ class PreviouslyFailedInPR(HeuristicInterface):
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
critical_tests = _get_previously_failing_tests()
return {test: 1 for test in critical_tests if test in tests}
def _get_previously_failing_tests() -> Set[str]:
PYTEST_FAILED_TESTS_CACHE_FILE_PATH = Path(".pytest_cache/v/cache/lastfailed")

View File

@@ -1,4 +1,4 @@
from typing import Any, List
from typing import Any, Dict, List
from tools.stats.import_test_stats import (
ADDITIONAL_CI_FILES_FOLDER,
@@ -10,7 +10,11 @@ from tools.testing.target_determination.heuristics.interface import (
TestPrioritizations,
)
from tools.testing.target_determination.heuristics.utils import get_correlated_tests
from tools.testing.target_determination.heuristics.utils import (
get_correlated_tests,
get_ratings_for_tests,
normalize_ratings,
)
# Profilers were used to gather simple python code coverage information for each
@@ -30,3 +34,10 @@ class Profiling(HeuristicInterface):
)
return test_rankings
def get_prediction_confidence(self, tests: List[str]) -> Dict[str, float]:
test_ratings = get_ratings_for_tests(
ADDITIONAL_CI_FILES_FOLDER / TD_HEURISTIC_PROFILING_FILE
)
test_ratings = {k: v for (k, v) in test_ratings.items() if k in tests}
return normalize_ratings(test_ratings, 1)

View File

@@ -44,21 +44,44 @@ def query_changed_files() -> List[str]:
return lines
def get_correlated_tests(file: Union[str, Path]) -> List[str]:
def normalize_ratings(ratings: Dict[str, float], max_value: float) -> Dict[str, float]:
# Takes the ratings, makes the max value into max_value, and proportionally
# distributes the rest of the ratings.
# Ex: [1, 2, 3, 4] with max_value 8 is converted to [2, 4, 6, 8]
# Assumes all rankings are >= 0
# Don't modify in place
if len(ratings) == 0:
return ratings
min_rating = min(ratings.values())
assert min_rating > 0
max_rating = max(ratings.values())
assert max_rating > 0
normalized_ratings = {}
for tf, rank in ratings.items():
normalized_ratings[tf] = rank / max_rating * max_value
return normalized_ratings
def get_ratings_for_tests(file: Union[str, Path]) -> Dict[str, float]:
path = REPO_ROOT / file
if not os.path.exists(path):
print(f"could not find path {path}")
return []
return {}
with open(path) as f:
test_file_ratings = cast(Dict[str, Dict[str, float]], json.load(f))
try:
changed_files = query_changed_files()
except Exception as e:
warn(f"Can't query changed test files due to {e}")
return []
return {}
ratings: Dict[str, float] = defaultdict(float)
for file in changed_files:
for test_file, score in test_file_ratings.get(file, {}).items():
ratings[test_file] += score
return ratings
def get_correlated_tests(file: Union[str, Path]) -> List[str]:
ratings = get_ratings_for_tests(file)
prioritize = sorted(ratings, key=lambda x: -ratings[x])
return prioritize
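
To illustrate how get_ratings_for_tests combines the ratings file with the PR's changed files (all file names and scores below are made up), scores are summed per test across every changed file; the heuristics above then pass the result through normalize_ratings:

# Hypothetical ratings-file contents: changed source file -> test file -> score
test_file_ratings = {
    "a.py": {"test/test_x.py": 0.6},
    "b.py": {"test/test_x.py": 0.4, "test/test_y.py": 1.0},
}
# If query_changed_files() returned ["a.py", "b.py"], the summed ratings would be:
#   {"test/test_x.py": 1.0, "test/test_y.py": 1.0}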