[experiment][TD] Rating number system (#112676)

Emit an excessive amount of heuristic info, but that just means I can do more with it later. Pull Request resolved: https://github.com/pytorch/pytorch/pull/112676 Approved by: https://github.com/ZainRizvi
2025-10-20 21:14:14 +08:00 · 2023-11-07 19:40:07 +00:00
parent 82875e69fe
commit 0c448526a4
10 changed files with 114 additions and 11 deletions
--- a/test/run_test.py
+++ b/test/run_test.py
@ -47,6 +47,7 @@ from tools.stats.import_test_stats import (
 from tools.stats.upload_metrics import add_global_metric, emit_metric
 from tools.testing.target_determination.determinator import (
    AggregatedHeuristics,
+    get_prediction_confidences,
    get_test_prioritizations,
 )

@ -1806,7 +1807,17 @@ def main():
                test_stats["num_total_tests"] = num_tests

                print_to_stderr("Emiting td_test_failure_stats")
-                emit_metric("td_test_failure_stats", test_stats)
+                emit_metric(
+                    "td_test_failure_stats",
+                    {
+                        **test_stats,
+                        "confidence_ratings": get_prediction_confidences(
+                            selected_tests
+                        ),
+                        "failure": str(test),
+                        "tests": selected_tests,
+                    },
+                )

    if len(all_failures):
        for _, err in all_failures: