Skip signals from older runs of the same workflows (#129291)

I discovered this bug in trymerge while debugging https://github.com/pytorch/pytorch/pull/129013, where Dr.CI reported no relevant failures but mergebot complained about some unrelated ROCm failures (https://github.com/pytorch/pytorch/pull/129013#issuecomment-2183009217).

It turns out that mergebot was taking stale signals from older runs of the same workflow into account. For example:
* https://github.com/pytorch/pytorch/actions/runs/9604985361 was the first run, which had a ROCm failure,
* while https://github.com/pytorch/pytorch/actions/runs/9608926565 was the second attempt, which was all green.

Notice that both runs came from the same push of commit [be69191](be69191f2d) with [ciflow/rocm/129013](https://github.com/pytorch/pytorch/tree/ciflow/rocm/129013), so we only need to check the signals from the newer run.

Note that Dr.CI already handles this correctly with the logic in https://github.com/pytorch/test-infra/blob/main/torchci/pages/api/drci/drci.ts#L1079-L1088, so the fix in this PR is to bring the same logic to trymerge.
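Concretely, the idea is to key the collected check states by the workflow's ID and, whenever two check suites belong to the same workflow, keep only the one with the largest workflow run ID. Below is a minimal standalone sketch of that heuristic; the `WorkflowRun` dataclass, the helper name, and the sample IDs are illustrative, not the actual trymerge types:

```python
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class WorkflowRun:
    workflow_id: int          # ID of the workflow definition (shared by all of its runs)
    run_id: int               # ID of one concrete run of that workflow
    name: str
    conclusion: Optional[str]


def latest_run_per_workflow(runs: List[WorkflowRun]) -> Dict[int, WorkflowRun]:
    """Keep only the newest run of each workflow, treating the largest run ID
    as the newest (the same heuristic this PR uses)."""
    latest: Dict[int, WorkflowRun] = {}
    for run in runs:
        kept = latest.get(run.workflow_id)
        if kept is None or kept.run_id < run.run_id:
            latest[run.workflow_id] = run
    return latest


# Made-up data mirroring the two ROCm runs above: only the newer, green run survives.
runs = [
    WorkflowRun(workflow_id=12345, run_id=9604985361, name="rocm", conclusion="FAILURE"),
    WorkflowRun(workflow_id=12345, run_id=9608926565, name="rocm", conclusion="SUCCESS"),
]
assert latest_run_per_workflow(runs)[12345].run_id == 9608926565
```

The diff below applies the same idea inside `add_workflow_conclusions`: the `workflows` dictionary is keyed by the workflow's `databaseId` instead of its name, each `WorkflowCheckState` records its `run_id`, and an entry is only replaced when a larger run ID shows up.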

### Testing

`pytest -v test_trymerge.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129291
Approved by: https://github.com/ZainRizvi
Huy Do authored 2024-06-26 03:49:08 +00:00 · committed by PyTorch MergeBot
parent c718e2f43b · commit cda4d4887d
4 changed files with 41 additions and 8 deletions

2 binary files changed (contents not shown).

test_trymerge.py

@ -397,6 +397,7 @@ class TestTryMerge(TestCase):
# self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(pr.get_commit_count(), 60)
@skip("GitHub doesn't keep this data anymore")
def test_gql_retrieve_checksuites(self, *args: Any) -> None:
"Fetch comments and conclusions for PR with 60 commits"
pr = GitHubPR("pytorch", "pytorch", 94787)
@ -894,6 +895,24 @@ class TestBypassFailures(TestCase):
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
def test_ignore_failures_older_run_same_workflow(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 129013)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(
checks,
list(checks.keys()),
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["UNSTABLE"]) == 13)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
@ -1022,7 +1041,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
)
@skip(
reason="This test is run against a mutalbe PR that has changed, so it no longer works. The test should be changed"
reason="This test is run against a mutable PR that has changed, so it no longer works. The test should be changed"
)
@mock.patch("trymerge.read_merge_rules")
@mock.patch("trymerge.GitRepo")

trymerge.py

@ -81,9 +81,10 @@ JobNameToStateDict = Dict[str, JobCheckState]
class WorkflowCheckState:
def __init__(self, name: str, url: str, status: Optional[str]):
def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
self.name: str = name
self.url: str = url
self.run_id: int = run_id
self.status: Optional[str] = status
self.jobs: JobNameToStateDict = {}
@ -122,6 +123,7 @@ fragment PRCheckSuites on CheckSuiteConnection {
workflowRun {
workflow {
name
databaseId
}
databaseId
url
@ -512,7 +514,7 @@ def add_workflow_conclusions(
workflows: Dict[str, WorkflowCheckState] = {}
# for the jobs that don't have a workflow
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", None)
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", 0, None)
def add_conclusions(edges: Any) -> None:
for edge_idx, edge in enumerate(edges):
@ -523,18 +525,30 @@ def add_workflow_conclusions(
workflow_obj: WorkflowCheckState = no_workflow_obj
if workflow_run is not None:
# This is the usual workflow run ID we see on GitHub
workflow_run_id = workflow_run["databaseId"]
# While this is the metadata name and ID of the workflow itself
workflow_name = workflow_run["workflow"]["name"]
workflow_id = workflow_run["workflow"]["databaseId"]
workflow_conclusion = node["conclusion"]
# Do not override existing status with cancelled
if workflow_conclusion == "CANCELLED" and workflow_name in workflows:
continue
if workflow_name not in workflows:
workflows[workflow_name] = WorkflowCheckState(
# Only keep the latest workflow run for each workflow; heuristically, this is
# the run with the largest run ID
if (
workflow_id not in workflows
or workflows[workflow_id].run_id < workflow_run_id
):
workflows[workflow_id] = WorkflowCheckState(
name=workflow_name,
status=workflow_conclusion,
url=workflow_run["url"],
run_id=workflow_run_id,
)
workflow_obj = workflows[workflow_name]
workflow_obj = workflows[workflow_id]
while checkruns is not None:
for checkrun_node in checkruns["nodes"]:
@ -572,12 +586,12 @@ def add_workflow_conclusions(
# the jobs in but don't put the workflow in. We care more about the jobs in
# the workflow that ran than the container workflow.
res: JobNameToStateDict = {}
for workflow_name, workflow in workflows.items():
for workflow in workflows.values():
if len(workflow.jobs) > 0:
for job_name, job in workflow.jobs.items():
res[job_name] = job
else:
res[workflow_name] = JobCheckState(
res[workflow.name] = JobCheckState(
workflow.name,
workflow.url,
workflow.status,