Handle wrong workflow name from GitHub (#123301)

Fixes https://github.com/pytorch/pytorch/issues/122422. From my testing, the problem is that GitHub sometimes doesn't return the correct workflow name and uses the path to the workflow file instead.

Take https://github.com/pytorch/pytorch/pull/123104 as an example: the name returned by GH GraphQL was `.github/workflows/generated-linux-binary-conda-nightly.yml`, while the name we had on Rockset was `linux-binary-conda`. The latter was correct, but the mismatch caused mergebot to miss the flaky failures.

This is a weird issue because retrying the GraphQL query eventually returns the correct name.

First query:
![Screenshot 2024-04-03 at 15 28 37](https://github.com/pytorch/pytorch/assets/475357/81a8ada4-c241-4e6b-b45d-7a6de1c3a151)

After several retries:
![Screenshot 2024-04-03 at 15 31 53](https://github.com/pytorch/pytorch/assets/475357/402c2e8c-f963-45f6-8c10-e1d2f49c5479)

After that, I could never get a result like the first query again.

The fix here is to keep track of the job ID so that we can match on it in addition to the `workflow / job` name.
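
To make the idea concrete, here is a minimal standalone sketch of the name-or-job-ID matching (not the exact trymerge code; the record shape mirrors the `name`/`id` fields in Dr.CI's `FLAKY`/`BROKEN_TRUNK` lists, and the ID value is made up for illustration):

```python
from typing import Any, Dict, List, NamedTuple, Optional


class Check(NamedTuple):
    # Simplified stand-in for trymerge's JobCheckState: only the fields used here
    name: str
    job_id: Optional[int]


def matches_drci_record(check: Check, records: List[Dict[str, Any]]) -> bool:
    """Match by job name, falling back to the numeric job ID when the
    workflow part of the name came back as a file path."""
    return any(
        check.name == record["name"]
        or (check.job_id is not None and check.job_id == record["id"])
        for record in records
    )


# Dr.CI knows the job by its proper workflow name; the ID here is made up.
flaky = [{"name": "linux-binary-conda / conda-py3_8-cuda11_8-build / build", "id": 12345}]

# GitHub occasionally reports the workflow by its file path instead of its name.
check = Check(
    name=".github/workflows/generated-linux-binary-conda-nightly.yml / conda-py3_8-cuda11_8-build / build",
    job_id=12345,
)

assert matches_drci_record(check, flaky)  # the name differs, but the job ID still lines up
```

The real change applies the same fallback inside `is_flaky` and `is_broken_trunk` in the diff below.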

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123301
Approved by: https://github.com/clee2000
Author: Huy Do
Date: 2024-04-04 07:00:40 +00:00
Committed by: PyTorch MergeBot
Parent: dbeb214043
Commit: f00ece024b
4 changed files with 54 additions and 9 deletions

(Two binary files changed; contents not shown.)


@@ -397,7 +397,7 @@ class TestTryMerge(TestCase):
     def test_gql_retrieve_checksuites(self, *args: Any) -> None:
         "Fetch comments and conclusions for PR with 60 commits"
         pr = GitHubPR("pytorch", "pytorch", 94787)
-        self.assertEqual(len(pr.get_checkrun_conclusions()), 183)
+        self.assertEqual(len(pr.get_checkrun_conclusions()), 182)
 
     def test_team_members(self, *args: Any) -> None:
         "Test fetching team members works"
@@ -832,6 +832,41 @@ class TestBypassFailures(TestCase):
         self.assertTrue(len(ignorable["FLAKY"]) == 4)
         self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
 
+    def test_get_classifications_wrong_workflow_name(self, *args: Any) -> None:
+        pr = GitHubPR("pytorch", "pytorch", 123104)
+        checks = pr.get_checkrun_conclusions()
+
+        check_name = "linux-binary-conda / conda-py3_8-cuda11_8-build / build"
+        check_name_workflow_path = ".github/workflows/generated-linux-binary-conda-nightly.yml / conda-py3_8-cuda11_8-build / build"
+
+        # Mock a check where the workflow name uses the full path
+        checks[check_name_workflow_path] = JobCheckState(
+            check_name_workflow_path,
+            checks[check_name].url,
+            checks[check_name].status,
+            checks[check_name].classification,
+            checks[check_name].job_id,
+            checks[check_name].title,
+            checks[check_name].summary,
+        )
+        del checks[check_name]
+
+        checks = get_classifications(
+            pr.pr_num,
+            pr.project,
+            checks,
+            [],
+        )
+        pending, failed, ignorable = categorize_checks(
+            checks,
+            list(checks.keys()),
+        )
+        self.assertTrue(len(pending) == 0)
+        self.assertTrue(len(failed) == 0)
+        self.assertTrue(len(ignorable["FLAKY"]) == 1)
+        self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
+
     @mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
     def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
         """


@@ -123,6 +123,7 @@ fragment PRCheckSuites on CheckSuiteConnection {
       workflow {
         name
       }
+      databaseId
       url
     }
     checkRuns(first: 50) {
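
As a side note on the fragment change above: requesting the run-level `databaseId` is what provides a numeric ID to fall back on when the workflow `name` comes back as a file path. Below is a rough sketch of how a node shaped like this fragment could be folded into a check entry; the helper, the tuple shape, and the sample values are illustrative assumptions rather than the exact trymerge parsing code.

```python
from typing import Any, Dict, NamedTuple, Optional


class WorkflowCheck(NamedTuple):
    # Illustrative stand-in for a workflow-level check entry (assumed shape)
    name: str
    url: str
    job_id: Optional[int]


def check_from_workflow_run(workflow_run: Dict[str, Any]) -> WorkflowCheck:
    # The node shape follows the fragment above: workflow { name }, databaseId, url
    return WorkflowCheck(
        name=workflow_run["workflow"]["name"],  # may come back as a .yml path
        url=workflow_run["url"],
        job_id=workflow_run.get("databaseId"),  # the newly requested numeric ID
    )


# Hypothetical node for illustration only
node = {
    "workflow": {"name": ".github/workflows/generated-linux-binary-conda-nightly.yml"},
    "databaseId": 123456789,
    "url": "<run url placeholder>",
}
print(check_from_workflow_run(node))
```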
@@ -1618,28 +1619,37 @@ def remove_job_name_suffix(name: str, replacement: str = ")") -> str:
 
 def is_broken_trunk(
-    name: str,
+    check: JobCheckState,
     drci_classifications: Any,
 ) -> bool:
-    if not name or not drci_classifications:
+    if not check or not drci_classifications:
         return False
 
+    name = check.name
+    job_id = check.job_id
+
     # Consult the list of broken trunk failures from Dr.CI
     return any(
-        name == broken_trunk["name"]
+        (name == broken_trunk["name"]) or (job_id and job_id == broken_trunk["id"])
         for broken_trunk in drci_classifications.get("BROKEN_TRUNK", [])
     )
 
 
 def is_flaky(
-    name: str,
+    check: JobCheckState,
     drci_classifications: Any,
 ) -> bool:
-    if not name or not drci_classifications:
+    if not check or not drci_classifications:
         return False
 
+    name = check.name
+    job_id = check.job_id
+
     # Consult the list of flaky failures from Dr.CI
-    return any(name == flaky["name"] for flaky in drci_classifications.get("FLAKY", []))
+    return any(
+        (name == flaky["name"] or (job_id and job_id == flaky["id"]))
+        for flaky in drci_classifications.get("FLAKY", [])
+    )
 
 
 def is_invalid_cancel(
@@ -1726,7 +1736,7 @@ def get_classifications(
 
         # NB: It's important to note that when it comes to ghstack and broken trunk classification,
        # Dr.CI uses the base of the whole stack
-        if is_broken_trunk(name, drci_classifications):
+        if is_broken_trunk(check, drci_classifications):
             checks_with_classifications[name] = JobCheckState(
                 check.name,
                 check.url,
@@ -1738,7 +1748,7 @@ def get_classifications(
             )
             continue
 
-        elif is_flaky(name, drci_classifications):
+        elif is_flaky(check, drci_classifications):
             checks_with_classifications[name] = JobCheckState(
                 check.name,
                 check.url,