Skip signals from older runs of the same workflows (#129291)

I discovered this bug in trymerge while debugging https://github.com/pytorch/pytorch/pull/129013, where Dr.CI reported no relevant failures but mergebot complained about some unrelated ROCm failures (https://github.com/pytorch/pytorch/pull/129013#issuecomment-2183009217).

It turns out that mergebot was taking stale signals from older runs of the same workflow into account. For example:
* https://github.com/pytorch/pytorch/actions/runs/9604985361 was the first run, which had a ROCm failure,
* while https://github.com/pytorch/pytorch/actions/runs/9608926565 was the second attempt, which was all green.

Notice that both runs came from the same push of commit [be69191](be69191f2d) with [ciflow/rocm/129013](https://github.com/pytorch/pytorch/tree/ciflow/rocm/129013), so we only need to check the signals from the newer run.

Note that Dr.CI already handles this correctly with the logic in https://github.com/pytorch/test-infra/blob/main/torchci/pages/api/drci/drci.ts#L1079-L1088, so the fix in this PR is to bring the same logic to trymerge.
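Concretely, the idea is to key the collected check states by the workflow's ID and, whenever two check suites belong to the same workflow, keep only the one with the largest workflow run ID. Below is a minimal standalone sketch of that heuristic; the `WorkflowRun` dataclass, the helper name, and the sample IDs are illustrative, not the actual trymerge types:

```python
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class WorkflowRun:
    workflow_id: int          # ID of the workflow definition (shared by all of its runs)
    run_id: int               # ID of one concrete run of that workflow
    name: str
    conclusion: Optional[str]


def latest_run_per_workflow(runs: List[WorkflowRun]) -> Dict[int, WorkflowRun]:
    """Keep only the newest run of each workflow, treating the largest run ID
    as the newest (the same heuristic this PR uses)."""
    latest: Dict[int, WorkflowRun] = {}
    for run in runs:
        kept = latest.get(run.workflow_id)
        if kept is None or kept.run_id < run.run_id:
            latest[run.workflow_id] = run
    return latest


# Made-up data mirroring the two ROCm runs above: only the newer, green run survives.
runs = [
    WorkflowRun(workflow_id=12345, run_id=9604985361, name="rocm", conclusion="FAILURE"),
    WorkflowRun(workflow_id=12345, run_id=9608926565, name="rocm", conclusion="SUCCESS"),
]
assert latest_run_per_workflow(runs)[12345].run_id == 9608926565
```

The diff below applies the same idea inside `add_workflow_conclusions`: the `workflows` dictionary is keyed by the workflow's `databaseId` instead of its name, each `WorkflowCheckState` records its `run_id`, and an entry is only replaced when a larger run ID shows up.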

### Testing

`pytest -v test_trymerge.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129291
Approved by: https://github.com/ZainRizvi
Huy Do authored 2024-06-26 03:49:08 +00:00 · committed by PyTorch MergeBot
parent c718e2f43b · commit cda4d4887d
4 changed files with 41 additions and 8 deletions

2 binary files changed (contents not shown).

test_trymerge.py

@ -397,6 +397,7 @@ class TestTryMerge(TestCase):
# self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(pr.get_commit_count(), 60)
@skip("GitHub doesn't keep this data anymore")
def test_gql_retrieve_checksuites(self, *args: Any) -> None:
"Fetch comments and conclusions for PR with 60 commits"
pr = GitHubPR("pytorch", "pytorch", 94787)
@ -894,6 +895,24 @@ class TestBypassFailures(TestCase):
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
def test_ignore_failures_older_run_same_workflow(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 129013)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(
checks,
list(checks.keys()),
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["UNSTABLE"]) == 13)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
@ -1022,7 +1041,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
)
@skip(
reason="This test is run against a mutalbe PR that has changed, so it no longer works. The test should be changed"
reason="This test is run against a mutable PR that has changed, so it no longer works. The test should be changed"
)
@mock.patch("trymerge.read_merge_rules")
@mock.patch("trymerge.GitRepo")

trymerge.py

@ -81,9 +81,10 @@ JobNameToStateDict = Dict[str, JobCheckState]
class WorkflowCheckState:
def __init__(self, name: str, url: str, status: Optional[str]):
def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
self.name: str = name
self.url: str = url
self.run_id: int = run_id
self.status: Optional[str] = status
self.jobs: JobNameToStateDict = {}
@ -122,6 +123,7 @@ fragment PRCheckSuites on CheckSuiteConnection {
workflowRun {
workflow {
name
databaseId
}
databaseId
url
@ -512,7 +514,7 @@ def add_workflow_conclusions(
workflows: Dict[str, WorkflowCheckState] = {}
# for the jobs that don't have a workflow
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", None)
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", 0, None)
def add_conclusions(edges: Any) -> None:
for edge_idx, edge in enumerate(edges):
@ -523,18 +525,30 @@ def add_workflow_conclusions(
workflow_obj: WorkflowCheckState = no_workflow_obj
if workflow_run is not None:
# This is the usual workflow run ID we see on GitHub
workflow_run_id = workflow_run["databaseId"]
# While this is the metadata name and ID of the workflow itself
workflow_name = workflow_run["workflow"]["name"]
workflow_id = workflow_run["workflow"]["databaseId"]
workflow_conclusion = node["conclusion"]
# Do not override existing status with cancelled
if workflow_conclusion == "CANCELLED" and workflow_name in workflows:
continue
if workflow_name not in workflows:
workflows[workflow_name] = WorkflowCheckState(
# Only keep the latest workflow run for each workflow; heuristically, this is
# the run with the largest run ID
if (
workflow_id not in workflows
or workflows[workflow_id].run_id < workflow_run_id
):
workflows[workflow_id] = WorkflowCheckState(
name=workflow_name,
status=workflow_conclusion,
url=workflow_run["url"],
run_id=workflow_run_id,
)
workflow_obj = workflows[workflow_name]
workflow_obj = workflows[workflow_id]
while checkruns is not None:
for checkrun_node in checkruns["nodes"]:
@ -572,12 +586,12 @@ def add_workflow_conclusions(
# the jobs in but don't put the workflow in. We care more about the jobs in
# the workflow that ran than the container workflow.
res: JobNameToStateDict = {}
for workflow_name, workflow in workflows.items():
for workflow in workflows.values():
if len(workflow.jobs) > 0:
for job_name, job in workflow.jobs.items():
res[job_name] = job
else:
res[workflow_name] = JobCheckState(
res[workflow.name] = JobCheckState(
workflow.name,
workflow.url,
workflow.status,