RS migration - trymerge to upload merge records to s3 (#129503)

Uploads merge records to to ossci-raw-job-status (public) bucket instead of directly to rockset The runner used by trymerge is a GH runner, so it doesn't have access to s3. Instead, I save the record as a json and upload the json to s3 in a different step that runs after the aws credentials are configured. The role is defined [here](https://togithub.com/pytorch-labs/pytorch-gha-infra/pull/421) Pull Request resolved: https://github.com/pytorch/pytorch/pull/129503 Approved by: https://github.com/huydhn, https://github.com/ZainRizvi, https://github.com/malfet
2025-10-20 21:14:14 +08:00 · 2024-06-26 19:06:52 +00:00
parent 895316119d
commit 90f82426b9
3 changed files with 48 additions and 52 deletions
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -1177,7 +1177,6 @@ class GitHubPR:
            # Finally, upload the record to Rockset. The list of pending and failed
            # checks are at the time of the merge
            save_merge_record(
-                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=comment_id,
                pr_num=self.pr_num,
                owner=self.org,
@ -1193,10 +1192,8 @@ class GitHubPR:
                merge_base_sha=self.get_merge_base(),
                merge_commit_sha=merge_commit_sha,
                is_failed=False,
-                dry_run=dry_run,
                skip_mandatory_checks=skip_mandatory_checks,
                ignore_current=bool(ignore_current_checks),
-                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
@ -1503,7 +1500,6 @@ def checks_to_markdown_bullets(

@retries_decorator()
 def save_merge_record(
-    collection: str,
    comment_id: int,
    pr_num: int,
    owner: str,
@ -1519,59 +1515,44 @@ def save_merge_record(
    merge_base_sha: str,
    merge_commit_sha: str = "",
    is_failed: bool = False,
-    dry_run: bool = False,
    skip_mandatory_checks: bool = False,
    ignore_current: bool = False,
    error: str = "",
-    workspace: str = "commons",
 ) -> None:
    """
-    This saves the merge records into Rockset, so we can query them (for fun and profit)
+    This saves the merge records as a json, which can later be uploaded to s3
    """
-    if dry_run:
-        # Decide not to save the record to Rockset if dry-run is set to not pollute
-        # the collection
-        return

-    try:
-        import rockset  # type: ignore[import]
+    # Prepare the record to be written into Rockset
+    data = [
+        {
+            "comment_id": comment_id,
+            "pr_num": pr_num,
+            "owner": owner,
+            "project": project,
+            "author": author,
+            "pending_checks": pending_checks,
+            "failed_checks": failed_checks,
+            "ignore_current_checks": ignore_current_checks,
+            "broken_trunk_checks": broken_trunk_checks,
+            "flaky_checks": flaky_checks,
+            "unstable_checks": unstable_checks,
+            "last_commit_sha": last_commit_sha,
+            "merge_base_sha": merge_base_sha,
+            "merge_commit_sha": merge_commit_sha,
+            "is_failed": is_failed,
+            "skip_mandatory_checks": skip_mandatory_checks,
+            "ignore_current": ignore_current,
+            "error": error,
+            # This is a unique identifier for the record for deduping purposes
+            # in rockset.  Any unique string would work
+            "_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
+        }
+    ]
+    repo_root = Path(__file__).resolve().parent.parent.parent

-        # Prepare the record to be written into Rockset
-        data = [
-            {
-                "comment_id": comment_id,
-                "pr_num": pr_num,
-                "owner": owner,
-                "project": project,
-                "author": author,
-                "pending_checks": pending_checks,
-                "failed_checks": failed_checks,
-                "ignore_current_checks": ignore_current_checks,
-                "broken_trunk_checks": broken_trunk_checks,
-                "flaky_checks": flaky_checks,
-                "unstable_checks": unstable_checks,
-                "last_commit_sha": last_commit_sha,
-                "merge_base_sha": merge_base_sha,
-                "merge_commit_sha": merge_commit_sha,
-                "is_failed": is_failed,
-                "skip_mandatory_checks": skip_mandatory_checks,
-                "ignore_current": ignore_current,
-                "error": error,
-            }
-        ]
-
-        client = rockset.RocksetClient(
-            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
-        )
-        client.Documents.add_documents(
-            collection=collection,
-            data=data,
-            workspace=workspace,
-        )
-
-    except ModuleNotFoundError:
-        print("Rockset is missing, no record will be saved")
-        return
+    with open(repo_root / "merge_record.json", "w") as f:
+        json.dump(data, f)


@retries_decorator(rc=[])
@ -2388,7 +2369,6 @@ def main() -> None:
            # list of pending and failed checks here, but they are not really
            # needed at the moment
            save_merge_record(
-                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=args.comment_id,
                pr_num=args.pr_num,
                owner=org,
@ -2403,11 +2383,9 @@ def main() -> None:
                last_commit_sha=pr.last_commit().get("oid", ""),
                merge_base_sha=pr.get_merge_base(),
                is_failed=True,
-                dry_run=args.dry_run,
                skip_mandatory_checks=args.force,
                ignore_current=args.ignore_current,
                error=str(e),
-                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@ -43,6 +43,7 @@ jobs:
          IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
          DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
        run: |
          set -x
          if [ -n "${REBASE}" ]; then
@ -84,6 +85,22 @@ jobs:
          set -x
          python3 .github/scripts/comment_on_pr.py "${PR_NUM}" "merge"

+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v3
+        continue-on-error: true
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
+          aws-region: us-east-1
+
+      - name: Upload merge record to s3
+        if: always()
+        continue-on-error: true
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: ossci-raw-job-status
+          s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }}
+          path: merge-record.json
+
 # We want newer merge commands to supercede old ones
 concurrency:
  group: try-merge-${{ github.event.client_payload.pr_num }}
--- a/.gitignore
+++ b/.gitignore
@ -129,6 +129,7 @@ env
 scripts/release_notes/*.json
 sccache-stats*.json
 lint.json
+merge_record.json

 # These files get copied over on invoking setup.py
 torchgen/packaged/*