mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
add retries to external contribution data upload (#100889)
Adds retries to the external contribution data upload, since it has shown itself to be flaky.

### <samp>🤖 Generated by Copilot at 43c2602</samp>

Added a function to read data back from S3 objects and used it to implement a retry mechanism with verification for uploading external contribution stats. Modified `tools/stats/upload_external_contrib_stats.py` and `tools/stats/upload_stats_lib.py`.

### <samp>🤖 Generated by Copilot at 43c2602</samp>

> _We'll upload the stats to the cloud, me hearties_
> _We'll use `read_from_s3` to check them all_
> _We'll retry if the connection fails, me hearties_
> _We'll log the results and have a ball_

Pull Request resolved: https://github.com/pytorch/pytorch/pull/100889
Approved by: https://github.com/huydhn
The diff touches `tools/stats/upload_external_contrib_stats.py` first: it imports the new `read_from_s3` helper, adds a `MAXIMUM_RETRIES` constant, and replaces the single `upload_to_s3` call with a loop that re-uploads until the data verifiably lands in S3.

```diff
--- a/tools/stats/upload_external_contrib_stats.py
+++ b/tools/stats/upload_external_contrib_stats.py
@@ -8,9 +8,10 @@ from urllib.error import HTTPError
 from urllib.request import Request, urlopen
 
 # import time
-from tools.stats.upload_stats_lib import upload_to_s3
+from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3
 
 FILTER_OUT_USERS = {"pytorchmergebot", "facebook-github-bot", "pytorch-bot[bot]"}
+MAXIMUM_RETRIES = 5
 
 
 def _fetch_url(
@@ -126,17 +127,35 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
     for i in range(args.length):
+        tries = 0
         startdate = args.startDate + datetime.timedelta(days=i)
         data = get_external_pr_data(
             startdate,
             startdate + datetime.timedelta(days=args.period_length),
             period_length=args.period_length,
         )
-        upload_to_s3(
-            bucket_name="torchci-contribution-data",
-            key=f"external_contribution_counts/{str(startdate)}",
-            docs=data,
-        )
+        while tries < MAXIMUM_RETRIES:
+            success = True
+            upload_to_s3(
+                bucket_name="torchci-contribution-data",
+                key=f"external_contribution_counts/{str(startdate)}",
+                docs=data,
+            )
+            uploaded_data = read_from_s3(
+                "torchci-contribution-data",
+                f"external_contribution_counts/{str(startdate)}",
+            )
+            for doc in data:
+                if doc not in uploaded_data:
+                    tries += 1
+                    print(
+                        f"Failed to upload data retrying upload up to {MAXIMUM_RETRIES - tries} more times"
+                    )
+                    success = False
+                    break
+            if success:
+                break
+
         # uncomment when running large queries locally to avoid github's rate limiting
         #
         # import time
```
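The loop is a write-then-verify pattern: each attempt re-uploads the batch, reads the object back, and stops only once every document round-trips or `MAXIMUM_RETRIES` attempts are spent. A minimal standalone sketch of the same idea follows; `upload_with_verification` is a hypothetical refactoring for illustration, not part of the PR:

```python
# Sketch of the PR's write-then-verify retry pattern, factored into a helper.
# `upload_with_verification` is hypothetical; it assumes upload_to_s3 and
# read_from_s3 behave as defined in tools/stats/upload_stats_lib.py.
from typing import Any, Dict, List

from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3

MAXIMUM_RETRIES = 5


def upload_with_verification(
    bucket: str, key: str, docs: List[Dict[str, Any]]
) -> bool:
    for attempt in range(MAXIMUM_RETRIES):
        upload_to_s3(bucket_name=bucket, key=key, docs=docs)
        uploaded = read_from_s3(bucket, key)
        # Success only if every document survives the round trip unchanged.
        if all(doc in uploaded for doc in docs):
            return True
        print(f"Upload failed, {MAXIMUM_RETRIES - attempt - 1} retries left")
    return False
```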
`tools/stats/upload_stats_lib.py` then gains the `read_from_s3` helper itself, the inverse of `upload_to_s3`: it fetches the object, gunzips the body, and parses it as newline-delimited JSON. Note the hunk adds no imports, so `gzip`, `json`, and `S3_RESOURCE` already exist at module scope.

```diff
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@@ -143,6 +143,23 @@ def upload_to_s3(
     print("Done!")
 
 
+def read_from_s3(
+    bucket_name: str,
+    key: str,
+) -> List[Dict[str, Any]]:
+    print(f"Reading from s3://{bucket_name}/{key}")
+    body = (
+        S3_RESOURCE.Object(
+            f"{bucket_name}",
+            f"{key}",
+        )
+        .get()["Body"]
+        .read()
+    )
+    results = gzip.decompress(body).decode().split("\n")
+    return [json.loads(result) for result in results if result]
+
+
 def upload_workflow_stats_to_s3(
     workflow_run_id: int,
     workflow_run_attempt: int,
```
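Because `read_from_s3` gunzips the body and parses one JSON document per line, it assumes objects are stored in the gzip-compressed, newline-delimited JSON format that `upload_to_s3` writes. A hypothetical round-trip check, where the key suffix and sample document are illustrative only:

```python
import datetime

from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3

# Illustrative data; the real script uploads per-day external contribution stats.
day = datetime.date(2023, 5, 1)
docs = [{"date": str(day), "pr_count": 7, "user_count": 3}]

upload_to_s3(
    bucket_name="torchci-contribution-data",
    key=f"external_contribution_counts/{day}",
    docs=docs,
)

# read_from_s3 decompresses the object and parses one JSON doc per line,
# so a successful upload round-trips every document unchanged.
uploaded = read_from_s3(
    "torchci-contribution-data",
    f"external_contribution_counts/{day}",
)
assert all(doc in uploaded for doc in docs)
```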