add retries to external contribution data upload (#100889)

Adds retries to the external contribution data upload, as it has been shown to be flaky.

<!--
copilot:summary
-->
### <samp>🤖 Generated by Copilot at 43c2602</samp>

Added a function to read data from S3 objects and used it to implement a retry mechanism and verification for uploading external contribution stats. Modified `tools/stats/upload_external_contrib_stats.py` and `tools/stats/upload_stats_lib.py`.
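The `read_from_s3` helper is added in `tools/stats/upload_stats_lib.py`, whose hunk is not reproduced in the diff below. A minimal sketch of what such a helper could look like, assuming it mirrors `upload_to_s3`'s convention of gzip-compressed, newline-delimited JSON documents accessed through boto3 (the body below illustrates that assumption and is not the exact code from the PR):

```python
import gzip
import json
from typing import Any, Dict, List

import boto3  # assumed dependency, consistent with the existing S3 upload path


def read_from_s3(bucket_name: str, key: str) -> List[Dict[str, Any]]:
    # Fetch the object that upload_to_s3 wrote and undo its (assumed) encoding:
    # gzip-compressed, newline-delimited JSON documents.
    body = boto3.resource("s3").Object(bucket_name, key).get()["Body"].read()
    lines = gzip.decompress(body).decode("utf-8").splitlines()
    return [json.loads(line) for line in lines if line]
```

Reading the documents back immediately after uploading them lets the script confirm that every record actually landed in S3 and retry the whole upload when one is missing.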
<!--
copilot:poem
-->
### <samp>🤖 Generated by Copilot at 43c2602</samp>

> _We'll upload the stats to the cloud, me hearties_
> _We'll use `read_from_s3` to check them all_
> _We'll retry if the connection fails, me hearties_
> _We'll log the results and have a ball_

Pull Request resolved: https://github.com/pytorch/pytorch/pull/100889
Approved by: https://github.com/huydhn
Author: PaliC
Committed by: PyTorch MergeBot
Date: 2023-05-15 23:46:50 +00:00
Commit: c03555a303
Parent: 773f6b626d
2 changed files with 42 additions and 6 deletions

tools/stats/upload_external_contrib_stats.py

@@ -8,9 +8,10 @@ from urllib.error import HTTPError
 from urllib.request import Request, urlopen
 # import time
-from tools.stats.upload_stats_lib import upload_to_s3
+from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3
 FILTER_OUT_USERS = {"pytorchmergebot", "facebook-github-bot", "pytorch-bot[bot]"}
+MAXIMUM_RETRIES = 5
 def _fetch_url(
@@ -126,17 +127,35 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
     for i in range(args.length):
+        tries = 0
         startdate = args.startDate + datetime.timedelta(days=i)
         data = get_external_pr_data(
             startdate,
             startdate + datetime.timedelta(days=args.period_length),
             period_length=args.period_length,
         )
-        upload_to_s3(
-            bucket_name="torchci-contribution-data",
-            key=f"external_contribution_counts/{str(startdate)}",
-            docs=data,
-        )
+        while tries < MAXIMUM_RETRIES:
+            success = True
+            upload_to_s3(
+                bucket_name="torchci-contribution-data",
+                key=f"external_contribution_counts/{str(startdate)}",
+                docs=data,
+            )
+            uploaded_data = read_from_s3(
+                "torchci-contribution-data",
+                f"external_contribution_counts/{str(startdate)}",
+            )
+            for doc in data:
+                if doc not in uploaded_data:
+                    tries += 1
+                    print(
+                        f"Failed to upload data retrying upload up to {MAXIMUM_RETRIES - tries} more times"
+                    )
+                    success = False
+                    break
+            if success:
+                break
         # uncomment when running large queries locally to avoid github's rate limiting
         #
         # import time
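The trailing comments hint that large local backfills should sleep between GitHub queries to stay under the API rate limit. A minimal sketch of what uncommenting that could look like, with illustrative values only (the 10-second pause and the 30-day range are assumptions, not part of the commit):

```python
import datetime
import time

# Hypothetical local backfill: iterate over many days and pause between
# GitHub queries so the run stays under the API rate limit, as the
# commented-out hints at the bottom of the script suggest.
start = datetime.date(2023, 5, 1)
for day in range(30):
    query_date = start + datetime.timedelta(days=day)
    print(f"fetching external contribution data for {query_date}")
    time.sleep(10)  # assumed pause; tune to GitHub's documented limits
```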