mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
add retries to external contribution data upload (#100889)
Adds retries to the external contribution data upload, since it has shown itself to be flaky.

### <samp>🤖 Generated by Copilot at 43c2602</samp>

Added a function to read data back from S3 objects and used it to implement a retry mechanism with verification for uploading external contribution stats. Modified `tools/stats/upload_external_contrib_stats.py` and `tools/stats/upload_stats_lib.py`.

### <samp>🤖 Generated by Copilot at 43c2602</samp>

> _We'll upload the stats to the cloud, me hearties_
> _We'll use `read_from_s3` to check them all_
> _We'll retry if the connection fails, me hearties_
> _We'll log the results and have a ball_

Pull Request resolved: https://github.com/pytorch/pytorch/pull/100889
Approved by: https://github.com/huydhn
The diff touches `tools/stats/upload_external_contrib_stats.py` first: it imports the new `read_from_s3` helper, adds a `MAXIMUM_RETRIES` constant, and replaces the single `upload_to_s3` call with a loop that re-uploads until the data verifiably lands in S3.

```diff
--- a/tools/stats/upload_external_contrib_stats.py
+++ b/tools/stats/upload_external_contrib_stats.py
@@ -8,9 +8,10 @@ from urllib.error import HTTPError
 from urllib.request import Request, urlopen
 
 # import time
-from tools.stats.upload_stats_lib import upload_to_s3
+from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3
 
 FILTER_OUT_USERS = {"pytorchmergebot", "facebook-github-bot", "pytorch-bot[bot]"}
+MAXIMUM_RETRIES = 5
 
 
 def _fetch_url(
@@ -126,17 +127,35 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
     for i in range(args.length):
+        tries = 0
         startdate = args.startDate + datetime.timedelta(days=i)
         data = get_external_pr_data(
             startdate,
             startdate + datetime.timedelta(days=args.period_length),
             period_length=args.period_length,
         )
-        upload_to_s3(
-            bucket_name="torchci-contribution-data",
-            key=f"external_contribution_counts/{str(startdate)}",
-            docs=data,
-        )
+        while tries < MAXIMUM_RETRIES:
+            success = True
+            upload_to_s3(
+                bucket_name="torchci-contribution-data",
+                key=f"external_contribution_counts/{str(startdate)}",
+                docs=data,
+            )
+            uploaded_data = read_from_s3(
+                "torchci-contribution-data",
+                f"external_contribution_counts/{str(startdate)}",
+            )
+            for doc in data:
+                if doc not in uploaded_data:
+                    tries += 1
+                    print(
+                        f"Failed to upload data retrying upload up to {MAXIMUM_RETRIES - tries} more times"
+                    )
+                    success = False
+                    break
+            if success:
+                break
+
         # uncomment when running large queries locally to avoid github's rate limiting
         #
         # import time
```
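The loop is a write-then-verify pattern: each attempt re-uploads the batch, reads the object back, and stops only once every document round-trips or `MAXIMUM_RETRIES` attempts are spent. A minimal standalone sketch of the same idea follows; `upload_with_verification` is a hypothetical refactoring for illustration, not part of the PR:

```python
# Sketch of the PR's write-then-verify retry pattern, factored into a helper.
# `upload_with_verification` is hypothetical; it assumes upload_to_s3 and
# read_from_s3 behave as defined in tools/stats/upload_stats_lib.py.
from typing import Any, Dict, List

from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3

MAXIMUM_RETRIES = 5


def upload_with_verification(
    bucket: str, key: str, docs: List[Dict[str, Any]]
) -> bool:
    for attempt in range(MAXIMUM_RETRIES):
        upload_to_s3(bucket_name=bucket, key=key, docs=docs)
        uploaded = read_from_s3(bucket, key)
        # Success only if every document survives the round trip unchanged.
        if all(doc in uploaded for doc in docs):
            return True
        print(f"Upload failed, {MAXIMUM_RETRIES - attempt - 1} retries left")
    return False
```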
`tools/stats/upload_stats_lib.py` then gains the `read_from_s3` helper itself, the inverse of `upload_to_s3`: it fetches the object, gunzips the body, and parses it as newline-delimited JSON. Note the hunk adds no imports, so `gzip`, `json`, and `S3_RESOURCE` already exist at module scope.

```diff
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@@ -143,6 +143,23 @@ def upload_to_s3(
     print("Done!")
 
 
+def read_from_s3(
+    bucket_name: str,
+    key: str,
+) -> List[Dict[str, Any]]:
+    print(f"Reading from s3://{bucket_name}/{key}")
+    body = (
+        S3_RESOURCE.Object(
+            f"{bucket_name}",
+            f"{key}",
+        )
+        .get()["Body"]
+        .read()
+    )
+    results = gzip.decompress(body).decode().split("\n")
+    return [json.loads(result) for result in results if result]
+
+
 def upload_workflow_stats_to_s3(
     workflow_run_id: int,
     workflow_run_attempt: int,
```
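Because `read_from_s3` gunzips the body and parses one JSON document per line, it assumes objects are stored in the gzip-compressed, newline-delimited JSON format that `upload_to_s3` writes. A hypothetical round-trip check, where the key suffix and sample document are illustrative only:

```python
import datetime

from tools.stats.upload_stats_lib import read_from_s3, upload_to_s3

# Illustrative data; the real script uploads per-day external contribution stats.
day = datetime.date(2023, 5, 1)
docs = [{"date": str(day), "pr_count": 7, "user_count": 3}]

upload_to_s3(
    bucket_name="torchci-contribution-data",
    key=f"external_contribution_counts/{day}",
    docs=docs,
)

# read_from_s3 decompresses the object and parses one JSON doc per line,
# so a successful upload round-trips every document unchanged.
uploaded = read_from_s3(
    "torchci-contribution-data",
    f"external_contribution_counts/{day}",
)
assert all(doc in uploaded for doc in docs)
```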