Upload perf benchmark to Rockset in batch of at most 5000 records (#107095)
TIL, uploading to Rockset has an upper limit of 5000 records per request, so uploading the PT2 perf benchmark results could fail when that limit was exceeded, for example https://github.com/pytorch/pytorch/actions/runs/5828810421/job/15849232756

```
HTTP response body: {"message":"The number of documents specified in this request exceeds the maximum allowed limit of 5,000 documents.","message_key":"RECEIVER_REQUEST_MAX_DOCUMENT_LIMIT","type":"INVALIDINPUT","line":null,"column":null,"trace_id":"73fc2eb5-cfd1-4baa-8141-47c7cde87812","error_id":null,"query_id":null,"internal_errors":null}
```

The fix is to upload the results in multiple smaller batches of at most 5000 records each.

### Testing

The 5743 records from https://github.com/pytorch/pytorch/actions/runs/5828810421/job/15849232756 were written in 2 batches (5000 + 743):

```
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id 5821183777 --workflow-run-attempt 1 --repo pytorch/pytorch --head-branch gh/ezyang/2294/head
...
Writing 5000 documents to Rockset
Done!
Writing 743 documents to Rockset
Done!
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/107095

Approved by: https://github.com/atalman, https://github.com/seemethere, https://github.com/ZainRizvi
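At its core the fix is plain list slicing. Below is a minimal, self-contained sketch of the batching idea under the same 5000-document limit; the `batched` helper and the `upload` callback are illustrative names for this sketch, not code from the PR itself:

```python
from typing import Any, Callable, Iterator, List

BATCH_SIZE = 5000  # Rockset's per-request document limit


def batched(docs: List[Any], batch_size: int = BATCH_SIZE) -> Iterator[List[Any]]:
    """Yield consecutive slices of at most batch_size documents."""
    for start in range(0, len(docs), batch_size):
        yield docs[start : start + batch_size]


def upload_in_batches(docs: List[Any], upload: Callable[[List[Any]], None]) -> None:
    """Send docs to an uploader callback, never more than BATCH_SIZE at a time."""
    for batch in batched(docs):
        print(f"Writing {len(batch)} documents to Rockset")
        upload(batch)
    print("Done!")
```

With 5743 documents this makes two calls, one with 5000 documents and one with 743, matching the test output above.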
```diff
@@ -23,6 +23,8 @@ S3_RESOURCE = boto3.resource("s3")
 # NB: In CI, a flaky test is usually retried 3 times, then the test file would be rerun
 # 2 more times
 MAX_RETRY_IN_NON_DISABLED_MODE = 3 * 3
+# NB: Rockset has an upper limit of 5000 documents in one request
+BATCH_SIZE = 5000
 
 
 def _get_request_headers() -> Dict[str, str]:
@@ -116,17 +118,29 @@ def download_gha_artifacts(
 
 
 def upload_to_rockset(
-    collection: str, docs: List[Any], workspace: str = "commons"
+    collection: str,
+    docs: List[Any],
+    workspace: str = "commons",
+    client: Any = None,
 ) -> None:
-    print(f"Writing {len(docs)} documents to Rockset")
-    client = rockset.RocksetClient(
-        host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
-    )
-    client.Documents.add_documents(
-        collection=collection,
-        data=docs,
-        workspace=workspace,
-    )
+    if not client:
+        client = rockset.RocksetClient(
+            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+        )
+
+    index = 0
+    while index < len(docs):
+        from_index = index
+        to_index = min(from_index + BATCH_SIZE, len(docs))
+        print(f"Writing {to_index - from_index} documents to Rockset")
+
+        client.Documents.add_documents(
+            collection=collection,
+            data=docs[from_index:to_index],
+            workspace=workspace,
+        )
+        index += BATCH_SIZE
+
     print("Done!")
```
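Because the patched `upload_to_rockset` now accepts an optional `client`, the batching can be exercised without hitting the Rockset API. A hedged sketch, assuming `upload_to_rockset` and `BATCH_SIZE` from the module changed above are in scope; the `FakeDocuments`/`FakeClient` stubs and the collection name are made up for illustration and are not part of the Rockset SDK or this PR:

```python
from typing import Any, List


class FakeDocuments:
    """Stand-in for client.Documents that records batch sizes instead of uploading."""

    def __init__(self) -> None:
        self.batch_sizes: List[int] = []

    def add_documents(self, collection: str, data: List[Any], workspace: str) -> None:
        self.batch_sizes.append(len(data))


class FakeClient:
    """Provides only the attribute upload_to_rockset touches: client.Documents."""

    def __init__(self) -> None:
        self.Documents = FakeDocuments()


fake = FakeClient()
docs = [{"index": i} for i in range(5743)]  # same document count as the test run above
upload_to_rockset("some_collection", docs, client=fake)  # collection name is a placeholder
assert fake.Documents.batch_sizes == [5000, 743]
```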