Files
pytorch/tools/test/test_upload_stats_lib.py
Huy Do 00751772e6 Upload perf benchmark to Rockset in batch of at most 5000 records (#107095)
TIL, uploading to Rockset has an upper limit of 5000 records per request.  So uploading PT2 perf benchmark could fail if that limit was reached, for example https://github.com/pytorch/pytorch/actions/runs/5828810421/job/15849232756

```
HTTP response body: {"message":"The number of documents specified in this request exceeds the maximum allowed limit of 5,000 documents.","message_key":"RECEIVER_REQUEST_MAX_DOCUMENT_LIMIT","type":"INVALIDINPUT","line":null,"column":null,"trace_id":"73fc2eb5-cfd1-4baa-8141-47c7cde87812","error_id":null,"query_id":null,"internal_errors":null}
```

The fix is to upload the results in multiple smaller batches of at most 5000 records.

### Testing

5743 records from https://github.com/pytorch/pytorch/actions/runs/5828810421/job/15849232756 were written in 2 batches (5000 + 743)

```
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id 5821183777 --workflow-run-attempt 1 --repo pytorch/pytorch --head-branch gh/ezyang/2294/head
...
Writing 5000 documents to Rockset
Done!
Writing 743 documents to Rockset
Done!
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107095
Approved by: https://github.com/atalman, https://github.com/seemethere, https://github.com/ZainRizvi
2023-08-14 19:56:42 +00:00

147 lines
4.4 KiB
Python

import decimal
import inspect
import unittest
from typing import Any, Dict
from unittest import mock
from tools.stats.upload_stats_lib import BATCH_SIZE, emit_metric, upload_to_rockset
# default values
# Placeholder CI metadata used by setUp() to populate the environment that
# emit_metric reads; tests assert these exact values round-trip into the
# emitted metric.
REPO = "some/repo"
BUILD_ENV = "cuda-10.2"
TEST_CONFIG = "test-config"
WORKFLOW = "some-workflow"
JOB = "some-job"
RUN_ID = 56
RUN_NUMBER = 123
RUN_ATTEMPT = 3
class TestUploadStats(unittest.TestCase):
    """Tests for emit_metric and upload_to_rockset in tools.stats.upload_stats_lib."""

    # Before each test, set the env vars to their default values
    def setUp(self) -> None:
        # Register the patcher's stop with addCleanup so the mocked
        # environment is undone after each test.  The original called
        # .start() without ever stopping, leaking the patch into any
        # test that runs after this class.
        patcher = mock.patch.dict(
            "os.environ",
            {
                "CI": "true",
                "BUILD_ENVIRONMENT": BUILD_ENV,
                "TEST_CONFIG": TEST_CONFIG,
                "GITHUB_REPOSITORY": REPO,
                "GITHUB_WORKFLOW": WORKFLOW,
                "GITHUB_JOB": JOB,
                "GITHUB_RUN_ID": str(RUN_ID),
                "GITHUB_RUN_NUMBER": str(RUN_NUMBER),
                "GITHUB_RUN_ATTEMPT": str(RUN_ATTEMPT),
            },
        )
        patcher.start()
        self.addCleanup(patcher.stop)

    @mock.patch("boto3.Session.resource")
    def test_emit_metric(self, mock_resource: Any) -> None:
        """emit_metric should enrich the user's metric with CI metadata from env vars."""
        metric = {
            "some_number": 123,
            "float_number": 32.34,
        }

        # Querying for this instead of hard coding it b/c this will change
        # based on whether we run this test directly from python or from
        # pytest
        current_module = inspect.getmodule(inspect.currentframe()).__name__  # type: ignore[union-attr]

        emit_should_include = {
            "metric_name": "metric_name",
            "calling_file": "test_upload_stats_lib.py",
            "calling_module": current_module,
            "calling_function": "test_emit_metric",
            "repo": REPO,
            "workflow": WORKFLOW,
            "build_environment": BUILD_ENV,
            "job": JOB,
            "test_config": TEST_CONFIG,
            "run_id": RUN_ID,
            "run_number": RUN_NUMBER,
            "run_attempt": RUN_ATTEMPT,
            "some_number": 123,
            # Floats are stored in DynamoDB as Decimals; emit_metric is
            # expected to perform that conversion.
            "float_number": decimal.Decimal(str(32.34)),
        }

        # Preserve the metric emitted
        emitted_metric: Dict[str, Any] = {}

        def mock_put_item(Item: Dict[str, Any]) -> None:
            nonlocal emitted_metric
            emitted_metric = Item

        mock_resource.return_value.Table.return_value.put_item = mock_put_item

        emit_metric("metric_name", metric)

        # assertDictContainsSubset is deprecated (since Python 3.2); the
        # documented replacement is to compare against the merged dict,
        # which asserts that every key/value in emit_should_include is
        # present in emitted_metric.
        self.assertEqual(emitted_metric, {**emitted_metric, **emit_should_include})

    @mock.patch("boto3.resource")
    def test_blocks_emission_if_reserved_keyword_used(self, mock_resource: Any) -> None:
        """Metrics using reserved key names (e.g. 'repo') must be rejected."""
        metric = {"repo": "awesome/repo"}

        with self.assertRaises(ValueError):
            emit_metric("metric_name", metric)

    @mock.patch("boto3.resource")
    def test_no_metrics_emitted_if_env_var_not_set(self, mock_resource: Any) -> None:
        """Without the full set of CI env vars, emit_metric should be a no-op."""
        metric = {"some_number": 123}

        # clear=True wipes the env vars installed by setUp; register the
        # stop so the clearing does not leak past this test.
        env_patcher = mock.patch.dict(
            "os.environ",
            {
                "CI": "true",
                "BUILD_ENVIRONMENT": BUILD_ENV,
            },
            clear=True,
        )
        env_patcher.start()
        self.addCleanup(env_patcher.stop)

        put_item_invoked = False

        def mock_put_item(Item: Dict[str, Any]) -> None:
            nonlocal put_item_invoked
            put_item_invoked = True

        mock_resource.return_value.Table.return_value.put_item = mock_put_item

        emit_metric("metric_name", metric)

        self.assertFalse(put_item_invoked)

    def test_upload_to_rockset_batch_size(self) -> None:
        """Docs are split into batches of at most BATCH_SIZE per Rockset request."""
        cases = [
            {
                "batch_size": BATCH_SIZE - 1,
                "expected_number_of_requests": 1,
            },
            {
                "batch_size": BATCH_SIZE,
                "expected_number_of_requests": 1,
            },
            {
                "batch_size": BATCH_SIZE + 1,
                "expected_number_of_requests": 2,
            },
        ]

        for case in cases:
            batch_size = case["batch_size"]
            expected_number_of_requests = case["expected_number_of_requests"]

            # subTest labels each parametrized case so a failure reports
            # which batch size broke instead of just the first assertion.
            with self.subTest(batch_size=batch_size):
                mock_client = mock.Mock()
                mock_client.Documents.add_documents.return_value = "OK"

                docs = list(range(batch_size))
                upload_to_rockset(
                    collection="test", docs=docs, workspace="commons", client=mock_client
                )

                self.assertEqual(
                    mock_client.Documents.add_documents.call_count,
                    expected_number_of_requests,
                )
# Allow running this file directly (python test_upload_stats_lib.py) in
# addition to test-runner discovery (e.g. pytest).
if __name__ == "__main__":
    unittest.main()