I guess this is more of an RFC
Goal:
Enable `keep-going` so that we get information about failures immediately. We want to be aware of failures as soon as possible, especially on the main branch, so that reverts can happen quickly.
Proposal:
A job with `keep-going` will continue through errors in `python run_test.py`. If a test fails, then before the next test runs, the job will upload a fake log with enough information that viewing it tells you what failed (including any stack traces/error logs), and that the log classifier can parse to pick out a line.
I build the log by concatenating the test logs in `test/test-reports`, which contain all the text output by pytest (unless someone runs with the `ci-verbose-test-logs` label). There are obviously many things this won't catch, e.g. output outside of `run_test.py` and some output inside of `run_test.py`, but it should be enough. A sketch of how this hooks into the test loop is below.
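Roughly, the keep-going loop would look like this (a sketch only: it assumes the module below lands at `tools/stats/upload_artifacts.py`, and `run_one_test`/`run_tests_keep_going` are made-up stand-ins for whatever `run_test.py` actually does):

```python
import subprocess
import sys

# Assumed import path for the module listed below.
from tools.stats.upload_artifacts import zip_and_upload_artifacts


def run_one_test(test: str) -> bool:
    """Hypothetical stand-in: run a single test file, return True if it failed."""
    proc = subprocess.run([sys.executable, "-m", "pytest", f"test/{test}.py"])
    return proc.returncode != 0


def run_tests_keep_going(selected_tests: list[str]) -> None:
    failures = []
    for test in selected_tests:
        failed = run_one_test(test)
        if failed:
            failures.append(test)
        # On failure this uploads the fake log immediately; otherwise it only
        # re-uploads if 20 minutes have passed (see zip_and_upload_artifacts).
        zip_and_upload_artifacts(failed=failed)
    if failures:
        raise RuntimeError(f"{len(failures)} test file(s) failed: {failures}")
```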
Normally, after a job finishes, its raw log is eventually uploaded to the ossci-raw-job-status s3 bucket, and the log classifier reads it from there to do classification. This means we will have to change the log classifier to also read from the bucket where these fake logs go.
I'm thinking we just add an input parameter to the log classifier, like https://github.com/pytorch/test-infra/pull/6723/files. A fetch sketch is below.
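For reference, reading the fake log back out would look something like this (the bucket and key match the upload code below; everything else is a sketch):

```python
import gzip

import boto3


def fetch_fake_log(job_id: str) -> str:
    """Fetch the fake log that upload_to_s3_artifacts() below writes."""
    s3 = boto3.client("s3")
    obj = s3.get_object(Bucket="gha-artifacts", Key=f"temp_logs/{job_id}")
    # The object is uploaded with ContentEncoding=gzip, so decompress by hand.
    return gzip.decompress(obj["Body"].read()).decode("utf-8")
```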
Also, have it upload the temp results to a temp attribute instead of the real one.
To overwrite the conclusion on HUD, I'm thinking of a lambda with an S3 put trigger on the fake log landing in s3. It would do something similar to the log classifier, which just mutates the entry (13a990b678/aws/lambda/log-classifier/src/network.rs (L85)), but add a new field like `"will_fail": true`, and also trigger the log classifier to run. A rough sketch follows.
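Something like this, as a sketch only (`CLASSIFIER_FUNCTION` and `mark_job_will_fail` are placeholders, since where `"will_fail"` actually lives is exactly the open question here):

```python
import json
from typing import Any

import boto3

CLASSIFIER_FUNCTION = "log-classifier"  # hypothetical lambda name


def mark_job_will_fail(job_id: str) -> None:
    """Hypothetical: mutate the job's entry to add {"will_fail": true},
    similar to how the log classifier writes its match in network.rs."""
    ...


def lambda_handler(event: dict[str, Any], context: Any) -> None:
    for record in event["Records"]:
        # S3 put events carry the bucket/key of the uploaded fake log,
        # e.g. key = "temp_logs/12345" where 12345 is the job id.
        key = record["s3"]["object"]["key"]
        job_id = key.rsplit("/", 1)[-1]

        mark_job_will_fail(job_id)

        # Fire-and-forget: ask the log classifier to classify this job
        # using the fake log.
        boto3.client("lambda").invoke(
            FunctionName=CLASSIFIER_FUNCTION,
            InvocationType="Event",
            Payload=json.dumps({"job_id": job_id, "temp_log": True}).encode(),
        )
```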
Then we change HUD/ClickHouse to point the raw log URL at the alternate location, use the new `"will_fail"` field as the conclusion, and use the temp log classifier result if needed.
Why always write to a temp attribute/column? Because I am unsure about overwriting the real results with fake ones.
Pros:
Not many changes outside of HUD/UI
Cons:
Lots of moving parts; lots of temp fields that will require adjusting queries, and temp fields never really get deleted.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155371
Approved by: https://github.com/malfet
141 lines · 4.8 KiB · Python
import glob
import gzip
import os
import time
import zipfile
from functools import lru_cache
from pathlib import Path
from typing import Any


REPO_ROOT = Path(__file__).resolve().parent.parent.parent
LAST_UPDATED = 0.0
LOG_BUCKET_PREFIX = "temp_logs"


@lru_cache(maxsize=1)
def get_s3_resource() -> Any:
    # Imported lazily so the module can be imported without boto3 installed.
    import boto3  # type: ignore[import]

    return boto3.client("s3")


def zip_artifact(file_name: str, paths: list[str]) -> None:
    """Zip the files in the paths listed into file_name. The paths will be used
    in a glob and should be relative to REPO_ROOT."""
    with zipfile.ZipFile(file_name, "w") as f:
        for path in paths:
            for file in glob.glob(f"{REPO_ROOT}/{path}", recursive=True):
                # Store entries relative to the repo root inside the zip.
                f.write(file, os.path.relpath(file, REPO_ROOT))


def concated_logs() -> str:
    """Concatenate all the logs in the test-reports directory into a single string."""
    logs = []
    for log_file in glob.glob(
        f"{REPO_ROOT}/test/test-reports/**/*.log", recursive=True
    ):
        logs.append(f"=== {log_file} ===")
        with open(log_file) as f:
            # For every line, prefix with fake timestamp for log classifier
            for line in f:
                line = line.rstrip("\n")  # Remove any trailing newline
                logs.append(f"2020-01-01T00:00:00.0000000Z {line}")
    return "\n".join(logs)


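# For illustration, concated_logs() produces something like the following
# (hypothetical output, not from a real run):
#
#   === /.../test/test-reports/python-pytest/test_ops/test_ops.log ===
#   2020-01-01T00:00:00.0000000Z ===== test session starts =====
#   2020-01-01T00:00:00.0000000Z FAILED test_ops.py::TestOps::test_add
#
# The fake timestamp mimics the prefix GitHub puts on real job log lines, so
# the log classifier can strip it the same way it does for real logs.

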
def upload_to_s3_artifacts(failed: bool) -> None:
    """Zip the test reports, logs, and jsons and upload them to S3."""
    workflow_id = os.environ.get("GITHUB_RUN_ID")
    workflow_run_attempt = os.environ.get("GITHUB_RUN_ATTEMPT")
    file_suffix = os.environ.get("ARTIFACTS_FILE_SUFFIX")
    job_id = os.environ.get("JOB_ID")
    if not workflow_id or not workflow_run_attempt or not file_suffix:
        print(
            "GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT, or ARTIFACTS_FILE_SUFFIX not set, not uploading"
        )
        return

    test_reports_zip_path = f"{REPO_ROOT}/test-reports-{file_suffix}.zip"
    zip_artifact(
        test_reports_zip_path,
        ["test/test-reports/**/*.xml", "test/test-reports/**/*.csv"],
    )
    test_logs_zip_path = f"{REPO_ROOT}/logs-{file_suffix}.zip"
    zip_artifact(test_logs_zip_path, ["test/test-reports/**/*.log"])
    jsons_zip_path = f"{REPO_ROOT}/test-jsons-{file_suffix}.zip"
    zip_artifact(jsons_zip_path, ["test/test-reports/**/*.json"])

    s3_prefix = f"pytorch/pytorch/{workflow_id}/{workflow_run_attempt}/artifact"
    get_s3_resource().upload_file(
        test_reports_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(test_reports_zip_path).name}",
    )
    get_s3_resource().upload_file(
        test_logs_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(test_logs_zip_path).name}",
    )
    get_s3_resource().upload_file(
        jsons_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(jsons_zip_path).name}",
    )
    # Empty marker object flagging this workflow as failing with uploads pending.
    get_s3_resource().put_object(
        Body=b"",
        Bucket="gha-artifacts",
        Key=f"workflows_failing_pending_upload/{workflow_id}.txt",
    )
    if job_id and failed:
        logs = concated_logs()
        # Put logs into bucket so log classifier can access them. We cannot get
        # the actual GH logs so this will have to be a proxy.
        print(f"Uploading logs for {job_id} to S3")
        get_s3_resource().put_object(
            Body=gzip.compress(logs.encode("utf-8")),
            Bucket="gha-artifacts",
            Key=f"{LOG_BUCKET_PREFIX}/{job_id}",
            ContentType="text/plain",
            ContentEncoding="gzip",
        )


def zip_and_upload_artifacts(failed: bool) -> None:
    # Upload if a test failed or if 20 minutes have passed since the last
    # upload. Not thread safe, but correctness of the LAST_UPDATED var doesn't
    # really matter for this.
    global LAST_UPDATED

    if failed or time.time() - LAST_UPDATED > 20 * 60:
        start = time.time()
        try:
            upload_to_s3_artifacts(failed=failed)
            LAST_UPDATED = time.time()
        except Exception as e:
            print(f"Failed to upload artifacts: {e}")
        print(f"Uploading artifacts took {time.time() - start:.2f} seconds")


def trigger_upload_test_stats_intermediate_workflow() -> None:
    # The GITHUB_TOKEN cannot trigger workflows, so this isn't used for now.
    import requests

    print("Triggering upload_test_stats_intermediate workflow")
    x = requests.post(
        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",  # noqa: B950 @lint-ignore
        headers={
            "Accept": "application/vnd.github.v3+json",
            "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",
        },
        json={
            "ref": "main",
            "inputs": {
                "workflow_run_id": os.environ.get("GITHUB_RUN_ID"),
                "workflow_run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
            },
        },
    )
    print(x.text)
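For local experimentation, the upload path above could be exercised like this (a sketch only: all values are made up, the import path is assumed, and the calls will fail without AWS credentials for the gha-artifacts bucket):

```python
import os

# Mirror what GitHub Actions would set; these specific values are made up.
os.environ.setdefault("GITHUB_RUN_ID", "1234567890")
os.environ.setdefault("GITHUB_RUN_ATTEMPT", "1")
os.environ.setdefault("ARTIFACTS_FILE_SUFFIX", "test-default-1-1-linux")
os.environ.setdefault("JOB_ID", "9876543210")

from tools.stats.upload_artifacts import zip_and_upload_artifacts  # assumed path

# failed=True bypasses the 20-minute throttle and also uploads the fake log
# for JOB_ID under the temp_logs/ prefix.
zip_and_upload_artifacts(failed=True)
```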