Compare commits

...

13 Commits

SHA1 Message Date
9b1953d29e undo failure 2025-11-10 14:58:31 -08:00
874ba0a6b4 better message 2025-11-07 09:50:14 -08:00
1868ca8b6a try to extract class name 2025-11-07 09:43:06 -08:00
2228067938 typo again 2025-11-07 08:25:02 -08:00
56ea1aec79 cache val 2025-11-06 20:43:45 -08:00
c8324d30f6 typo again 2025-11-06 18:27:21 -08:00
26568b469d fix path 2025-11-06 15:44:06 -08:00
3e7188c5e7 lint 2025-11-06 14:54:20 -08:00
6a3694adb0 more general 2025-11-06 14:51:58 -08:00
24c259cd6c tc 2025-11-06 14:13:31 -08:00
9cc873e582 lint 2025-11-06 14:02:27 -08:00
cb574c7adb lint, int, fail 2025-11-06 13:48:52 -08:00
44cfeec317 tc 2025-11-06 13:44:53 -08:00
3 changed files with 81 additions and 11 deletions

View File

@@ -308,12 +308,16 @@ class StepcurrentPlugin:
         self.report_status = ""
         assert config.cache is not None
         self.cache: pytest.Cache = config.cache
-        self.directory = f"{STEPCURRENT_CACHE_DIR}/{config.getoption('stepcurrent')}"
-        self.lastrun: Optional[str] = self.cache.get(self.directory, None)
+        directory = f"{STEPCURRENT_CACHE_DIR}/{config.getoption('stepcurrent')}"
+        self.lastrun_location = f"{directory}/lastrun"
+        self.lastrun: Optional[str] = self.cache.get(self.lastrun_location, None)
         self.initial_val = self.lastrun
         self.skip: bool = config.getoption("stepcurrent_skip")
         self.run_single: bool = config.getoption("run_single")
+
+        self.made_failing_xml_location = f"{directory}/made_failing_xml"
+        self.cache.set(self.made_failing_xml_location, False)
 
     def pytest_collection_modifyitems(self, config: Config, items: list[Any]) -> None:
         if not self.lastrun:
             self.report_status = "Cannot find last run test, not skipping"
@@ -349,8 +353,10 @@ class StepcurrentPlugin:
 
     def pytest_runtest_protocol(self, item, nextitem) -> None:
         self.lastrun = item.nodeid
-        self.cache.set(self.directory, self.lastrun)
+        self.cache.set(self.lastrun_location, self.lastrun)
 
     def pytest_sessionfinish(self, session, exitstatus):
         if exitstatus == 0:
-            self.cache.set(self.directory, self.initial_val)
+            self.cache.set(self.lastrun_location, self.initial_val)
+        if exitstatus != 0:
+            self.cache.set(self.made_failing_xml_location, True)
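
For context on the cache keys introduced above: pytest's built-in cache JSON-encodes each value into a file named after its key under .pytest_cache/v/, so the plugin's ".../lastrun" and ".../made_failing_xml" keys become plain files that run_test.py (next file) reads back as raw text. A minimal sketch of that mapping, assuming STEPCURRENT_CACHE_DIR is a key prefix such as "cache/stepcurrent" (consistent with the path hard-coded in run_test.py below, but not shown in this diff):

    import json
    from pathlib import Path

    STEPCURRENT_CACHE_DIR = "cache/stepcurrent"  # assumed value; defined elsewhere in the plugin

    def cache_file_for(rootdir: Path, key: str) -> Path:
        # pytest.Cache.set(key, value) JSON-encodes value into <rootdir>/.pytest_cache/v/<key>
        return rootdir / ".pytest_cache" / "v" / key

    # A hypothetical stepcurrent key, as passed via --sc=<stepcurrent_key>
    path = cache_file_for(Path("."), f"{STEPCURRENT_CACHE_DIR}/my-sc-key/lastrun")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(None))    # what cache.set(lastrun_location, None) would store
    assert path.read_text() == "null"    # hence the string comparison against "null" in run_test.py
    assert json.dumps(False) == "false"  # and made_failing_xml reads back as the text "false"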

View File

@@ -78,6 +78,7 @@ from tools.testing.test_selections import (
 try:
     from tools.testing.upload_artifacts import (
         parse_xml_and_upload_json,
+        upload_adhoc_failure_json,
         zip_and_upload_artifacts,
     )
 except ImportError:
@@ -87,7 +88,10 @@ except ImportError:
 
     def parse_xml_and_upload_json():
         pass
 
-    def zip_and_upload_artifacts(failed: bool):
+    def zip_and_upload_artifacts(*args, **kwargs):
         pass
+
+    def upload_adhoc_failure_json(*args, **kwargs):
+        pass
 
@@ -642,6 +646,7 @@ def run_test(
             output,
             options.continue_through_error,
             test_file,
+            options,
         )
     else:
         command.extend([f"--sc={stepcurrent_key}", "--print-items"])
@@ -728,6 +733,7 @@ def run_test_retries(
     output,
     continue_through_error,
     test_file,
+    options,
 ):
     # Run the test with -x to stop at first failure. Rerun the test by itself.
     # If it succeeds, move on to the rest of the tests in a new process. If it
@@ -746,6 +752,16 @@
 
     num_failures = defaultdict(int)
 
+    def read_pytest_cache(key: str) -> Any:
+        cache_file = (
+            REPO_ROOT / ".pytest_cache/v/cache/stepcurrent" / stepcurrent_key / key
+        )
+        try:
+            with open(cache_file) as f:
+                return f.read()
+        except FileNotFoundError:
+            return None
+
     print_items = ["--print-items"]
     sc_command = f"--sc={stepcurrent_key}"
     while True:
@@ -766,12 +782,11 @@
 
         # Read what just failed/ran
         try:
-            with open(
-                REPO_ROOT / ".pytest_cache/v/cache/stepcurrent" / stepcurrent_key
-            ) as f:
-                current_failure = f.read()
-            if current_failure == "null":
-                current_failure = f"'{test_file}'"
+            current_failure = read_pytest_cache("lastrun")
+            if current_failure is None:
+                raise FileNotFoundError
+            if current_failure == "null":
+                current_failure = f"'{test_file}'"
         except FileNotFoundError:
             print_to_file(
                 "No stepcurrent file found. Either pytest didn't get to run (e.g. import error)"
@@ -794,6 +809,13 @@
             # This is for log classifier so it can prioritize consistently
             # failing tests instead of reruns. [1:-1] to remove quotes
             print_to_file(f"FAILED CONSISTENTLY: {current_failure[1:-1]}")
+            if (
+                read_pytest_cache("made_failing_xml") == "false"
+                and IS_CI
+                and options.upload_artifacts_while_running
+            ):
+                upload_adhoc_failure_json(test_file, current_failure[1:-1])
+
             if not continue_through_error:
                 print_to_file("Stopping at first consistent failure")
                 break
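
A short worked example of the "[1:-1] to remove quotes" comment in the hunk above: the plugin stores the nodeid through pytest's cache, so the raw file text returned by read_pytest_cache() keeps the JSON double quotes, while the "null" fallback wraps the test file in single quotes, so slicing off the first and last character handles both cases. The nodeid and file name here are hypothetical:

    import json

    # Raw text of the lastrun cache file for a stored nodeid.
    raw = json.dumps("test/test_ops.py::TestCommon::test_out")
    assert raw == '"test/test_ops.py::TestCommon::test_out"'
    assert raw[1:-1] == "test/test_ops.py::TestCommon::test_out"

    # The current_failure == "null" branch wraps the file name in single quotes instead.
    test_file = "test_ops"
    fallback = f"'{test_file}'"
    assert fallback[1:-1] == "test_ops"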

View File

@@ -208,3 +208,45 @@ def parse_xml_and_upload_json() -> None:
             lock.release()
     except Exception as e:
         print(f"Failed to parse and upload json test reports: {e}")
+
+
+def upload_adhoc_failure_json(invoking_file: str, current_failure: str) -> None:
+    """
+    Manually upload a json to s3 indicating that the entire test file failed,
+    since xml was probably not generated in this case.
+    """
+    try:
+        job_id = int(os.environ["JOB_ID"])
+        workflow_id = int(os.environ["GITHUB_RUN_ID"])
+    except Exception as e:
+        print(f"Failed to get job_id or workflow_id: {e}")
+        return
+
+    split_failure = current_failure.split("::")
+    if len(split_failure) >= 2:
+        className = split_failure[-2]
+        testName = split_failure[-1]
+    else:
+        testName = current_failure
+        className = ""
+    message = "The test file failed but pytest did not generate xml. The most likely cause is a segfault"
+    j = {
+        "invoking_file": invoking_file,
+        "file": f"{invoking_file}.py",
+        "name": testName,
+        "classname": className,
+        "workflow_id": workflow_id,
+        "workflow_run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+        "job_id": job_id,
+        "failure": {"message": message, "text": message},
+    }
+
+    gzipped = gzip.compress(json.dumps(j).encode("utf-8"))
+    s3_key = f"{invoking_file.replace('/', '_')}_{os.urandom(8).hex()}.json"
+    get_s3_resource().put_object(
+        Body=gzipped,
+        Bucket="gha-artifacts",
+        Key=f"test_jsons_while_running/{workflow_id}/{job_id}/{s3_key}",
+        ContentType="application/json",
+        ContentEncoding="gzip",
+    )
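
To illustrate the classname/name extraction in upload_adhoc_failure_json (the "try to extract class name" commit): a consistent failure that is a full pytest nodeid splits on "::" into a class and a test name, while anything else falls back to an empty classname. A small sketch with hypothetical values, using a standalone helper name purely for illustration:

    def split_failure_for_report(current_failure: str) -> tuple[str, str]:
        # Same splitting rule as upload_adhoc_failure_json, pulled out for illustration.
        parts = current_failure.split("::")
        if len(parts) >= 2:
            return parts[-2], parts[-1]  # classname, test name
        return "", current_failure

    assert split_failure_for_report("test/test_ops.py::TestCommon::test_out") == ("TestCommon", "test_out")
    assert split_failure_for_report("test_ops") == ("", "test_ops")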