[CI][run_test] Fix rerun logic for failing at exit (#155853)

Sometimes a test file reports success according to pytest but the process then fails at exit, and the rerun logic doesn't handle that correctly.

The name of the last test that ran is saved so that reruns can be more efficient: we target the last run test for a rerun instead of rerunning the entire file.  This is usually correct. For example, if a test fails and pytest catches it, lastrun is the test that failed; if a test segfaults (which pytest doesn't catch), lastrun is the test that segfaulted.  But sometimes pytest reports success while the process exits with a non-zero exit code.  The two cases I know of are hangs and double frees at exit.  In this case it's unclear which test caused the failure, so lastrun is set to the first test that ran in that session, so that the next session starts from the beginning in an attempt to reproduce the error (an alternate solution would be to just fail and not rerun, which might be the better option).  But the rerun then happens with run_single set, which prevents lastrun from being reset (I'm not sure why; I'm pretty sure there's no difference between resetting and not in the normal case), so lastrun becomes the last test that ran, and that is not always the test that caused the failure.  Then on the next run, the session starts from that last test and the process exits cleanly.
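The lastrun bookkeeping described above can be sketched roughly like this (hypothetical helper and names, for illustration only; the real logic lives in the stepcurrent pytest plugin):

```python
def choose_lastrun(pytest_success: bool, exit_code: int,
                   last_test: str, first_test: str) -> str:
    """Illustrative sketch of how the rerun target is chosen."""
    if pytest_success and exit_code != 0:
        # pytest saw no failure, but the process died afterwards
        # (hang, double free at exit). Blame is unclear, so start the
        # next session from the beginning of the file.
        return first_test
    # Normal path: the last test that ran is the one that failed
    # (pytest caught it) or crashed (segfault, not caught by pytest).
    return last_test
```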

Short-term solution here: ensure lastrun is always reset to the initial value when the session succeeds.  This is correct even on the normal path, because the initial value shouldn't change in that case.
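A minimal sketch of what the fix does, assuming a cache with get/set like pytest's built-in cache (class and method names here are illustrative, not the real plugin):

```python
class DictCache:
    """Stand-in for pytest's cache (get/set on string keys)."""
    def __init__(self):
        self._data = {}

    def set(self, key, value):
        self._data[key] = value

    def get(self, key, default=None):
        return self._data.get(key, default)


class StepcurrentSketch:
    """Illustrative sketch of the fixed sessionfinish behavior."""
    def __init__(self, cache, directory, initial_val):
        self.cache = cache
        self.directory = directory
        self.initial_val = initial_val

    def record_test(self, name):
        # Called as each test runs; saves it as the rerun target.
        self.cache.set(self.directory, name)

    def pytest_sessionfinish(self, session, exitstatus):
        # Reset unconditionally on success (no run_single check):
        # on the normal path initial_val is unchanged so this is a
        # no-op, and it prevents a stale lastrun after a clean rerun.
        if exitstatus == 0:
            self.cache.set(self.directory, self.initial_val)
```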

Things that still need to be fixed:
* log says "running single test", which is not true
* no xml reports get generated here
* also no xml reports get generated on segfault
* docs for this

I think I have a PR that fixes the above, but it's old, so I need to take another look.

Testing:
This is from when I was based on a commit that had a hang on Macs, before I added the skips in the inductor arrayref tests:
cc862d2c14

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155853
Approved by: https://github.com/malfet
Catherine Lee
2025-06-17 17:51:35 +00:00
committed by PyTorch MergeBot
parent 6629eaf0c6
commit 32c1611263
3 changed files with 29 additions and 2 deletions

View File

@@ -341,5 +341,5 @@ class StepcurrentPlugin:
         self.cache.set(self.directory, self.lastrun)
 
     def pytest_sessionfinish(self, session, exitstatus):
-        if exitstatus == 0 and not self.run_single:
+        if exitstatus == 0:
             self.cache.set(self.directory, self.initial_val)

View File

@@ -93,7 +93,7 @@ CPU_TEST_FAILURES = {
     ),
     # https://github.com/pytorch/pytorch/issues/129550
     # https://github.com/pytorch/pytorch/issues/123691
-    "test_dynamic_scalar": fail_minimal_arrayref_interface(is_skip=True),
+    "test_dynamic_scalar": fail_stack_allocation(is_skip=True),
     # https://github.com/pytorch/pytorch/issues/122980
     "test_fft_c2c": fail_stack_allocation(is_skip=True),
     "test_freezing": fail_minimal_arrayref_interface(is_skip=True),
@@ -169,6 +169,29 @@ CPU_TEST_FAILURES = {
     "test_symbool_item": fail_minimal_arrayref_interface(is_skip=True),
     # TODO: AttributeError: 'ShapeAsConstantBuffer' object has no attribute 'dtype'
     "test_symfloat_item": fail_minimal_arrayref_interface(is_skip=True),
+    # Causes a segfault when the process exits
+    "test_view_outputs": fail_stack_allocation(is_skip=True),
+    "test_pytree_inputs": fail_stack_allocation(is_skip=True),
+    "test_duplicated_params": fail_stack_allocation(is_skip=True),
+    "test_output_misaligned": fail_stack_allocation(is_skip=True),
+    "test_no_args": fail_stack_allocation(is_skip=True),
+    "test_fqn": fail_stack_allocation(is_skip=True),
+    "test_assert_tensor_meta": fail_stack_allocation(is_skip=True),
+    "test_clamp_decomposition": fail_stack_allocation(is_skip=True),
+    "test_aoti_constant_tensor_name_collision": fail_stack_allocation(is_skip=True),
+    "test_cond_unbacked_symint_closure_dynamic_False": fail_stack_allocation(
+        is_skip=True
+    ),
+    "test_empty_cat_dtype_promotion": fail_stack_allocation(is_skip=True),
+    "test_pad_fallback": fail_stack_allocation(is_skip=True),
+    "test_simple_embed_kernel_binary_False_max_autotune_True": fail_stack_allocation(
+        is_skip=True
+    ),
+    "test_simple_embed_kernel_binary_True_max_autotune_True": fail_stack_allocation(
+        is_skip=True
+    ),
+    # When running test_seq with test_issue_140766, the process segfaults
+    "test_seq": fail_stack_allocation(is_skip=True),
 }

View File

@@ -621,6 +621,7 @@ def run_test(
             stepcurrent_key,
             output,
             options.continue_through_error,
+            test_file,
         )
     else:
         command.extend([f"--sc={stepcurrent_key}", "--print-items"])
@@ -699,6 +700,7 @@ def run_test_retries(
     stepcurrent_key,
     output,
     continue_through_error,
+    test_file,
 ):
     # Run the test with -x to stop at first failure. Rerun the test by itself.
     # If it succeeds, move on to the rest of the tests in a new process. If it
@@ -774,6 +776,8 @@ def run_test_retries(
                 print_to_file("Retrying single test...")
             print_items = []  # do not continue printing them, massive waste of space
 
+    if "null" in num_failures:
+        num_failures[f"'{test_file}'"] = num_failures.pop("null")
     consistent_failures = [x[1:-1] for x in num_failures.keys() if num_failures[x] >= 3]
     flaky_failures = [x[1:-1] for x in num_failures.keys() if 0 < num_failures[x] < 3]
     if len(flaky_failures) > 0:
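The last hunk above reattributes failures that pytest couldn't pin to a specific test (stored under the key "null") to the test file itself, so retry counting still works. A rough sketch of that bookkeeping, with illustrative function names:

```python
def reattribute_null_failures(num_failures: dict, test_file: str) -> dict:
    """Move unattributed ("null") failures under the quoted file name
    so they count like any other failing item (sketch of the hunk above)."""
    if "null" in num_failures:
        num_failures[f"'{test_file}'"] = num_failures.pop("null")
    return num_failures


def classify(num_failures: dict):
    """Split items into consistent (>= 3 failures) and flaky (1-2),
    stripping the surrounding quotes, as the retry logic does."""
    consistent = [x[1:-1] for x in num_failures if num_failures[x] >= 3]
    flaky = [x[1:-1] for x in num_failures if 0 < num_failures[x] < 3]
    return consistent, flaky
```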