Files
pytorch/test/conftest.py
Huy Do 6e3e3dd477 Do not collect and skip non-disabled tests when rerunning disabled tests (#102107)
The console log blows up too much when running in rerun disabled tests mode (x50) e132f09e88. Each log is around 1GB and the whole set of uncompressed logs is ~50GB. After compression, it is still around 1GB, which is too big. The increase comes mainly from the multiple SKIPPED messages for non-disabled tests, which is expected given how SkipTest and pytest-flakyfinder currently work.

I updated `test/conftest.py` to completely ignore skipped tests when rerunning disabled tests, instead of collecting and then skipping them 50 times each (see the simplified sketch after the list below). The benefit of doing this is much greater than I originally expected:
  * Rerun disabled tests jobs now finish in less than half an hour, as they should
  * Fix OOM runner crashes caused by too many collected tests
  * Fix the verbosity issue, as only disabled tests are now run x50. There are only a few hundred of them at the moment
  * Fix the timeout issue when rerunning disabled distributed and ASAN tests. They are just too slow when run at x50
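
As a simplified sketch of the idea: the change boils down to a `pytest_collection_modifyitems` hook that runs last and drops every collected item that is not in the disabled-tests list. The actual hook in `test/conftest.py` (shown later on this page) also parses the disabled-tests file and copies/re-initializes the kept items; `is_disabled` below is a hypothetical stand-in for that lookup.

```
import os
from typing import Any, List

import pytest


@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(items: List[Any]) -> None:
    # Only filter when rerunning disabled tests (PYTORCH_TEST_RERUN_DISABLED_TESTS is set in that mode).
    if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") != "1":
        return
    # is_disabled is a hypothetical helper that looks the test up in the disabled-tests file.
    kept = [item for item in items if is_disabled(item.parent.name, item.name)]
    # Edit the list in place so the filtered collection is reflected back to pytest.
    items[:] = kept
```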

### Testing

When rerunning disabled tests in https://github.com/pytorch/pytorch/actions/runs/5084508614, only the disabled tests on the platform are run. For example, `test_ops_jit` in https://ossci-raw-job-status.s3.amazonaws.com/log/13770164954 ran only 100 test instances (`test_variant_consistency_jit_linalg_lu_cuda_float32` + `test_variant_consistency_jit_linalg_lu_factor_cuda_complex64`, each x50).

```
Executing ['/opt/conda/envs/py_3.10/bin/python', '-bb', 'test_ops_jit.py', '--shard-id=1', '--num-shards=2', '-v', '-vv', '-rfEX', '-p', 'no:xdist', '--use-pytest', '--sc=test_ops_jit_1', '--flake-finder', '--flake-runs=50', '--import-slow-tests', '--import-disabled-tests', '--rerun-disabled-tests'] ... [2023-05-25 21:32:49.763856]

Expand the folded group to see the log file of test_ops_jit 2/2
##[group]PRINTING LOG FILE of test_ops_jit 2/2 (/var/lib/jenkins/workspace/test/test-reports/test_ops_jit_h2wr_t2c.log)
Test results will be stored in test-reports/python-pytest/test_ops_jit/test_ops_jit-51a83bd44549074e.xml
============================= test session starts ==============================
platform linux -- Python 3.10.11, pytest-7.3.1, pluggy-1.0.0 -- /opt/conda/envs/py_3.10/bin/python
cachedir: .pytest_cache
hypothesis profile 'pytorch_ci' -> database=None, max_examples=50, derandomize=True, suppress_health_check=[HealthCheck.too_slow]
rootdir: /var/lib/jenkins/workspace
configfile: pytest.ini
plugins: hypothesis-5.35.1, cpp-2.3.0, flakefinder-1.1.0, rerunfailures-11.1.2, shard-0.1.2, xdist-3.3.0, xdoctest-1.1.0
collecting ... collected 1084 items
Running 100 items in this shard: test/test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_cuda_float32 (x50), test/test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_factor_cuda_complex64 (x50)
stepcurrent: Cannot find last run test, not skipping

test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_cuda_float32 PASSED [2.1876s] [  1%]
test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_factor_cuda_complex64 PASSED [4.5615s] [  2%]
```

* [pull](https://github.com/pytorch/pytorch/actions/runs/5093566864)
* [trunk](https://github.com/pytorch/pytorch/actions/runs/5095364311)
* [periodic](https://github.com/pytorch/pytorch/actions/runs/5095378850)
* [slow](https://github.com/pytorch/pytorch/actions/runs/5095390285)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/102107
Approved by: https://github.com/clee2000, https://github.com/malfet
2023-05-27 12:10:36 +00:00

pytorch/test/conftest.py (313 lines, 11 KiB, Python):

from _pytest.junitxml import LogXML, _NodeReporter, bin_xml_escape
from _pytest.terminal import _get_raw_skip_reason
from _pytest.stash import StashKey
from _pytest.reports import TestReport
from _pytest.config.argparsing import Parser
from _pytest.config import filename_arg
from _pytest.config import Config
from _pytest._code.code import ReprFileLocation
from _pytest.python import Module
from typing import Any, List, Union
from typing import Optional
from types import MethodType
import xml.etree.ElementTree as ET
import functools
import pytest
import sys
import os
import copy
import json
import re
from collections import defaultdict
# a lot of this file is copied from _pytest.junitxml and modified to get rerun info
xml_key = StashKey["LogXMLReruns"]()
STEPCURRENT_CACHE_DIR = "cache/stepcurrent"


def pytest_addoption(parser: Parser) -> None:
    group = parser.getgroup("general")
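    # --sc / --scs select a "stepcurrent" cache key; StepcurrentPlugin (at the bottom of
    # this file) uses it to record the test that is currently running, so a subsequent
    # run can resume from that test (--scs resumes after it instead of re-running it).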
    group.addoption(
        "--scs",
        action="store",
        default=None,
        dest="stepcurrent_skip",
    )
    group.addoption(
        "--sc",
        action="store",
        default=None,
        dest="stepcurrent",
    )

    parser.addoption("--use-main-module", action='store_true')

    group = parser.getgroup("terminal reporting")
    group.addoption(
        "--junit-xml-reruns",
        action="store",
        dest="xmlpath_reruns",
        metavar="path",
        type=functools.partial(filename_arg, optname="--junit-xml-reruns"),
        default=None,
        help="create junit-xml style report file at given path.",
    )
    group.addoption(
        "--junit-prefix-reruns",
        action="store",
        metavar="str",
        default=None,
        help="prepend prefix to classnames in junit-xml output",
    )
    parser.addini(
        "junit_suite_name_reruns", "Test suite name for JUnit report", default="pytest"
    )
    parser.addini(
        "junit_logging_reruns",
        "Write captured log messages to JUnit report: "
        "one of no|log|system-out|system-err|out-err|all",
        default="no",
    )
    parser.addini(
        "junit_log_passing_tests_reruns",
        "Capture log information for passing tests to JUnit report: ",
        type="bool",
        default=True,
    )
    parser.addini(
        "junit_duration_report_reruns",
        "Duration time to report: one of total|call",
        default="total",
    )
    parser.addini(
        "junit_family_reruns",
        "Emit XML for schema: one of legacy|xunit1|xunit2",
        default="xunit2",
    )


def pytest_configure(config: Config) -> None:
    xmlpath = config.option.xmlpath_reruns
    # Prevent opening xmllog on worker nodes (xdist).
    if xmlpath and not hasattr(config, "workerinput"):
        junit_family = config.getini("junit_family_reruns")
        config.stash[xml_key] = LogXMLReruns(
            xmlpath,
            config.option.junitprefix,
            config.getini("junit_suite_name_reruns"),
            config.getini("junit_logging_reruns"),
            config.getini("junit_duration_report_reruns"),
            junit_family,
            config.getini("junit_log_passing_tests_reruns"),
        )
        config.pluginmanager.register(config.stash[xml_key])
    if config.getoption("stepcurrent_skip"):
        config.option.stepcurrent = config.getoption("stepcurrent_skip")
    if config.getoption("stepcurrent"):
        config.pluginmanager.register(StepcurrentPlugin(config), "stepcurrentplugin")


def pytest_unconfigure(config: Config) -> None:
    xml = config.stash.get(xml_key, None)
    if xml:
        del config.stash[xml_key]
        config.pluginmanager.unregister(xml)
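

# Node reporter that writes report section content as-is (rather than the base class's
# header-wrapped formatting) and skips empty sections entirely.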
class _NodeReporterReruns(_NodeReporter):
    def _prepare_content(self, content: str, header: str) -> str:
        return content

    def _write_content(self, report: TestReport, content: str, jheader: str) -> None:
        if content == "":
            return
        tag = ET.Element(jheader)
        tag.text = bin_xml_escape(content)
        self.append(tag)


class LogXMLReruns(LogXML):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def append_rerun(self, reporter: _NodeReporter, report: TestReport) -> None:
        if hasattr(report, "wasxfail"):
            reporter._add_simple("skipped", "xfail-marked test passes unexpectedly")
        else:
            assert report.longrepr is not None
            reprcrash: Optional[ReprFileLocation] = getattr(
                report.longrepr, "reprcrash", None
            )
            if reprcrash is not None:
                message = reprcrash.message
            else:
                message = str(report.longrepr)
            message = bin_xml_escape(message)
            reporter._add_simple("rerun", message, str(report.longrepr))

    def pytest_runtest_logreport(self, report: TestReport) -> None:
        super().pytest_runtest_logreport(report)
        if report.outcome == "rerun":
            reporter = self._opentestcase(report)
            self.append_rerun(reporter, report)
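        # Rewrite the skip reason to include the test's nodeid so skipped tests are
        # easier to identify in the XML report.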
        if report.outcome == "skipped":
            if isinstance(report.longrepr, tuple):
                fspath, lineno, reason = report.longrepr
                reason = f"{report.nodeid}: {_get_raw_skip_reason(report)}"
                report.longrepr = (fspath, lineno, reason)

    def node_reporter(self, report: Union[TestReport, str]) -> _NodeReporterReruns:
        nodeid: Union[str, TestReport] = getattr(report, "nodeid", report)
        # Local hack to handle xdist report order.
        workernode = getattr(report, "node", None)
        key = nodeid, workernode
        if key in self.node_reporters:
            # TODO: breaks for --dist=each
            return self.node_reporters[key]
        reporter = _NodeReporterReruns(nodeid, self)
        self.node_reporters[key] = reporter
        self.node_reporters_ordered.append(reporter)
        return reporter


# imitating summary_failures in pytest's terminal.py
# both hookwrapper and tryfirst to make sure this runs before pytest's
@pytest.hookimpl(hookwrapper=True, tryfirst=True)
def pytest_terminal_summary(terminalreporter, exitstatus, config):
    # prints stack traces for reruns
    if terminalreporter.config.option.tbstyle != "no":
        reports = terminalreporter.getreports("rerun")
        if reports:
            terminalreporter.write_sep("=", "RERUNS")
            if terminalreporter.config.option.tbstyle == "line":
                for rep in reports:
                    line = terminalreporter._getcrashline(rep)
                    terminalreporter.write_line(line)
            else:
                for rep in reports:
                    msg = terminalreporter._getfailureheadline(rep)
                    terminalreporter.write_sep("_", msg, red=True, bold=True)
                    terminalreporter._outrep_summary(rep)
                    terminalreporter._handle_teardown_sections(rep.nodeid)
    yield


@pytest.hookimpl(tryfirst=True)
def pytest_pycollect_makemodule(module_path, path, parent) -> Module:
    if parent.config.getoption("--use-main-module"):
        mod = Module.from_parent(parent, path=module_path)
        mod._getobj = MethodType(lambda x: sys.modules['__main__'], mod)
        return mod


@pytest.hookimpl(hookwrapper=True)
def pytest_report_teststatus(report, config):
    # Add the test time to the verbose output; unfortunately I don't think this
    # includes setup or teardown
    pluggy_result = yield
    if not isinstance(report, pytest.TestReport):
        return
    outcome, letter, verbose = pluggy_result.get_result()
    if verbose:
        pluggy_result.force_result(
            (outcome, letter, f"{verbose} [{report.duration:.4f}s]")
        )


@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(items: List[Any]) -> None:
    """
    This hook is used when rerunning disabled tests to get rid of all skipped tests
    instead of running and skipping them N times. This avoids flooding the console
    and XML outputs with junk. So we want this to run last when collecting tests.
    """
    rerun_disabled_tests = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1"
    if not rerun_disabled_tests:
        return

    disabled_regex = re.compile(r"(?P<test_name>.+)\s+\([^\.]+\.(?P<test_class>.+)\)")
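    # Entries in the disabled tests file look like "test_name (SomeModule.TestClass)",
    # e.g. "test_variant_consistency_jit_linalg_lu_cuda_float32 (__main__.TestJitCUDA)";
    # the regex above splits them into the test name and the test class.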
    disabled_tests = defaultdict(set)

    # This environment variable has already been set by run_test before it calls pytest
    disabled_tests_file = os.getenv("DISABLED_TESTS_FILE", "")
    if not disabled_tests_file or not os.path.exists(disabled_tests_file):
        return

    with open(disabled_tests_file) as fp:
        for disabled_test in json.load(fp):
            m = disabled_regex.match(disabled_test)
            if m:
                test_name = m["test_name"]
                test_class = m["test_class"]
                disabled_tests[test_class].add(test_name)

    # When rerunning disabled tests, ignore all test cases that are not disabled
    filtered_items = []
    for item in items:
        test_name = item.name
        test_class = item.parent.name

        if (
            test_class not in disabled_tests
            or test_name not in disabled_tests[test_class]
        ):
            continue

        cpy = copy.copy(item)
        cpy._initrequest()
        filtered_items.append(cpy)

    items.clear()
    # NB: Need to edit items directly here to have the list reflected back to pytest
    items.extend(filtered_items)


class StepcurrentPlugin:
    # Modified from _pytest/stepwise.py in order to save the currently running
    # test instead of the last failed test
    def __init__(self, config: Config) -> None:
        self.config = config
        self.report_status = ""
        assert config.cache is not None
        self.cache: pytest.Cache = config.cache
        self.directory = f"{STEPCURRENT_CACHE_DIR}/{config.getoption('stepcurrent')}"
        self.lastrun: Optional[str] = self.cache.get(self.directory, None)
        self.skip: bool = config.getoption("stepcurrent_skip")

    def pytest_collection_modifyitems(self, config: Config, items: List[Any]) -> None:
        if not self.lastrun:
            self.report_status = "Cannot find last run test, not skipping"
            return

        # check all item nodes until we find a match on last run
        failed_index = None
        for index, item in enumerate(items):
            if item.nodeid == self.lastrun:
                failed_index = index
                if self.skip:
                    failed_index += 1
                break

        # If the previously failed test was not found among the test items,
        # do not skip any tests.
        if failed_index is None:
            self.report_status = "previously run test not found, not skipping."
        else:
            self.report_status = f"skipping {failed_index} already run items."
            deselected = items[:failed_index]
            del items[:failed_index]
            config.hook.pytest_deselected(items=deselected)

    def pytest_report_collectionfinish(self) -> Optional[str]:
        if self.config.getoption("verbose") >= 0 and self.report_status:
            return f"stepcurrent: {self.report_status}"
        return None
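
    # Record the test that is about to run so that, if the process crashes or is
    # killed, the next invocation with --sc/--scs can resume from this point.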
    def pytest_runtest_protocol(self, item, nextitem) -> None:
        self.lastrun = item.nodeid
        self.cache.set(self.directory, self.lastrun)