The console log blows up too much when running in rerun disabled tests mode (x50). Each log is around 1GB and the whole set of uncompressed logs is ~50GB. After compression it is still around 1GB, which is too big. The increase comes mainly from the many SKIPPED messages for non-disabled tests, which is expected given how SkipTest and pytest-flakefinder currently work.
I updated `test/conftest.py` to completely ignore skipped tests when rerunning disabled tests, instead of collecting and then skipping them 50 times each; a condensed sketch of the filtering hook is shown after the list below. The benefit of doing this is much greater than I originally expected:
* Rerun disabled tests jobs now finish in less than half an hour, as they should
* Fixes OOM runner crashes caused by too many collected tests
* Fixes the verbosity issue, as now only disabled tests are run x50. There are only a few hundred of them at the moment
* Fixes timeouts when rerunning disabled distributed and ASAN tests, which are just too slow when run x50
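The change is a `pytest_collection_modifyitems` hook in `test/conftest.py` that, when `PYTORCH_TEST_RERUN_DISABLED_TESTS` is set, keeps only the tests listed in the disabled-tests file and drops everything else from the collected items. Below is a condensed sketch of that idea, assuming a `DISABLED_TESTS_FILE` in the same format the real hook parses; the full version, which also re-copies each kept item, is in `test/conftest.py` further down.

```python
import json
import os
import re
from collections import defaultdict

import pytest

# Matches entries such as "test_foo_cuda_float32 (__main__.TestFooCUDA)"
DISABLED_RE = re.compile(r"(?P<test_name>.+)\s+\([^\.]+\.(?P<test_class>.+)\)")


@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(items):
    # Only filter when running in rerun-disabled-tests mode
    if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") != "1":
        return

    # Build {test_class: {test_name, ...}} from the disabled-tests file
    disabled = defaultdict(set)
    with open(os.environ["DISABLED_TESTS_FILE"]) as fp:
        for entry in json.load(fp):
            m = DISABLED_RE.match(entry)
            if m:
                disabled[m["test_class"]].add(m["test_name"])

    # Keep only disabled tests; everything else would otherwise show up as SKIPPED x50
    items[:] = [item for item in items if item.name in disabled[item.parent.name]]
```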
### Testing
When rerunning disabled tests (https://github.com/pytorch/pytorch/actions/runs/5084508614), only the disabled tests for the platform are run. For example, `test_ops_jit` in https://ossci-raw-job-status.s3.amazonaws.com/log/13770164954 ran only 100 items: `test_variant_consistency_jit_linalg_lu_cuda_float32` and `test_variant_consistency_jit_linalg_lu_factor_cuda_complex64`, each x50.
```
Executing ['/opt/conda/envs/py_3.10/bin/python', '-bb', 'test_ops_jit.py', '--shard-id=1', '--num-shards=2', '-v', '-vv', '-rfEX', '-p', 'no:xdist', '--use-pytest', '--sc=test_ops_jit_1', '--flake-finder', '--flake-runs=50', '--import-slow-tests', '--import-disabled-tests', '--rerun-disabled-tests'] ... [2023-05-25 21:32:49.763856]
Expand the folded group to see the log file of test_ops_jit 2/2
##[group]PRINTING LOG FILE of test_ops_jit 2/2 (/var/lib/jenkins/workspace/test/test-reports/test_ops_jit_h2wr_t2c.log)
Test results will be stored in test-reports/python-pytest/test_ops_jit/test_ops_jit-51a83bd44549074e.xml
============================= test session starts ==============================
platform linux -- Python 3.10.11, pytest-7.3.1, pluggy-1.0.0 -- /opt/conda/envs/py_3.10/bin/python
cachedir: .pytest_cache
hypothesis profile 'pytorch_ci' -> database=None, max_examples=50, derandomize=True, suppress_health_check=[HealthCheck.too_slow]
rootdir: /var/lib/jenkins/workspace
configfile: pytest.ini
plugins: hypothesis-5.35.1, cpp-2.3.0, flakefinder-1.1.0, rerunfailures-11.1.2, shard-0.1.2, xdist-3.3.0, xdoctest-1.1.0
collecting ... collected 1084 items
Running 100 items in this shard: test/test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_cuda_float32 (x50), test/test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_factor_cuda_complex64 (x50)
stepcurrent: Cannot find last run test, not skipping
test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_cuda_float32 PASSED [2.1876s] [ 1%]
test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_factor_cuda_complex64 PASSED [4.5615s] [ 2%]
```
Related workflow runs:
* [pull](https://github.com/pytorch/pytorch/actions/runs/5093566864)
* [trunk](https://github.com/pytorch/pytorch/actions/runs/5095364311)
* [periodic](https://github.com/pytorch/pytorch/actions/runs/5095378850)
* [slow](https://github.com/pytorch/pytorch/actions/runs/5095390285)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/102107
Approved by: https://github.com/clee2000, https://github.com/malfet
`test/conftest.py` (313 lines, 11 KiB, Python):

```python
from _pytest.junitxml import LogXML, _NodeReporter, bin_xml_escape
from _pytest.terminal import _get_raw_skip_reason
from _pytest.stash import StashKey
from _pytest.reports import TestReport
from _pytest.config.argparsing import Parser
from _pytest.config import filename_arg
from _pytest.config import Config
from _pytest._code.code import ReprFileLocation
from _pytest.python import Module
from typing import Any, List, Union
from typing import Optional
from types import MethodType
import xml.etree.ElementTree as ET
import functools
import pytest
import sys
import os
import copy
import json
import re
from collections import defaultdict

# a lot of this file is copied from _pytest.junitxml and modified to get rerun info

xml_key = StashKey["LogXMLReruns"]()
STEPCURRENT_CACHE_DIR = "cache/stepcurrent"


def pytest_addoption(parser: Parser) -> None:
    group = parser.getgroup("general")
    group.addoption(
        "--scs",
        action="store",
        default=None,
        dest="stepcurrent_skip",
    )
    group.addoption(
        "--sc",
        action="store",
        default=None,
        dest="stepcurrent",
    )

    parser.addoption("--use-main-module", action='store_true')

    group = parser.getgroup("terminal reporting")
    group.addoption(
        "--junit-xml-reruns",
        action="store",
        dest="xmlpath_reruns",
        metavar="path",
        type=functools.partial(filename_arg, optname="--junit-xml-reruns"),
        default=None,
        help="create junit-xml style report file at given path.",
    )
    group.addoption(
        "--junit-prefix-reruns",
        action="store",
        metavar="str",
        default=None,
        help="prepend prefix to classnames in junit-xml output",
    )
    parser.addini(
        "junit_suite_name_reruns", "Test suite name for JUnit report", default="pytest"
    )
    parser.addini(
        "junit_logging_reruns",
        "Write captured log messages to JUnit report: "
        "one of no|log|system-out|system-err|out-err|all",
        default="no",
    )
    parser.addini(
        "junit_log_passing_tests_reruns",
        "Capture log information for passing tests to JUnit report: ",
        type="bool",
        default=True,
    )
    parser.addini(
        "junit_duration_report_reruns",
        "Duration time to report: one of total|call",
        default="total",
    )
    parser.addini(
        "junit_family_reruns",
        "Emit XML for schema: one of legacy|xunit1|xunit2",
        default="xunit2",
    )
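# Example invocation exercising the options registered above (hypothetical values;
# the real command line is assembled by run_test.py, e.g. "--sc=test_ops_jit_1" in
# the CI log earlier):
#   python -m pytest test_ops_jit.py --sc=test_ops_jit_1 --junit-xml-reruns=test-reports/rerun.xml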
def pytest_configure(config: Config) -> None:
    xmlpath = config.option.xmlpath_reruns
    # Prevent opening xmllog on worker nodes (xdist).
    if xmlpath and not hasattr(config, "workerinput"):
        junit_family = config.getini("junit_family_reruns")
        config.stash[xml_key] = LogXMLReruns(
            xmlpath,
            config.option.junitprefix,
            config.getini("junit_suite_name_reruns"),
            config.getini("junit_logging_reruns"),
            config.getini("junit_duration_report_reruns"),
            junit_family,
            config.getini("junit_log_passing_tests_reruns"),
        )
        config.pluginmanager.register(config.stash[xml_key])
    if config.getoption("stepcurrent_skip"):
        config.option.stepcurrent = config.getoption("stepcurrent_skip")
    if config.getoption("stepcurrent"):
        config.pluginmanager.register(StepcurrentPlugin(config), "stepcurrentplugin")


def pytest_unconfigure(config: Config) -> None:
    xml = config.stash.get(xml_key, None)
    if xml:
        del config.stash[xml_key]
        config.pluginmanager.unregister(xml)


class _NodeReporterReruns(_NodeReporter):
    def _prepare_content(self, content: str, header: str) -> str:
        return content

    def _write_content(self, report: TestReport, content: str, jheader: str) -> None:
        if content == "":
            return
        tag = ET.Element(jheader)
        tag.text = bin_xml_escape(content)
        self.append(tag)


class LogXMLReruns(LogXML):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def append_rerun(self, reporter: _NodeReporter, report: TestReport) -> None:
        if hasattr(report, "wasxfail"):
            reporter._add_simple("skipped", "xfail-marked test passes unexpectedly")
        else:
            assert report.longrepr is not None
            reprcrash: Optional[ReprFileLocation] = getattr(
                report.longrepr, "reprcrash", None
            )
            if reprcrash is not None:
                message = reprcrash.message
            else:
                message = str(report.longrepr)
            message = bin_xml_escape(message)
            reporter._add_simple("rerun", message, str(report.longrepr))

    def pytest_runtest_logreport(self, report: TestReport) -> None:
        super().pytest_runtest_logreport(report)
        if report.outcome == "rerun":
            reporter = self._opentestcase(report)
            self.append_rerun(reporter, report)
        if report.outcome == "skipped":
            if isinstance(report.longrepr, tuple):
                fspath, lineno, reason = report.longrepr
                reason = f"{report.nodeid}: {_get_raw_skip_reason(report)}"
                report.longrepr = (fspath, lineno, reason)

    def node_reporter(self, report: Union[TestReport, str]) -> _NodeReporterReruns:
        nodeid: Union[str, TestReport] = getattr(report, "nodeid", report)
        # Local hack to handle xdist report order.
        workernode = getattr(report, "node", None)

        key = nodeid, workernode

        if key in self.node_reporters:
            # TODO: breaks for --dist=each
            return self.node_reporters[key]

        reporter = _NodeReporterReruns(nodeid, self)

        self.node_reporters[key] = reporter
        self.node_reporters_ordered.append(reporter)

        return reporter


# imitating summary_failures in pytest's terminal.py
# both hookwrapper and tryfirst to make sure this runs before pytest's
@pytest.hookimpl(hookwrapper=True, tryfirst=True)
def pytest_terminal_summary(terminalreporter, exitstatus, config):
    # prints stack traces for reruns
    if terminalreporter.config.option.tbstyle != "no":
        reports = terminalreporter.getreports("rerun")
        if reports:
            terminalreporter.write_sep("=", "RERUNS")
            if terminalreporter.config.option.tbstyle == "line":
                for rep in reports:
                    line = terminalreporter._getcrashline(rep)
                    terminalreporter.write_line(line)
            else:
                for rep in reports:
                    msg = terminalreporter._getfailureheadline(rep)
                    terminalreporter.write_sep("_", msg, red=True, bold=True)
                    terminalreporter._outrep_summary(rep)
                    terminalreporter._handle_teardown_sections(rep.nodeid)
    yield


@pytest.hookimpl(tryfirst=True)
def pytest_pycollect_makemodule(module_path, path, parent) -> Module:
    if parent.config.getoption("--use-main-module"):
        mod = Module.from_parent(parent, path=module_path)
        mod._getobj = MethodType(lambda x: sys.modules['__main__'], mod)
        return mod


@pytest.hookimpl(hookwrapper=True)
def pytest_report_teststatus(report, config):
    # Add the test time to the verbose output, unfortunately I don't think this
    # includes setup or teardown
    pluggy_result = yield
    if not isinstance(report, pytest.TestReport):
        return
    outcome, letter, verbose = pluggy_result.get_result()
    if verbose:
        pluggy_result.force_result(
            (outcome, letter, f"{verbose} [{report.duration:.4f}s]")
        )
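# The duration suffix appended just above is what produces verbose lines like
# "PASSED [2.1876s]" in the CI log shown earlier.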
@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(items: List[Any]) -> None:
    """
    This hook is used when rerunning disabled tests to get rid of all skipped tests
    instead of running and skipping them N times. This avoids flooding the console
    and XML outputs with junk. So we want this to run last when collecting tests.
    """
    rerun_disabled_tests = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1"
    if not rerun_disabled_tests:
        return

    disabled_regex = re.compile(r"(?P<test_name>.+)\s+\([^\.]+\.(?P<test_class>.+)\)")
    disabled_tests = defaultdict(set)

    # This environment variable has already been set by run_test before it calls pytest
    disabled_tests_file = os.getenv("DISABLED_TESTS_FILE", "")
    if not disabled_tests_file or not os.path.exists(disabled_tests_file):
        return

    with open(disabled_tests_file) as fp:
        for disabled_test in json.load(fp):
            m = disabled_regex.match(disabled_test)
            if m:
                test_name = m["test_name"]
                test_class = m["test_class"]
                disabled_tests[test_class].add(test_name)

    # When rerunning disabled tests, ignore all test cases that are not disabled
    filtered_items = []

    for item in items:
        test_name = item.name
        test_class = item.parent.name

        if test_class not in disabled_tests or test_name not in disabled_tests[test_class]:
            continue

        cpy = copy.copy(item)
        cpy._initrequest()

        filtered_items.append(cpy)

    items.clear()
    # NB: Need to edit items directly here to have the list reflected back to pytest
    items.extend(filtered_items)
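# For reference, entries in the DISABLED_TESTS_FILE consumed above are strings of
# the form "<test_name> (<module>.<TestClass>)", e.g. (illustrative)
# "test_variant_consistency_jit_linalg_lu_cuda_float32 (__main__.TestJitCUDA)".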
class StepcurrentPlugin:
    # Modified from _pytest/stepwise.py in order to save the currently running
    # test instead of the last failed test
    def __init__(self, config: Config) -> None:
        self.config = config
        self.report_status = ""
        assert config.cache is not None
        self.cache: pytest.Cache = config.cache
        self.directory = f"{STEPCURRENT_CACHE_DIR}/{config.getoption('stepcurrent')}"
        self.lastrun: Optional[str] = self.cache.get(self.directory, None)
        self.skip: bool = config.getoption("stepcurrent_skip")

    def pytest_collection_modifyitems(self, config: Config, items: List[Any]) -> None:
        if not self.lastrun:
            self.report_status = "Cannot find last run test, not skipping"
            return

        # check all item nodes until we find a match on last run
        failed_index = None
        for index, item in enumerate(items):
            if item.nodeid == self.lastrun:
                failed_index = index
                if self.skip:
                    failed_index += 1
                break

        # If the previously failed test was not found among the test items,
        # do not skip any tests.
        if failed_index is None:
            self.report_status = "previously run test not found, not skipping."
        else:
            self.report_status = f"skipping {failed_index} already run items."
            deselected = items[:failed_index]
            del items[:failed_index]
            config.hook.pytest_deselected(items=deselected)

    def pytest_report_collectionfinish(self) -> Optional[str]:
        if self.config.getoption("verbose") >= 0 and self.report_status:
            return f"stepcurrent: {self.report_status}"
        return None

    def pytest_runtest_protocol(self, item, nextitem) -> None:
        self.lastrun = item.nodeid
        self.cache.set(self.directory, self.lastrun)
```
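For context on the `--sc` option above: `StepcurrentPlugin` records the node id of the currently running test in pytest's cache, so an external harness can tell which test was executing if the process crashes. Below is a minimal sketch of reading that value back, assuming pytest's default cache layout (`<rootdir>/.pytest_cache/v/<key>`); the `last_started_test` helper is hypothetical and not part of the file.

```python
import json
from pathlib import Path
from typing import Optional

# Mirrors the constant defined in test/conftest.py
STEPCURRENT_CACHE_DIR = "cache/stepcurrent"


def last_started_test(sc_name: str, rootdir: str = ".") -> Optional[str]:
    """Return the node id saved by a `--sc=<sc_name>` run, or None if absent.

    pytest's Cache.set(key, value) writes JSON to `<rootdir>/.pytest_cache/v/<key>`,
    so StepcurrentPlugin leaves its last-started test id at
    `.pytest_cache/v/cache/stepcurrent/<sc_name>`.
    """
    path = Path(rootdir) / ".pytest_cache" / "v" / STEPCURRENT_CACHE_DIR / sc_name
    if not path.exists():
        return None
    return json.loads(path.read_text())


# Example: after a run with `--sc=test_ops_jit_1`, last_started_test("test_ops_jit_1")
# would return something like
# "test_ops_jit.py::TestJitCUDA::test_variant_consistency_jit_linalg_lu_cuda_float32".
```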