[inductor] Add features to docstring_linter (see #142496) (#145834)

## Improvements to `docstring_linter` * Add a "grandfather list" of existing undocumented classes and functions (`--grandfather`, `--grandfather-tolerance`, `--no-grandfather`, `--write-grandfather`) * In classes, now just one of the class itself or its `__init__()` method needs to be documented (`--lint-init` turns the old behavior back on) * Now classes and functions defined local to other functions do not need to be documented (`--lint-local` turns the old behavior back on) * New `--report` flag produces a compact report of long, undocumented classes or function definitions: see attached example run over all pytorch: [pytorch-docs.json](https://github.com/user-attachments/files/18455981/pytorch-docs.json) ## Help text ``` $ python tools/linter/adapters/docstring_linter.py --help usage: docstring_linter.py [-h] [-l] [-v] [--grandfather GRANDFATHER] [--grandfather-tolerance GRANDFATHER_TOLERANCE] [--lint-init] [--lint-local] [--lint-protected] [--max-class MAX_CLASS] [--max-def MAX_DEF] [--min-docstring MIN_DOCSTRING] [--no-grandfather] [--report] [--write-grandfather] [files ...] `docstring_linter` reports on long functions, methods or classes without docstrings positional arguments: files A list of files or directories to lint optional arguments: -h, --help show this help message and exit -l, --lintrunner Run for lintrunner and print LintMessages which aren't edits -v, --verbose Print more debug info --grandfather GRANDFATHER, -g GRANDFATHER Set the grandfather list --grandfather-tolerance GRANDFATHER_TOLERANCE, -t GRANDFATHER_TOLERANCE Tolerance for grandfather sizes, in percent --lint-init, -i Lint __init__ and class separately --lint-local, -o Lint definitions inside other functions --lint-protected, -p Lint functions, methods and classes that start with _ --max-class MAX_CLASS, -c MAX_CLASS Maximum number of lines for an undocumented class --max-def MAX_DEF, -d MAX_DEF Maximum number of lines for an undocumented function --min-docstring MIN_DOCSTRING, -s MIN_DOCSTRING Minimum number of characters for a docstring --no-grandfather, -n Disable the grandfather list --report, -r Print a report on all classes and defs --write-grandfather, -w Rewrite the grandfather list ``` --- Pull Request resolved: https://github.com/pytorch/pytorch/pull/145834 Approved by: https://github.com/amjames, https://github.com/eellison
2025-10-20 21:14:14 +08:00 · 2025-04-09 17:29:55 +00:00
parent 357814c85c
commit 31fe258efc
9 changed files with 1192 additions and 181 deletions
--- a/tools/linter/adapters/docstring_linter.py
+++ b/tools/linter/adapters/docstring_linter.py
@ -1,16 +1,21 @@
 from __future__ import annotations

+import dataclasses as dc
+import itertools
+import json
 import sys
 import token
-from functools import cached_property
+from enum import Enum
+from functools import cached_property, total_ordering
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import Any, Callable, TYPE_CHECKING
+from typing_extensions import Self


-_PARENT = Path(__file__).parent.absolute()
+_FILE = Path(__file__).absolute()
 _PATH = [Path(p).absolute() for p in sys.path]

-if TYPE_CHECKING or _PARENT not in _PATH:
+if TYPE_CHECKING or _FILE.parent not in _PATH:
    from . import _linter
 else:
    import _linter
@ -20,149 +25,485 @@ if TYPE_CHECKING:
    from tokenize import TokenInfo


+GRANDFATHER_LIST = Path(str(_FILE).replace(".py", "-grandfather.json"))
+
+# We tolerate a 10% increase in block size before demanding a docstring
+TOLERANCE_PERCENT = 10
+
 MAX_LINES = {"class": 100, "def": 80}

-MIN_DOCSTRING = 16  # docstrings shorter than this are ignored
-IGNORE_PROTECTED = True  # If True, ignore classes and files whose names start with _.
+MIN_DOCSTRING = 50  # docstrings shorter than this are too short

 ERROR_FMT = "Every {type} with more than {length} lines needs a docstring"

 DESCRIPTION = """`docstring_linter` reports on long functions, methods or classes
 without docstrings"""

-# How many top violations to report?
-REPORT_TOP_RESULTS = 3
+
+@total_ordering
+@dc.dataclass
+class Block:
+    """A block of Python code starting with either `def` or `class`"""
+
+    class Category(str, Enum):
+        CLASS = "class"
+        DEF = "def"
+
+    category: Category
+
+    # The sequence of tokens that contains this Block.
+    # Tokens are represented in `Block` as indexes into `self.tokens`
+    tokens: Sequence[TokenInfo] = dc.field(repr=False)
+
+    # The name of the function or class being defined
+    name: str
+
+    # The index of the very first token in the block (the "class" or "def" keyword)
+    begin: int
+
+    # The index of the first INDENT token for this block
+    indent: int
+
+    # The index of the DEDENT token for this end of this block
+    dedent: int
+
+    # The docstring for the block
+    docstring: str
+
+    # These next members only get filled in after all blocks have been constructed
+    # and figure out family ties
+
+    # The full qualified name of the block within the file.
+    # This is the name of this block and all its parents, joined with `.`.
+    full_name: str = ""
+
+    # The index of this block within the full list of blocks in the file
+    index: int = 0
+
+    # Is this block contained within a function definition?
+    is_local: bool = dc.field(default=False, repr=False)
+
+    # Is this block a function definition in a class definition?
+    is_method: bool = dc.field(default=False, repr=False)
+
+    # A block index to the parent of this block, or None for a top-level block.
+    parent: int | None = None
+
+    # A list of block indexes for the children
+    children: list[int] = dc.field(default_factory=list)
+
+    @property
+    def start_line(self) -> int:
+        return self.tokens[max(self.indent, self.index)].start[0]
+
+    @property
+    def end_line(self) -> int:
+        return self.tokens[max(self.dedent, self.index)].start[0]
+
+    @property
+    def line_count(self) -> int:
+        return self.end_line - self.start_line
+
+    @property
+    def is_class(self) -> bool:
+        return self.category == Block.Category.CLASS
+
+    @property
+    def display_name(self) -> str:
+        """A user-friendly name like 'class One' or 'def One.method()'"""
+        ending = "" if self.is_class else "()"
+        return f"{self.category.value} {self.full_name}{ending}"
+
+    DATA_FIELDS = (
+        "category",
+        "children",
+        "display_name",
+        "docstring",
+        "full_name",
+        "index",
+        "is_local",
+        "is_method",
+        "line_count",
+        "parent",
+        "start_line",
+    )
+
+    def as_data(self) -> dict[str, Any]:
+        d = {i: getattr(self, i) for i in self.DATA_FIELDS}
+        d["category"] = d["category"].value
+        return d
+
+    @property
+    def is_init(self) -> bool:
+        return not self.is_class and self.name == "__init__"
+
+    def contains(self, b: Block) -> bool:
+        return self.start_line < b.start_line and self.end_line >= b.end_line
+
+    def __eq__(self, o: object) -> bool:
+        assert isinstance(o, Block)
+        return o.tokens is self.tokens and o.index == self.index
+
+    def __hash__(self) -> int:
+        return super().__hash__()
+
+    def __lt__(self, o: Self) -> bool:
+        assert isinstance(o, Block) and o.tokens is self.tokens
+        return o.index < self.index


-def _is_def(t: TokenInfo) -> bool:
-    return t.type == token.NAME and t.string in ("class", "def")
+class DocstringFile(_linter.PythonFile):
+    def __getitem__(self, i: int | slice) -> TokenInfo | Sequence[TokenInfo]:
+        return self.tokens[i]
+
+    def next_token(self, start: int, token_type: int, error: str) -> int:
+        for i in range(start, len(self.tokens)):
+            if self.tokens[i].type == token_type:
+                return i
+        raise _linter.ParseError(self.tokens[-1], error)
+
+    def docstring(self, start: int) -> str:
+        for i in range(start + 1, len(self.tokens)):
+            tk = self.tokens[i]
+            if tk.type == token.STRING:
+                return tk.string
+            if tk.type not in _linter.EMPTY_TOKENS:
+                return ""
+        return ""
+
+    @cached_property
+    def indent_to_dedent(self) -> dict[int, int]:
+        dedents = dict[int, int]()
+        stack = list[int]()
+
+        for i, t in enumerate(self.tokens):
+            if t.type == token.INDENT:
+                stack.append(i)
+            elif t.type == token.DEDENT:
+                dedents[stack.pop()] = i
+
+        return dedents
+
+    @cached_property
+    def errors(self) -> dict[str, str]:
+        return {}
+
+    @cached_property
+    def blocks(self) -> list[Block]:
+        blocks: list[Block] = []
+
+        for i in range(len(self.tokens)):
+            try:
+                if (b := self.block(i)) is not None:
+                    blocks.append(b)
+            except _linter.ParseError as e:
+                self.errors[e.token.line] = " ".join(e.args)
+
+        for i, parent in enumerate(blocks):
+            for j in range(i + 1, len(blocks)):
+                if parent.contains(child := blocks[j]):
+                    child.parent = i
+                    parent.children.append(j)
+                else:
+                    break
+
+        for i, b in enumerate(blocks):
+            b.index = i
+
+            parents = [b]
+            while (p := parents[-1].parent) is not None:
+                parents.append(blocks[p])
+            parents = parents[1:]
+
+            b.is_local = not all(p.is_class for p in parents)
+            b.is_method = not b.is_class and bool(parents) and parents[0].is_class
+
+        def add_full_names(children: Sequence[Block], prefix: str = "") -> None:
+            dupes: dict[str, list[Block]] = {}
+            for b in children:
+                dupes.setdefault(b.name, []).append(b)
+
+            for dl in dupes.values():
+                for i, b in enumerate(dl):
+                    suffix = f"[{i + 1}]" if len(dl) > 1 else ""
+                    b.full_name = prefix + b.name + suffix
+
+            for b in children:
+                if kids := [blocks[i] for i in b.children]:
+                    add_full_names(kids, b.full_name + ".")
+
+        add_full_names([b for b in blocks if b.parent is None])
+        return blocks
+
+    def block(self, begin: int) -> Block | None:
+        t = self.tokens[begin]
+        if not (t.type == token.NAME and t.string in ("class", "def")):
+            return None
+
+        category = Block.Category[t.string.upper()]
+        try:
+            ni = self.next_token(begin + 1, token.NAME, "Definition but no name")
+            name = self.tokens[ni].string
+            indent = self.next_token(ni + 1, token.INDENT, "Definition but no indent")
+            dedent = self.indent_to_dedent[indent]
+            docstring = self.docstring(indent)
+        except _linter.ParseError:
+            name = "(ParseError)"
+            indent = -1
+            dedent = -1
+            docstring = ""
+
+        return Block(
+            begin=begin,
+            category=category,
+            dedent=dedent,
+            docstring=docstring,
+            indent=indent,
+            name=name,
+            tokens=self.tokens,
+        )


-class DocstringLinter(_linter.FileLinter[_linter.PythonFile]):
+class DocstringLinter(_linter.FileLinter[DocstringFile]):
    linter_name = "docstring_linter"
    description = DESCRIPTION
    is_fixer = False
-    results: dict[str, list[tuple[int, Path, str]]]

-    def __init__(self, argv: list[str] | None = None) -> None:
+    path_to_blocks: dict[str, list[dict[str, Any]]]
+    path_to_errors: dict[str, list[dict[str, Any]]]
+
+    def __init__(self, argv: Sequence[str] | None = None) -> None:
        super().__init__(argv)
-        self.results = {}
-
-        help = "Maximum number of lines for an undocumented class"
-        self.parser.add_argument(
-            "--max-class", "-c", default=MAX_LINES["class"], type=int, help=help
-        )
-
-        help = "Maximum number of lines for an undocumented function"
-        self.parser.add_argument(
-            "--max-def", "-d", default=MAX_LINES["def"], type=int, help=help
-        )
-
-        help = "Minimum number of characters for a docstring"
-        self.parser.add_argument(
-            "--min-docstring", "-m", default=MIN_DOCSTRING, type=int, help=help
-        )
-
-        help = "Lint functions, methods and classes that start with _"
-        self.parser.add_argument(
-            "--lint-protected", "-p", action="store_true", help=help
-        )
-
-    @cached_property
-    def max_lines(self) -> dict[str, int]:
-        return {"class": self.args.max_class, "def": self.args.max_def}
+        add_arguments(self.parser.add_argument)
+        self.path_to_blocks = {}
+        self.path_to_errors = {}

    def lint_all(self) -> bool:
        success = super().lint_all()
-        if not self.args.lintrunner and self.results:
-            self._report_results()
+        self._report()
+        self._write_grandfather()
        return success

-    def _lint(self, pf: _linter.PythonFile) -> Iterator[_linter.LintResult]:
-        tokens = pf.tokens
-        indents = indent_to_dedent(tokens)
-        defs = [i for i, t in enumerate(tokens) if _is_def(t)]
+    def _lint(self, df: DocstringFile) -> Iterator[_linter.LintResult]:
+        if (p := str(df.path)) in self.path_to_blocks:
+            print("Repeated file", p, file=sys.stderr)
+            return

-        def next_token(start: int, token_type: int, error: str) -> int:  # type: ignore[return]
-            for i in range(start, len(tokens)):
-                if tokens[i].type == token_type:
-                    return i
-            raise _linter.ParseError(tokens[-1], error)
+        blocks = df.blocks
+        bad = {b for b in blocks if self._is_bad_block(b, df)}
+        bad = self._dont_require_constructor_and_class_docs(blocks, bad)
+        gf = self._grandfathered(df.path, bad)

-        for i in defs:
-            name = next_token(i + 1, token.NAME, "Definition with no name")
-            if not self.args.lint_protected and tokens[name].string.startswith("_"):
-                continue
+        yield from (self._block_result(b, df) for b in sorted(bad - gf))

-            indent = next_token(name + 1, token.INDENT, "Definition with no indent")
-            dedent = indents[indent]
+        def as_data(b: Block) -> dict[str, Any]:
+            status = "grandfather" if b in gf else "bad" if b in bad else "good"
+            return {"status": status, **b.as_data()}

-            lines = tokens[dedent].start[0] - tokens[indent].start[0]
-            max_lines = self.max_lines[tokens[i].string]
-            if lines <= max_lines:
-                continue
+        self.path_to_blocks[p] = [as_data(b) for b in blocks]

-            # Now search for a docstring
-            docstring_len = -1
-            for k in range(indent + 1, len(tokens)):
-                tk = tokens[k]
-                if tk.type == token.STRING:
-                    docstring_len = len(tk.string)
-                    break
-                if tk.type not in _linter.EMPTY_TOKENS:
-                    break
+    def _error(self, df: DocstringFile, result: _linter.LintResult) -> None:
+        self.path_to_errors[str(df.path)] = [{str(result.line): result.name}]

-            if docstring_len >= self.args.min_docstring:
-                continue
+    @cached_property
+    def _grandfather(self) -> dict[str, dict[str, Any]]:
+        try:
+            with open(self.args.grandfather) as fp:
+                return json.load(fp)  # type: ignore[no-any-return]
+        except FileNotFoundError:
+            return {}
+        except Exception as e:
+            print("ERROR:", e, "in", GRANDFATHER_LIST, file=sys.stderr)
+            raise

-            # Now check if it's omitted
-            if pf.omitted(pf.tokens[i:indent]):
-                continue
+    @cached_property
+    def _max_lines(self) -> dict[str, int]:
+        return {"class": self.args.max_class, "def": self.args.max_def}

-            t = tokens[i]
-            def_name = "function" if t.string == "def" else t.string
-            tname = tokens[name].string
-            msg = f"docstring found for {def_name} '{tname}' ({lines} lines)"
-            if docstring_len < 0:
-                msg = "No " + msg
-            else:
-                msg = msg + f" was too short ({docstring_len} characters)"
-            yield _linter.LintResult(msg, *t.start)
-            if pf.path is not None:
-                self.results.setdefault(def_name, []).append((lines, pf.path, tname))
+    def _grandfathered(self, path: Path | None, bad: set[Block]) -> set[Block]:
+        if path is None or self.args.no_grandfather or self.args.write_grandfather:
+            return set()

-    def _report_results(self) -> None:
-        print()
-        for i, (k, v) in enumerate(sorted(self.results.items())):
-            if i:
-                print()
-            top = sorted(v, reverse=True)[:REPORT_TOP_RESULTS]
-            if len(top) == 1:
-                s = ""
-                t = f"{len(top)} "
-            else:
-                s = "es" if k.endswith("s") else "s"
-                t = ""
-            print(f"Top {t}undocumented {k}{s}:")
-            for lines, path, tname in top:
-                print(f"    {lines} lines: {path}:{tname}")
+        grand: dict[str, int] = self._grandfather.get(str(path), {})
+        tolerance_ratio = 1 + self.args.grandfather_tolerance / 100.0
+
+        def grandfathered(b: Block) -> bool:
+            lines = int(grand.get(b.display_name, 0) * tolerance_ratio)
+            return b.line_count <= lines
+
+        return {b for b in bad if grandfathered(b)}
+
+    def _block_result(self, b: Block, df: DocstringFile) -> _linter.LintResult:
+        def_name = "function" if b.category == "def" else "class"
+        msg = f"docstring found for {def_name} '{b.name}' ({b.line_count} lines)"
+        if len(b.docstring):
+            msg = msg + f" was too short ({len(b.docstring)} characters)"
+        else:
+            msg = "No " + msg
+        return _linter.LintResult(msg, *df.tokens[b.begin].start)
+
+    def _display(
+        self, df: DocstringFile, results: list[_linter.LintResult]
+    ) -> Iterator[str]:
+        if not self.args.report:
+            yield from super()._display(df, results)
+
+    def _dont_require_constructor_and_class_docs(
+        self, blocks: Sequence[Block], bad: set[Block]
+    ) -> set[Block]:
+        if self.args.lint_init:
+            return bad
+
+        good = {b for b in blocks if len(b.docstring) >= self.args.min_docstring}
+
+        def has_class_init_doc(b: Block) -> bool:
+            if b.is_class:
+                # Is it a class whose constructor is documented?
+                children = (blocks[i] for i in b.children)
+                return any(b.is_init and b in good for b in children)
+
+            # Is it a constructor whose class is documented?
+            return b.is_init and b.parent is not None and blocks[b.parent] in good
+
+        return {b for b in bad if not has_class_init_doc(b)}
+
+    def _is_bad_block(self, b: Block, df: DocstringFile) -> bool:
+        max_lines = self._max_lines[b.category]
+        return (
+            not df.omitted(df.tokens, b.begin, b.dedent)
+            and b.line_count > max_lines
+            and len(b.docstring) < self.args.min_docstring
+            and (self.args.lint_local or not b.is_local)
+            and (self.args.lint_protected or not b.name.startswith("_"))
+        )
+
+    def _report(self) -> None:
+        if not self.args.lintrunner and self.path_to_blocks and self.args.report:
+            report = {
+                k: s for k, v in self.path_to_blocks.items() if (s := file_summary(v))
+            } | self.path_to_errors
+            print(json.dumps(report, sort_keys=True, indent=2))
+
+    def _write_grandfather(self) -> None:
+        if self.args.write_grandfather:
+            results: dict[str, dict[str, int]] = {}
+
+            for path, blocks in self.path_to_blocks.items():
+                for block in blocks:
+                    if block["status"] == "bad":
+                        d = results.setdefault(path, {})
+                        d[block["display_name"]] = block["line_count"]
+
+            with open(self.args.grandfather, "w") as fp:
+                json.dump(results, fp, sort_keys=True, indent=2)


-def indent_to_dedent(tokens: Sequence[TokenInfo]) -> dict[int, int]:
-    indent_to_dedent: dict[int, int] = {}
-    stack: list[int] = []
+def make_recursive(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    def rec(i: int) -> dict[str, Any]:
+        d = dict(blocks[i])
+        d["children"] = [rec(c) for c in d["children"]]
+        return d

-    for i, t in enumerate(tokens):
-        if t.type == token.INDENT:
-            stack.append(i)
-        elif t.type == token.DEDENT:
-            assert stack
-            indent_to_dedent[stack.pop()] = i
+    return [rec(i) for i, b in enumerate(blocks) if b["parent"] is None]

-    assert not stack
-    # Can't happen: the tokenization process would already have failed on a bad indent

-    return indent_to_dedent
+def make_terse(
+    blocks: Sequence[dict[str, Any]],
+    index_by_line: bool = True,
+) -> dict[str, dict[str, Any]]:
+    result: dict[str, dict[str, Any]] = {}
+
+    max_line = max(b["start_line"] for b in blocks) if blocks else 0
+    line_field_width = len(str(max_line))
+
+    for b in blocks:
+        root = f"{b['category']} {b['full_name']}"
+        for i in itertools.count():
+            name = root + bool(i) * f"[{i + 1}]"
+            if name not in result:
+                break
+
+        d = {
+            "docstring_len": len(b["docstring"]),
+            "lines": b["line_count"],
+            "status": b.get("status", "good"),
+        }
+
+        start_line = b["start_line"]
+        if index_by_line:
+            d["name"] = name
+            result[f"{start_line:>{line_field_width}}"] = d
+        else:
+            d["line"] = start_line
+            result[name] = d
+
+        if kids := b["children"]:
+            if not all(isinstance(k, int) for k in kids):
+                assert all(isinstance(k, dict) for k in kids)
+                d["children"] = make_terse(kids)
+
+    return result
+
+
+def file_summary(
+    blocks: Sequence[dict[str, Any]], report_all: bool = False
+) -> dict[str, str]:
+    def to_line(v: dict[str, Any]) -> str | None:
+        if (status := v["status"]) == "good":
+            if not report_all:
+                return None
+            fail = ""
+        elif status == "grandfather":
+            fail = ": (grandfathered)"
+        else:
+            assert status == "bad"
+            fail = ": FAIL"
+        name = v["name"]
+        lines = v["lines"]
+        docs = v["docstring_len"]
+        parens = "()" if name.startswith("def ") else ""
+        return f"{name}{parens}: {lines=}, {docs=}{fail}"
+
+    t = make_terse(blocks)
+    r = {k: line for k, v in t.items() if (line := to_line(v))}
+    while r and all(k.startswith(" ") for k in r):
+        r = {k[1:]: v for k, v in r.items()}
+    return r
+
+
+def add_arguments(add: Callable[..., Any]) -> None:
+    h = "Set the grandfather list"
+    add("--grandfather", "-g", default=str(GRANDFATHER_LIST), type=str, help=h)
+
+    h = "Tolerance for grandfather sizes, in percent"
+    add("--grandfather-tolerance", "-t", default=TOLERANCE_PERCENT, type=float, help=h)
+
+    h = "Lint __init__ and class separately"
+    add("--lint-init", "-i", action="store_true", help=h)
+
+    h = "Lint definitions inside other functions"
+    add("--lint-local", "-o", action="store_true", help=h)
+
+    h = "Lint functions, methods and classes that start with _"
+    add("--lint-protected", "-p", action="store_true", help=h)
+
+    h = "Maximum number of lines for an undocumented class"
+    add("--max-class", "-c", default=MAX_LINES["class"], type=int, help=h)
+
+    h = "Maximum number of lines for an undocumented function"
+    add("--max-def", "-d", default=MAX_LINES["def"], type=int, help=h)
+
+    h = "Minimum number of characters for a docstring"
+    add("--min-docstring", "-s", default=MIN_DOCSTRING, type=int, help=h)
+
+    h = "Disable the grandfather list"
+    add("--no-grandfather", "-n", action="store_true", help=h)
+
+    h = "Print a report on all classes and defs"
+    add("--report", "-r", action="store_true", help=h)
+
+    h = "Rewrite the grandfather list"
+    add("--write-grandfather", "-w", action="store_true", help=h)


 if __name__ == "__main__":