Move code out of individual token linters (#152256)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152256
Approved by: https://github.com/Skylion007
This commit is contained in:
Tom Ritchford
2025-06-23 14:05:30 +00:00
committed by PyTorch MergeBot
parent da910e603a
commit 98a34e8d4b
13 changed files with 1017 additions and 901 deletions

View File

@ -1,15 +1,11 @@
from __future__ import annotations
import dataclasses as dc
import itertools
import json
import sys
import token
from enum import Enum
from functools import cached_property, total_ordering
from functools import cached_property
from pathlib import Path
from typing import Any, Callable, Optional, TYPE_CHECKING
from typing_extensions import Self
from typing import Any, Callable, TYPE_CHECKING
_FILE = Path(__file__).absolute()
@ -22,10 +18,9 @@ else:
if TYPE_CHECKING:
from collections.abc import Iterator, Sequence
from tokenize import TokenInfo
GRANDFATHER_LIST = Path(str(_FILE).replace(".py", "-grandfather.json"))
GRANDFATHER_LIST = _FILE.parent / "docstring_linter-grandfather.json"
# We tolerate a 10% increase in block size before demanding a docstring
TOLERANCE_PERCENT = 10
@ -45,274 +40,7 @@ METHOD_OVERRIDE_HINT = (
)
@total_ordering
@dc.dataclass
class Block:
"""A block of Python code starting with either `def` or `class`"""
class Category(str, Enum):
CLASS = "class"
DEF = "def"
category: Category
# The sequence of tokens that contains this Block.
# Tokens are represented in `Block` as indexes into `self.tokens`
tokens: Sequence[TokenInfo] = dc.field(repr=False)
# The name of the function or class being defined
name: str
# The index of the very first token in the block (the "class" or "def" keyword)
begin: int
# The index of the first INDENT token for this block
indent: int
# The index of the DEDENT token for this end of this block
dedent: int
# The docstring for the block
docstring: str
# These next members only get filled in after all blocks have been constructed
# and figure out family ties
# The full qualified name of the block within the file.
# This is the name of this block and all its parents, joined with `.`.
full_name: str = ""
# The index of this block within the full list of blocks in the file
index: int = 0
# Is this block contained within a function definition?
is_local: bool = dc.field(default=False, repr=False)
# Is this block a function definition in a class definition?
is_method: bool = dc.field(default=False, repr=False)
# A block index to the parent of this block, or None for a top-level block.
parent: Optional[int] = None
# A list of block indexes for the children
children: list[int] = dc.field(default_factory=list)
@property
def start_line(self) -> int:
return self.tokens[max(self.indent, self.index)].start[0]
@property
def end_line(self) -> int:
return self.tokens[max(self.dedent, self.index)].start[0]
@property
def line_count(self) -> int:
return self.end_line - self.start_line
@property
def is_class(self) -> bool:
return self.category == Block.Category.CLASS
@property
def display_name(self) -> str:
"""A user-friendly name like 'class One' or 'def One.method()'"""
ending = "" if self.is_class else "()"
return f"{self.category.value} {self.full_name}{ending}"
@cached_property
def decorators(self) -> list[str]:
"""A list of decorators for this function or method.
Each decorator both the @ symbol and any arguments to the decorator
but no extra whitespace.
"""
return _get_decorators(self.tokens, self.begin)
@cached_property
def is_override(self) -> bool:
return not self.is_class and any(
d.rpartition(".")[2] == "override" for d in self.decorators
)
DATA_FIELDS = (
"category",
"children",
"decorators",
"display_name",
"docstring",
"full_name",
"index",
"is_local",
"is_method",
"line_count",
"parent",
"start_line",
)
def as_data(self) -> dict[str, Any]:
d = {i: getattr(self, i) for i in self.DATA_FIELDS}
d["category"] = d["category"].value
return d
@property
def is_init(self) -> bool:
return not self.is_class and self.name == "__init__"
def contains(self, b: Block) -> bool:
return self.start_line < b.start_line and self.end_line >= b.end_line
def __eq__(self, o: object) -> bool:
assert isinstance(o, Block)
return o.tokens is self.tokens and o.index == self.index
def __hash__(self) -> int:
return super().__hash__()
def __lt__(self, o: Self) -> bool:
assert isinstance(o, Block) and o.tokens is self.tokens
return o.index < self.index
_IGNORE = {token.COMMENT, token.DEDENT, token.INDENT, token.NL}
def _get_decorators(tokens: Sequence[TokenInfo], block_start: int) -> list[str]:
def decorators() -> Iterator[str]:
rev = reversed(range(block_start))
newlines = (i for i in rev if tokens[i].type == token.NEWLINE)
newlines = itertools.chain(newlines, [-1]) # To account for the first line
it = iter(newlines)
end = next(it, -1) # Like itertools.pairwise in Python 3.10
for begin in it:
for i in range(begin + 1, end):
t = tokens[i]
if t.type == token.OP and t.string == "@":
useful = (t for t in tokens[i:end] if t.type not in _IGNORE)
yield "".join(s.string.strip("\n") for s in useful)
break
elif t.type not in _IGNORE:
return # A statement means no more decorators
end = begin
out = list(decorators())
out.reverse()
return out
class DocstringFile(_linter.PythonFile):
def __getitem__(self, i: int | slice) -> TokenInfo | Sequence[TokenInfo]:
return self.tokens[i]
def next_token(self, start: int, token_type: int, error: str) -> int:
for i in range(start, len(self.tokens)):
if self.tokens[i].type == token_type:
return i
raise _linter.ParseError(self.tokens[-1], error)
def docstring(self, start: int) -> str:
for i in range(start + 1, len(self.tokens)):
tk = self.tokens[i]
if tk.type == token.STRING:
return tk.string
if tk.type not in _linter.EMPTY_TOKENS:
return ""
return ""
@cached_property
def indent_to_dedent(self) -> dict[int, int]:
dedents = dict[int, int]()
stack = list[int]()
for i, t in enumerate(self.tokens):
if t.type == token.INDENT:
stack.append(i)
elif t.type == token.DEDENT:
dedents[stack.pop()] = i
return dedents
@cached_property
def errors(self) -> dict[str, str]:
return {}
@cached_property
def blocks(self) -> list[Block]:
blocks: list[Block] = []
for i in range(len(self.tokens)):
try:
if (b := self.block(i)) is not None:
blocks.append(b)
except _linter.ParseError as e:
self.errors[e.token.line] = " ".join(e.args)
for i, parent in enumerate(blocks):
for j in range(i + 1, len(blocks)):
if parent.contains(child := blocks[j]):
child.parent = i
parent.children.append(j)
else:
break
for i, b in enumerate(blocks):
b.index = i
parents = [b]
while (p := parents[-1].parent) is not None:
parents.append(blocks[p])
parents = parents[1:]
b.is_local = not all(p.is_class for p in parents)
b.is_method = not b.is_class and bool(parents) and parents[0].is_class
def add_full_names(children: Sequence[Block], prefix: str = "") -> None:
dupes: dict[str, list[Block]] = {}
for b in children:
dupes.setdefault(b.name, []).append(b)
for dl in dupes.values():
for i, b in enumerate(dl):
suffix = f"[{i + 1}]" if len(dl) > 1 else ""
b.full_name = prefix + b.name + suffix
for b in children:
if kids := [blocks[i] for i in b.children]:
add_full_names(kids, b.full_name + ".")
add_full_names([b for b in blocks if b.parent is None])
return blocks
def block(self, begin: int) -> Block | None:
t = self.tokens[begin]
if not (t.type == token.NAME and t.string in ("class", "def")):
return None
category = Block.Category[t.string.upper()]
try:
ni = self.next_token(begin + 1, token.NAME, "Definition but no name")
name = self.tokens[ni].string
indent = self.next_token(ni + 1, token.INDENT, "Definition but no indent")
dedent = self.indent_to_dedent[indent]
docstring = self.docstring(indent)
except _linter.ParseError:
name = "(ParseError)"
indent = -1
dedent = -1
docstring = ""
return Block(
begin=begin,
category=category,
dedent=dedent,
docstring=docstring,
indent=indent,
name=name,
tokens=self.tokens,
)
class DocstringLinter(_linter.FileLinter[DocstringFile]):
class DocstringLinter(_linter.FileLinter):
linter_name = "docstring_linter"
description = DESCRIPTION
is_fixer = False
@ -332,26 +60,26 @@ class DocstringLinter(_linter.FileLinter[DocstringFile]):
self._write_grandfather()
return success
def _lint(self, df: DocstringFile) -> Iterator[_linter.LintResult]:
if (p := str(df.path)) in self.path_to_blocks:
def _lint(self, pf: _linter.PythonFile) -> Iterator[_linter.LintResult]:
if (p := str(pf.path)) in self.path_to_blocks:
print("Repeated file", p, file=sys.stderr)
return
blocks = df.blocks
bad = {b for b in blocks if self._is_bad_block(b, df)}
blocks = pf.blocks
bad = {b for b in blocks if self._is_bad_block(b, pf)}
bad = self._dont_require_constructor_and_class_docs(blocks, bad)
gf = self._grandfathered(df.path, bad)
gf = self._grandfathered(pf.path, bad)
yield from (self._block_result(b, df) for b in sorted(bad - gf))
yield from (self._block_result(b, pf) for b in sorted(bad - gf))
def as_data(b: Block) -> dict[str, Any]:
def as_data(b: _linter.Block) -> dict[str, Any]:
status = "grandfather" if b in gf else "bad" if b in bad else "good"
return {"status": status, **b.as_data()}
self.path_to_blocks[p] = [as_data(b) for b in blocks]
def _error(self, df: DocstringFile, result: _linter.LintResult) -> None:
self.path_to_errors[str(df.path)] = [{str(result.line): result.name}]
def _error(self, pf: _linter.PythonFile, result: _linter.LintResult) -> None:
self.path_to_errors[str(pf.path)] = [{str(result.line): result.name}]
@cached_property
def _grandfather(self) -> dict[str, dict[str, Any]]:
@ -368,20 +96,24 @@ class DocstringLinter(_linter.FileLinter[DocstringFile]):
def _max_lines(self) -> dict[str, int]:
return {"class": self.args.max_class, "def": self.args.max_def}
def _grandfathered(self, path: Path | None, bad: set[Block]) -> set[Block]:
def _grandfathered(
self, path: Path | None, bad: set[_linter.Block]
) -> set[_linter.Block]:
if path is None or self.args.no_grandfather or self.args.write_grandfather:
return set()
grand: dict[str, int] = self._grandfather.get(str(path), {})
tolerance_ratio = 1 + self.args.grandfather_tolerance / 100.0
def grandfathered(b: Block) -> bool:
def grandfathered(b: _linter.Block) -> bool:
lines = int(grand.get(b.display_name, 0) * tolerance_ratio)
return b.line_count <= lines
return {b for b in bad if grandfathered(b)}
def _block_result(self, b: Block, df: DocstringFile) -> _linter.LintResult:
def _block_result(
self, b: _linter.Block, pf: _linter.PythonFile
) -> _linter.LintResult:
def_name = "function" if b.category == "def" else "class"
msg = f"docstring found for {def_name} '{b.name}' ({b.line_count} lines)"
if len(b.docstring):
@ -392,23 +124,23 @@ class DocstringLinter(_linter.FileLinter[DocstringFile]):
msg = f"No {msg}"
if b.is_method:
msg = f"{msg}. {METHOD_OVERRIDE_HINT}"
return _linter.LintResult(msg, *df.tokens[b.begin].start)
return _linter.LintResult(msg, *pf.tokens[b.begin].start)
def _display(
self, df: DocstringFile, results: list[_linter.LintResult]
self, pf: _linter.PythonFile, results: list[_linter.LintResult]
) -> Iterator[str]:
if not self.args.report:
yield from super()._display(df, results)
yield from super()._display(pf, results)
def _dont_require_constructor_and_class_docs(
self, blocks: Sequence[Block], bad: set[Block]
) -> set[Block]:
self, blocks: Sequence[_linter.Block], bad: set[_linter.Block]
) -> set[_linter.Block]:
if self.args.lint_init:
return bad
good = {b for b in blocks if len(b.docstring) >= self.args.min_docstring}
def has_class_init_doc(b: Block) -> bool:
def has_class_init_doc(b: _linter.Block) -> bool:
if b.is_class:
# Is it a class whose constructor is documented?
children = (blocks[i] for i in b.children)
@ -419,10 +151,10 @@ class DocstringLinter(_linter.FileLinter[DocstringFile]):
return {b for b in bad if not has_class_init_doc(b)}
def _is_bad_block(self, b: Block, df: DocstringFile) -> bool:
def _is_bad_block(self, b: _linter.Block, pf: _linter.PythonFile) -> bool:
max_lines = self._max_lines[b.category]
return (
not df.omitted(df.tokens, b.begin, b.dedent)
not pf.omitted(pf.tokens, b.begin, b.dedent)
and b.line_count > max_lines
and len(b.docstring) < self.args.min_docstring
and (self.args.lint_local or not b.is_local)