[BE] add codespell linter (#156066)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156066
Approved by: https://github.com/malfet
2025-10-20 21:14:14 +08:00 · 2025-06-16 18:55:08 +08:00
parent 2d832c9587
commit 6c493e2b14
4 changed files with 202 additions and 0 deletions
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -1134,6 +1134,54 @@ init_command = [
    'PyYAML==6.0.1',
 ]

+[[linter]]
+code = 'CODESPELL'
+command = [
+    'python3',
+    'tools/linter/adapters/codespell_linter.py',
+    '--',
+    '@{{PATHSFILE}}'
+]
+include_patterns = [
+    '**',
+]
+exclude_patterns = [
+    # We don't care too much about files in these directories, so don't
+    # enforce spelling on them
+    'caffe2/**',
+    'fb/**',
+    '**/fb/**',
+    'third_party/**',
+    'test/dynamo/cpython/**',
+    'torch/_vendor/**',
+    'torch/_inductor/fx_passes/serialized_patterns/**',
+    'torch/_inductor/autoheuristic/artifacts/**',
+    # These files are all grandfathered in, feel free to remove from this list
+    # as necessary
+    '*',
+    '.ci/**',
+    '.circleci/**',
+    '.github/**',
+    'aten/**',
+    'benchmarks/**',
+    'c10/**',
+    'cmake/**',
+    'docs/**',
+    'functorch/**',
+    'scripts/**',
+    'test/**',
+    'tools/**',
+    'torch/**',
+    'torchgen/**',
+]
+init_command = [
+    'python3',
+    'tools/linter/adapters/pip_init.py',
+    '--dry-run={{DRYRUN}}',
+    'codespell[toml]==2.4.1',
+]
+is_formatter = true
+
 # usort + ruff-format
 [[linter]]
 code = 'PYFMT'
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -281,3 +281,6 @@ select = [
 "tools/linter/**" = [
    "LOG015" # please fix
 ]
+
+[tool.codespell]
+ignore-words = "tools/linter/dictionary.txt"
--- a/tools/linter/adapters/codespell_linter.py
+++ b/tools/linter/adapters/codespell_linter.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import json
+import logging
+import os
+import subprocess
+import sys
+from enum import Enum
+from pathlib import Path
+from typing import NamedTuple
+
+
+REPO_ROOT = Path(__file__).absolute().parents[3]
+PYPROJECT = REPO_ROOT / "pyproject.toml"
+DICTIONARY = REPO_ROOT / "tools" / "linter" / "dictionary.txt"
+
+
+class LintSeverity(str, Enum):
+    ERROR = "error"
+    WARNING = "warning"
+    ADVICE = "advice"
+    DISABLED = "disabled"
+
+
+class LintMessage(NamedTuple):
+    path: str | None
+    line: int | None
+    char: int | None
+    code: str
+    severity: LintSeverity
+    name: str
+    original: str | None
+    replacement: str | None
+    description: str | None
+
+
+def format_error_message(
+    filename: str,
+    error: Exception | None = None,
+    *,
+    message: str | None = None,
+) -> LintMessage:
+    if message is None and error is not None:
+        message = (
+            f"Failed due to {error.__class__.__name__}:\n{error}\n"
+            "Please either fix the error or "
+            "add the word(s) to the dictionary file (lowercase is preferred)."
+        )
+    return LintMessage(
+        path=filename,
+        line=None,
+        char=None,
+        code="CODESPELL",
+        severity=LintSeverity.ERROR,
+        name="spelling error",
+        original=None,
+        replacement=None,
+        description=message,
+    )
+
+
+def run_codespell(path: Path) -> str:
+    try:
+        return subprocess.check_output(
+            [
+                sys.executable,
+                "-m",
+                "codespell_lib",
+                "--toml",
+                str(PYPROJECT),
+                str(path),
+            ],
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding="utf-8",
+        )
+    except subprocess.CalledProcessError as exc:
+        raise ValueError(exc.output) from exc
+
+
+def check_file(filename: str) -> list[LintMessage]:
+    path = Path(filename).absolute()
+    try:
+        run_codespell(path)
+    except Exception as err:
+        return [format_error_message(filename, err)]
+    return []
+
+
+def check_dictionary(filename: str) -> list[LintMessage]:
+    """Check the dictionary file for duplicates and case-insensitive sort order."""
+    path = Path(filename).absolute()
+    try:
+        words = path.read_text(encoding="utf-8").splitlines()
+        if len(words) != len(set(words)):
+            raise ValueError("The dictionary file contains duplicate entries.")
+        words = list(map(str.lower, words))
+        if words != sorted(words):
+            raise ValueError(
+                "The dictionary file is not sorted alphabetically (case-insensitive)."
+            )
+    except Exception as err:
+        return [format_error_message(str(filename), err)]
+    return []
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Check files for spelling mistakes using codespell.",
+        fromfile_prefix_chars="@",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="verbose logging",
+    )
+    parser.add_argument(
+        "filenames",
+        nargs="+",
+        help="paths to lint",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="<%(processName)s:%(levelname)s> %(message)s",
+        level=logging.NOTSET
+        if args.verbose
+        else logging.DEBUG
+        if len(args.filenames) < 1000
+        else logging.INFO,
+        stream=sys.stderr,
+    )
+
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=os.cpu_count(),
+    ) as executor:
+        futures = {executor.submit(check_file, x): x for x in args.filenames}
+        futures[executor.submit(check_dictionary, str(DICTIONARY))] = str(DICTIONARY)
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                for lint_message in future.result():
+                    print(json.dumps(lint_message._asdict()), flush=True)
+            except Exception:
+                logging.critical('Failed at "%s".', futures[future])
+                raise
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/linter/dictionary.txt
+++ b/tools/linter/dictionary.txt