From 6c493e2b14c83bdf40daaa23f2210fca432c9a55 Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Mon, 16 Jun 2025 18:55:08 +0800
Subject: [PATCH] [BE] add `codespell` linter (#156066)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156066
Approved by: https://github.com/malfet
---
 .lintrunner.toml                          |  48 +++++++
 pyproject.toml                            |   3 +
 tools/linter/adapters/codespell_linter.py | 151 ++++++++++++++++++++++
 tools/linter/dictionary.txt               |   0
 4 files changed, 202 insertions(+)
 create mode 100644 tools/linter/adapters/codespell_linter.py
 create mode 100644 tools/linter/dictionary.txt

diff --git a/.lintrunner.toml b/.lintrunner.toml
index c4dda86d5a4d..e3684ef1be82 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -1134,6 +1134,54 @@ init_command = [
     'PyYAML==6.0.1',
 ]
 
+[[linter]]
+code = 'CODESPELL'
+command = [
+    'python3',
+    'tools/linter/adapters/codespell_linter.py',
+    '--',
+    '@{{PATHSFILE}}'
+]
+include_patterns = [
+    '**',
+]
+exclude_patterns = [
+    # We don't care too much about files in these directories, so don't enforce
+    # spelling on them
+    'caffe2/**',
+    'fb/**',
+    '**/fb/**',
+    'third_party/**',
+    'test/dynamo/cpython/**',
+    'torch/_vendor/**',
+    'torch/_inductor/fx_passes/serialized_patterns/**',
+    'torch/_inductor/autoheuristic/artifacts/**',
+    # These files are all grandfathered in, feel free to remove from this list
+    # as necessary
+    '*',
+    '.ci/**',
+    '.circleci/**',
+    '.github/**',
+    'aten/**',
+    'benchmarks/**',
+    'c10/**',
+    'cmake/**',
+    'docs/**',
+    'functorch/**',
+    'scripts/**',
+    'test/**',
+    'tools/**',
+    'torch/**',
+    'torchgen/**',
+]
+init_command = [
+    'python3',
+    'tools/linter/adapters/pip_init.py',
+    '--dry-run={{DRYRUN}}',
+    'codespell[toml]==2.4.1',
+]
+is_formatter = true
+
 # usort + ruff-format
 [[linter]]
 code = 'PYFMT'
diff --git a/pyproject.toml b/pyproject.toml
index 2e18f53163bc..ccf9c2aeb4f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -281,3 +281,6 @@ select = [
 "tools/linter/**" = [
     "LOG015"  # please fix
 ]
+
+[tool.codespell]
+ignore-words = "tools/linter/dictionary.txt"
diff --git a/tools/linter/adapters/codespell_linter.py b/tools/linter/adapters/codespell_linter.py
new file mode 100644
index 000000000000..1bdf5b978af4
--- /dev/null
+++ b/tools/linter/adapters/codespell_linter.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import json
+import logging
+import os
+import subprocess
+import sys
+from enum import Enum
+from pathlib import Path
+from typing import NamedTuple
+
+
+REPO_ROOT = Path(__file__).absolute().parents[3]  # tools/linter/adapters/ -> repo root
+PYPROJECT = REPO_ROOT / "pyproject.toml"  # handed to codespell via --toml
+DICTIONARY = REPO_ROOT / "tools" / "linter" / "dictionary.txt"
+
+
+class LintSeverity(str, Enum):
+    ERROR = "error"
+    WARNING = "warning"
+    ADVICE = "advice"
+    DISABLED = "disabled"
+
+
+class LintMessage(NamedTuple):
+    path: str | None
+    line: int | None
+    char: int | None
+    code: str
+    severity: LintSeverity
+    name: str
+    original: str | None
+    replacement: str | None
+    description: str | None
+
+
+def format_error_message(
+    filename: str,
+    error: Exception | None = None,
+    *,
+    message: str | None = None,
+) -> LintMessage:
+    if message is None and error is not None:  # an explicit message wins over error
+        message = (
+            f"Failed due to {error.__class__.__name__}:\n{error}\n"
+            "Please either fix the error or "
+            "add the word(s) to the dictionary file (lowercase is preferred)."
+        )
+    return LintMessage(
+        path=filename,
+        line=None,
+        char=None,
+        code="CODESPELL",
+        severity=LintSeverity.ERROR,
+        name="spelling error",
+        original=None,
+        replacement=None,
+        description=message,
+    )
+
+
+def run_codespell(path: Path) -> str:
+    try:
+        return subprocess.check_output(
+            [
+                sys.executable,
+                "-m",
+                "codespell_lib",
+                "--toml",
+                str(PYPROJECT),
+                str(path),
+            ],
+            stderr=subprocess.STDOUT,  # merge stderr into the captured output
+            text=True,
+            encoding="utf-8",
+        )
+    except subprocess.CalledProcessError as exc:  # raised on a non-zero exit status
+        raise ValueError(exc.output) from exc  # carry the captured output
+
+
+def check_file(filename: str) -> list[LintMessage]:
+    path = Path(filename).absolute()
+    try:
+        run_codespell(path)
+    except Exception as err:  # any failure becomes one file-level lint message
+        return [format_error_message(filename, err)]
+    return []
+
+
+def check_dictionary(filename: str) -> list[LintMessage]:
+    """Check that the dictionary file has no duplicate lines and is sorted."""
+    path = Path(filename).absolute()
+    try:
+        words = path.read_text(encoding="utf-8").splitlines()
+        if len(words) != len(set(words)):  # exact match, before lowercasing
+            raise ValueError("The dictionary file contains duplicate entries.")
+        words = list(map(str.lower, words))
+        if words != sorted(words):
+            raise ValueError(
+                "The dictionary file is not sorted alphabetically (case-insensitive)."
+            )
+    except Exception as err:  # also covers read errors, e.g. a missing file
+        return [format_error_message(str(filename), err)]
+    return []
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Check files for spelling mistakes using codespell.",
+        fromfile_prefix_chars="@",  # "@file" arguments are read from that file
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="verbose logging",
+    )
+    parser.add_argument(
+        "filenames",
+        nargs="+",
+        help="paths to lint",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="<%(processName)s:%(levelname)s> %(message)s",
+        level=logging.NOTSET
+        if args.verbose
+        else logging.DEBUG
+        if len(args.filenames) < 1000
+        else logging.INFO,
+        stream=sys.stderr,
+    )
+
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=os.cpu_count(),
+    ) as executor:
+        futures = {executor.submit(check_file, x): x for x in args.filenames}
+        futures[executor.submit(check_dictionary, str(DICTIONARY))] = str(DICTIONARY)
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                for lint_message in future.result():
+                    print(json.dumps(lint_message._asdict()), flush=True)
+            except Exception:
+                logging.critical('Failed at "%s".', futures[future])
+                raise  # re-raise after logging which input failed
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/linter/dictionary.txt b/tools/linter/dictionary.txt
new file mode 100644
index 000000000000..e69de29bb2d1