Translate annotation line numbers from merge to head (#55569)

Summary:
This PR

- adds a `tools/translate_annotations.py` script that
  - parses annotations into JSON using the regexes that we were previously passing to [`pytorch/add-annotations-github-action`](https://github.com/pytorch/add-annotations-github-action) and
  - uses `git diff-index` to translate the line numbers for those annotations from the PR `merge` onto the PR `head`, since (as of https://github.com/pytorch/pytorch/issues/54967) we now run CI on the former instead of the latter;
- modifies the `flake8-py3` and `clang-tidy` jobs to use that script and thus upload JSON in their artifacts instead of raw text; and
- modifies the "Add annotations" workflow to specify `mode: json` to allow it to use those preprocessed annotations.
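The first step above (regex parsing into JSON annotations) can be sketched as follows. This is a minimal illustration, not the script itself; the regex is the flake8 one used in the workflow, with the JS-style `(?<name>…)` groups written in Python's `(?P<name>…)` form:

```python
import json
import re

# Flake8 warning regex (Python named-group syntax)
FLAKE8_REGEX = (
    r'^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): '
    r'(?P<errorCode>\w+\d+) (?P<errorDesc>.*)'
)

def parse_line(line):
    """Turn one linter warning line into the JSON annotation shape."""
    m = re.match(FLAKE8_REGEX, line)
    if m is None:
        return None
    d = m.groupdict()
    # line/column numbers are emitted as integers, not strings
    d['lineNumber'] = int(d['lineNumber'])
    d['columnNumber'] = int(d['columnNumber'])
    return d

print(json.dumps(parse_line('foo.py:1:2: E999 SyntaxError')))
```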

Depends on https://github.com/pytorch/add-annotations-github-action/pull/18.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/55569

Test Plan:
You can run the unit tests with this command:
```
python tools/test/test_translate_annotations.py
```
I also tested the entire system together in my personal sandbox repo.
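The core idea those tests exercise can be sketched as follows. This is a simplified, hypothetical helper (not the script's exact implementation, which uses `bisect`): given the unified-diff hunks between the two versions of a file, walk the hunks to map a line number in the new version back to the old version, returning `None` for lines a hunk added or changed.

```python
# Each hunk is (old_start, old_count, new_start, new_count), as in a
# unified-diff "@@ -old_start,old_count +new_start,new_count @@" header.

def translate_line(hunks, line):
    offset = 0  # accumulated old-line minus new-line delta
    for old_start, old_count, new_start, new_count in hunks:
        if line < new_start:
            break  # line sits before this hunk; earlier offset applies
        if line < new_start + new_count:
            return None  # line was introduced by this hunk
        offset += old_count - new_count
    return line + offset

# Hunks from the lao/tzu example diff used in the unit tests:
hunks = [(1, 2, 0, 0), (4, 1, 2, 2), (11, 0, 11, 3)]
print(translate_line(hunks, 1))   # maps to line 3 of the old file
print(translate_line(hunks, 2))   # None: added in the new version
```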

Reviewed By: malfet

Differential Revision: D27662161

Pulled By: samestep

fbshipit-source-id: ecca51b79b9cf00c90fd89f0d41d0c7b89d69c63
Author: Sam Estep
Date: 2021-04-09 11:10:53 -07:00
Committed by: Facebook GitHub Bot
Parent: 11dd6d3dbb
Commit: f3367f917e
6 changed files with 504 additions and 11 deletions


@@ -12,11 +12,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        include:
-          - name: flake8-py3
-            regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorCode>\w+\d+) (?<errorDesc>.*)'
-          - name: clang-tidy
-            regex: '^(?<filename>.*?):(?<lineNumber>\d+):(?<columnNumber>\d+): (?<errorDesc>.*?) \[(?<errorCode>.*)\]'
+        name:
+          - flake8-py3
+          - clang-tidy
     if: github.event.workflow_run.event == 'pull_request'
     runs-on: ubuntu-18.04
     steps:
@@ -62,8 +60,8 @@ jobs:
         uses: pytorch/add-annotations-github-action@master
         with:
           check_name: ${{ matrix.name }}
-          linter_output_path: warnings.txt
+          linter_output_path: annotations.json
           commit_sha: ${{ steps.unzip.outputs.commit-sha }}
-          regex: ${{ matrix.regex }}
+          mode: json
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -115,6 +115,8 @@ jobs:
           architecture: x64
       - name: Fetch PyTorch
         uses: actions/checkout@v2
+        with:
+          fetch-depth: 2 # to allow us to use github.event.pull_request.head.sha
       - name: Prepare output dir with HEAD commit SHA
         run: |
           mkdir flake8-output
@@ -125,9 +127,21 @@ jobs:
       - name: Run flake8
         run: |
           set -eux
+          pip install typing-extensions # for tools/translate_annotations.py
          pip install -r requirements-flake8.txt
           flake8 --version
-          flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output/warnings.txt
+          flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt
+          cp flake8-output.txt flake8-output/annotations.json
+      - name: Translate annotations
+        if: github.event_name == 'pull_request'
+        run: |
+          tools/translate_annotations.py \
+            --file=flake8-output.txt \
+            --regex='^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)' \
+            --commit="$HEAD_SHA" \
+            > flake8-output/annotations.json
+        env:
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
       - name: Upload artifact
         uses: actions/upload-artifact@v2
         with:
@@ -135,7 +149,8 @@ jobs:
           path: flake8-output/
       - name: Fail if there were any warnings
         run: |
-          [ ! -s flake8-output/warnings.txt ]
+          cat flake8-output.txt
+          [ ! -s flake8-output.txt ]

   clang-tidy:
     if: github.event_name == 'pull_request'
@@ -148,6 +163,8 @@ jobs:
           architecture: x64
       - name: Checkout PyTorch
         uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # to allow tools/clang_tidy.py to do its thing
       - name: Prepare output dir with HEAD commit SHA
         run: |
           mkdir clang-tidy-output
@@ -225,11 +242,18 @@ jobs:
             -g"-torch/csrc/deploy/interpreter/interpreter.h" \
             -g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
             -g"-torch/csrc/deploy/interpreter/test_main.cpp" \
-            "$@" > "${GITHUB_WORKSPACE}"/clang-tidy-output/warnings.txt
+            "$@" > "${GITHUB_WORKSPACE}"/clang-tidy-output.txt

-          cat "${GITHUB_WORKSPACE}"/clang-tidy-output/warnings.txt
+          cat "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
+
+          tools/translate_annotations.py \
+            --file=clang-tidy-output.txt \
+            --regex='^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorDesc>.*?) \[(?P<errorCode>.*)\]' \
+            --commit="$HEAD_SHA" \
+            > clang-tidy-output/annotations.json
         env:
           BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
       - name: Upload artifact
         uses: actions/upload-artifact@v2
         with:


@@ -49,7 +49,9 @@ files =
     tools/test_history.py,
     tools/test/test_test_history.py,
     tools/test/test_trailing_newlines.py,
+    tools/test/test_translate_annotations.py,
     tools/trailing_newlines.py,
+    tools/translate_annotations.py,
     torch/testing/_internal/framework_utils.py,
     torch/utils/benchmark/utils/common.py,
     torch/utils/benchmark/utils/timer.py,


@@ -60,6 +60,11 @@ Developer tools which you might find useful:
   stdin, print names of nonempty files whose contents don't end in exactly one
   trailing newline, exit with status 1 if no output printed or 0 if some
   filenames were printed.
+* [translate_annotations.py](translate_annotations.py) - Read [Flake8][] or
+  [clang-tidy][] warnings (according to a `--regex`) from a `--file`, convert
+  to the JSON format accepted by [pytorch/add-annotations-github-action], and
+  translate line numbers from `HEAD` back in time to the given `--commit` by
+  running `git diff-index --unified=0` appropriately.

 Important if you want to run on AMD GPU:
@@ -80,5 +85,8 @@ Tools which are only situationally useful:
 * [run-clang-tidy-in-ci.sh](run-clang-tidy-in-ci.sh) - Responsible
   for checking that C++ code is clang-tidy clean in CI on Travis

 [clang-tidy]: https://clang.llvm.org/extra/clang-tidy/
+[flake8]: https://flake8.pycqa.org/en/latest/
 [github actions expressions]: https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#about-contexts-and-expressions
+[pytorch/add-annotations-github-action]: https://github.com/pytorch/add-annotations-github-action
 [shellcheck]: https://github.com/koalaman/shellcheck


@@ -0,0 +1,280 @@
import re
import unittest

from tools.translate_annotations import parse_annotation, parse_diff, translate

flake8_regex \
    = r'^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorCode>\w+\d+) (?P<errorDesc>.*)'
clang_tidy_regex \
    = r'^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): (?P<errorDesc>.*?) \[(?P<errorCode>.*)\]'

# in the below example patch, note that the filenames differ, so the
# translation should reflect that as well as the line numbers

# $ git clone -b 1.0.2 https://github.com/cscorley/whatthepatch.git
# $ cd whatthepatch/tests/casefiles
# $ git diff --no-index --unified=0 lao tzu
lao_tzu_diff = '''
diff --git a/lao b/tzu
index 635ef2c..5af88a8 100644
--- a/lao
+++ b/tzu
@@ -1,2 +0,0 @@
-The Way that can be told of is not the eternal Way;
-The name that can be named is not the eternal name.
@@ -4 +2,2 @@ The Nameless is the origin of Heaven and Earth;
-The Named is the mother of all things.
+The named is the mother of all things.
+
@@ -11,0 +11,3 @@ But after they are produced,
+They both may be called deep and profound.
+Deeper and more profound,
+The door of all subtleties!
'''.lstrip()

sparser_diff = '''
diff --git a/foo.txt b/bar.txt
index 27a6dad..6fae323 100644
--- a/foo.txt
+++ b/bar.txt
@@ -4,3 +4,2 @@ lines
-lines
-lines
-lines
+A change!!
+Wow
@@ -10,2 +8,0 @@ more lines
-even more
-even more
'''.lstrip()

new_file_diff = '''
diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h
new file mode 100644
index 0000000000..a81eeae346
--- /dev/null
+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <torch/csrc/jit/tensorexpr/tensor.h>
+
+namespace torch {
+namespace jit {
+namespace tensorexpr {
+
+TORCH_API Tensor* conv2d_depthwise(
+    BufHandle input,
+    BufHandle weight,
+    BufHandle bias,
+    int stride,
+    int pad,
+    int groups);
+
+} // namespace tensorexpr
+} // namespace jit
+} // namespace torch
'''.lstrip()

# fun fact, this example fools VS Code's diff syntax highlighter
haskell_diff = '''
diff --git a/hello.hs b/hello.hs
index ffb8d4ad14..0872ac9db6 100644
--- a/hello.hs
+++ b/hello.hs
@@ -1 +1 @@
--- a/hello/world/example
+main = putStrLn "Hello, world!"
'''.lstrip()


class TestTranslateAnnotations(unittest.TestCase):
    maxDiff = None

    def test_parse_diff_lao_tzu(self) -> None:
        self.assertEqual(
            parse_diff(lao_tzu_diff),
            {
                'old_filename': 'lao',
                'hunks': [
                    {
                        'old_start': 1,
                        'old_count': 2,
                        'new_start': 0,
                        'new_count': 0,
                    },
                    {
                        'old_start': 4,
                        'old_count': 1,
                        'new_start': 2,
                        'new_count': 2,
                    },
                    {
                        'old_start': 11,
                        'old_count': 0,
                        'new_start': 11,
                        'new_count': 3,
                    },
                ],
            },
        )

    def test_parse_diff_new_file(self) -> None:
        self.assertEqual(
            parse_diff(new_file_diff),
            {
                'old_filename': None,
                'hunks': [
                    {
                        'old_start': 0,
                        'old_count': 0,
                        'new_start': 1,
                        'new_count': 19,
                    },
                ],
            },
        )

    def test_parse_diff_haskell(self) -> None:
        self.assertEqual(
            parse_diff(haskell_diff),
            {
                'old_filename': 'hello.hs',
                'hunks': [
                    {
                        'old_start': 1,
                        'old_count': 1,
                        'new_start': 1,
                        'new_count': 1,
                    },
                ],
            },
        )

    def test_translate_lao_tzu(self) -> None:
        # we'll pretend that this diff represents the file lao being
        # renamed to tzu and also modified
        diff = parse_diff(lao_tzu_diff)

        # line numbers less than 1 are invalid so they map to None
        self.assertEqual(translate(diff, -1), None)
        self.assertEqual(translate(diff, 0), None)

        # the first two lines of the file were removed, so the first
        # line of the new version corresponds to the third line of the
        # original
        self.assertEqual(translate(diff, 1), 3)

        # the second and third lines of the new file were not present in
        # the original version, so they map to None
        self.assertEqual(translate(diff, 2), None)
        self.assertEqual(translate(diff, 3), None)

        # at this point, we have a stretch of lines that are identical
        # in both versions of the file, but the original version of the
        # file had 4 lines before this section whereas the new version
        # has only 3 lines before this section
        self.assertEqual(translate(diff, 4), 5)
        self.assertEqual(translate(diff, 5), 6)
        self.assertEqual(translate(diff, 6), 7)
        self.assertEqual(translate(diff, 7), 8)
        self.assertEqual(translate(diff, 8), 9)
        self.assertEqual(translate(diff, 9), 10)
        self.assertEqual(translate(diff, 10), 11)

        # these three lines were added in the new version of the file,
        # so they map to None
        self.assertEqual(translate(diff, 11), None)
        self.assertEqual(translate(diff, 12), None)
        self.assertEqual(translate(diff, 13), None)

        # the diff doesn't say how long the file is, so we keep mapping
        # line numbers back; since we can look back at the original
        # files, though, we can see that the original is two lines
        # shorter than the new version, which explains why we are
        # subtracting 2 here
        self.assertEqual(translate(diff, 14), 12)
        self.assertEqual(translate(diff, 15), 13)

    def test_translate_empty(self) -> None:
        diff = parse_diff('--- a/foo')

        # again, we start numbering at 1
        self.assertEqual(translate(diff, -1), None)
        self.assertEqual(translate(diff, 0), None)

        # this diff says there are no changes, so all line numbers
        # greater than zero map to themselves
        self.assertEqual(translate(diff, 1), 1)
        self.assertEqual(translate(diff, 2), 2)
        self.assertEqual(translate(diff, 3), 3)
        self.assertEqual(translate(diff, 4), 4)
        self.assertEqual(translate(diff, 5), 5)

    def test_translate_sparser(self) -> None:
        diff = parse_diff(sparser_diff)

        # again, we start numbering at 1
        self.assertEqual(translate(diff, -1), None)
        self.assertEqual(translate(diff, 0), None)

        # the first three lines are unchanged
        self.assertEqual(translate(diff, 1), 1)
        self.assertEqual(translate(diff, 2), 2)
        self.assertEqual(translate(diff, 3), 3)

        # we removed three lines here and added two, so the two lines we
        # added don't map back to anything in the original file
        self.assertEqual(translate(diff, 4), None)
        self.assertEqual(translate(diff, 5), None)

        # we have some unchanged lines here, but in the preceding hunk
        # we removed 3 and added only 2, so we have an offset of 1
        self.assertEqual(translate(diff, 6), 7)
        self.assertEqual(translate(diff, 7), 8)

        # since the unified diff format essentially subtracts 1 from the
        # starting line number when the count is 0, and since we use
        # bisect.bisect_right to decide which hunk to look at, an
        # earlier version of translate had a bug that caused it to get
        # confused because it would look at the second hunk (which lists
        # 8 as its start line number) rather than the first hunk
        self.assertEqual(translate(diff, 8), 9)

        # after the two lines that we removed in the second hunk, we've
        # reduced the total length of the file by 3 lines, so once we
        # reach the end of the diff, we just add 3 to every line number
        self.assertEqual(translate(diff, 9), 12)
        self.assertEqual(translate(diff, 10), 13)
        self.assertEqual(translate(diff, 11), 14)
        self.assertEqual(translate(diff, 12), 15)

    def test_parse_annotation_flake8(self) -> None:
        regex = re.compile(flake8_regex)
        self.assertEqual(
            parse_annotation(regex, 'README.md:1:3: R100 make a better title'),
            {
                'filename': 'README.md',
                'lineNumber': 1,
                'columnNumber': 3,
                'errorCode': 'R100',
                'errorDesc': 'make a better title',
            },
        )

    def test_parse_annotation_clang_tidy(self) -> None:
        regex = re.compile(clang_tidy_regex)
        self.assertEqual(
            parse_annotation(regex, 'README.md:2:1: improve description [R200]'),
            {
                'filename': 'README.md',
                'lineNumber': 2,
                'columnNumber': 1,
                'errorCode': 'R200',
                'errorDesc': 'improve description',
            },
        )


if __name__ == '__main__':
    unittest.main()

tools/translate_annotations.py Executable file

@@ -0,0 +1,181 @@
#!/usr/bin/env python3

import argparse
import json
import re
import subprocess
from bisect import bisect_right
from collections import defaultdict
from typing import (Callable, DefaultDict, Generic, List, Optional, Pattern,
                    Sequence, TypeVar, cast)

from typing_extensions import TypedDict


class Hunk(TypedDict):
    old_start: int
    old_count: int
    new_start: int
    new_count: int


class Diff(TypedDict):
    old_filename: Optional[str]
    hunks: List[Hunk]


# adapted from the similar regex in tools/clang_tidy.py
# @@ -start,count +start,count @@
hunk_pattern = r'^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@'


def parse_diff(diff: str) -> Diff:
    name = None
    name_found = False
    hunks: List[Hunk] = []
    for line in diff.splitlines():
        hunk_match = re.match(hunk_pattern, line)
        if name_found:
            if hunk_match:
                old_start, old_count, new_start, new_count = hunk_match.groups()
                hunks.append({
                    'old_start': int(old_start),
                    'old_count': int(old_count or '1'),
                    'new_start': int(new_start),
                    'new_count': int(new_count or '1'),
                })
        else:
            assert not hunk_match
            name_match = re.match(r'^--- (?:(?:/dev/null)|(?:a/(.*)))$', line)
            if name_match:
                name_found = True
                name, = name_match.groups()
    return {
        'old_filename': name,
        'hunks': hunks,
    }


T = TypeVar('T')
U = TypeVar('U')


# we want to use bisect.bisect_right to find the closest hunk to a given
# line number, but the bisect module won't have a key function until
# Python 3.10 https://github.com/python/cpython/pull/20556 so we make an
# O(1) wrapper around the list of hunks that makes it pretend to just be
# a list of line numbers
# https://gist.github.com/ericremoreynolds/2d80300dabc70eebc790
class KeyifyList(Generic[T, U]):
    def __init__(self, inner: List[T], key: Callable[[T], U]) -> None:
        self.inner = inner
        self.key = key

    def __len__(self) -> int:
        return len(self.inner)

    def __getitem__(self, k: int) -> U:
        return self.key(self.inner[k])


def translate(diff: Diff, line_number: int) -> Optional[int]:
    if line_number < 1:
        return None

    hunks = diff['hunks']
    if not hunks:
        return line_number

    keyified = KeyifyList(
        hunks,
        lambda hunk: hunk['new_start'] + (0 if hunk['new_count'] > 0 else 1)
    )
    i = bisect_right(cast(Sequence[int], keyified), line_number)
    if i < 1:
        return line_number

    hunk = hunks[i - 1]
    d = line_number - (hunk['new_start'] + (hunk['new_count'] or 1))
    return None if d < 0 else hunk['old_start'] + (hunk['old_count'] or 1) + d


# we use camelCase here because this will be output as JSON and so the
# field names need to match the group names from here:
# https://github.com/pytorch/add-annotations-github-action/blob/3ab7d7345209f5299d53303f7aaca7d3bc09e250/action.yml#L23
class Annotation(TypedDict):
    filename: str
    lineNumber: int
    columnNumber: int
    errorCode: str
    errorDesc: str


def parse_annotation(regex: Pattern[str], line: str) -> Optional[Annotation]:
    m = re.match(regex, line)
    if m:
        try:
            line_number = int(m.group('lineNumber'))
            column_number = int(m.group('columnNumber'))
        except ValueError:
            return None
        return {
            'filename': m.group('filename'),
            'lineNumber': line_number,
            'columnNumber': column_number,
            'errorCode': m.group('errorCode'),
            'errorDesc': m.group('errorDesc'),
        }
    else:
        return None


def translate_all(
    *,
    lines: List[str],
    regex: Pattern[str],
    commit: str
) -> List[Annotation]:
    ann_dict: DefaultDict[str, List[Annotation]] = defaultdict(list)
    for line in lines:
        annotation = parse_annotation(regex, line)
        if annotation is not None:
            ann_dict[annotation['filename']].append(annotation)
    ann_list = []
    for filename, annotations in ann_dict.items():
        raw_diff = subprocess.check_output(
            ['git', 'diff-index', '--unified=0', commit, filename],
            encoding='utf-8',
        )
        diff = parse_diff(raw_diff) if raw_diff.strip() else None
        # if there is a diff but it doesn't list an old filename, that
        # means the file is absent in the commit we're targeting, so we
        # skip it
        if not (diff and not diff['old_filename']):
            for annotation in annotations:
                line_number: Optional[int] = annotation['lineNumber']
                if diff:
                    annotation['filename'] = cast(str, diff['old_filename'])
                    line_number = translate(diff, cast(int, line_number))
                if line_number:
                    annotation['lineNumber'] = line_number
                    ann_list.append(annotation)
    return ann_list


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--file')
    parser.add_argument('--regex')
    parser.add_argument('--commit')
    args = parser.parse_args()
    with open(args.file, 'r') as f:
        lines = f.readlines()
    print(json.dumps(translate_all(
        lines=lines,
        regex=args.regex,
        commit=args.commit
    )))


if __name__ == '__main__':
    main()