Harden "Add annotations" workflow (#56071)

Summary:
Resolves https://github.com/pytorch/pytorch/issues/55810 by closing some possible security holes due to using [GitHub Actions `${{ <expressions> }}`](https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#about-contexts-and-expressions) in `.github/workflows/add_annotations.yml`, and also patching a few other scenarios in which a PR passing a malformed artifact could cause the workflow to fail.

- [x] flag and remove GitHub Actions expressions in JS scripts
- [x] don't fail the workflow if the artifact doesn't look as expected
- [x] write unit tests for `tools/extract_scripts.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/56071

Test Plan:
I tested the end-to-end "Lint" and "Add annotations" system in a separate sandbox repo, including the following cases:

- well-formed artifact
- missing artifact
- artifact containing a file named `linter-output.zip` (name clash)
- artifact whose `commit-sha.txt` doesn't contain a 40-digit hex string
- artifact whose `commit-sha.txt` contains a 40-digit hex string that isn't a valid Git hash for the current repo
  - in this last case, the workflow does fail, but handling that is the responsibility of [pytorch/add-annotations-github-action](https://github.com/pytorch/add-annotations-github-action), not pytorch/pytorch

To run the new unit tests added in this PR:

```
python tools/test/test_extract_scripts.py
```

Reviewed By: seemethere

Differential Revision: D27807074

Pulled By: samestep

fbshipit-source-id: e2d3cc5437fe80ff03d46237ebba289901bc567c
2025-10-20 21:14:14 +08:00 · 2021-04-16 07:44:56 -07:00
parent e387bd780e
commit c5e80d30bf
7 changed files with 157 additions and 45 deletions
--- a/.github/workflows/add_annotations.yml
+++ b/.github/workflows/add_annotations.yml
@ -15,7 +15,6 @@ jobs:
        name:
          - flake8-py3
          - clang-tidy
-    if: github.event.workflow_run.event == 'pull_request'
    runs-on: ubuntu-18.04
    steps:
      - name: Download artifact
@ -26,10 +25,10 @@ jobs:
            const artifacts = await github.actions.listWorkflowRunArtifacts({
              owner: context.repo.owner,
              repo: context.repo.repo,
-              run_id: ${{ github.event.workflow_run.id }},
+              run_id: process.env.RUN_ID,
            });
            const filteredArtifacts = artifacts.data.artifacts.filter(artifact => {
-              return artifact.name == '${{ matrix.name }}';
+              return artifact.name == process.env.LINT_NAME;
            });
            if (filteredArtifacts.length > 0) {
              const matchArtifact = filteredArtifacts[0];
@ -41,27 +40,27 @@ jobs:
              });
              const fs = require('fs');
              fs.writeFileSync(
-                '${{ github.workspace }}/linter-output.zip',
+                `${process.env.GITHUB_WORKSPACE}/linter-output.zip`,
                Buffer.from(download.data),
              );
            }
+        env:
+          RUN_ID: ${{ github.event.workflow_run.id }}
+          LINT_NAME: ${{ matrix.name }}
      - name: Unzip artifact
        id: unzip
        run: |
-          FILENAME=linter-output.zip
-          EXISTS=$([ -f $FILENAME ]; echo $?)
-          echo ::set-output name=exists::"$EXISTS"
-          if [ "$EXISTS" -eq 0 ]; then
-            unzip $FILENAME
-            echo ::set-output name=commit-sha::"$(cat commit-sha.txt)"
+          if unzip linter-output.zip annotations.json commit-sha.txt; then
+            echo ::set-output \
+              name=sha::"$(grep -Em1 '^[[:xdigit:]]{40}$' commit-sha.txt)"
          fi
-      - if: steps.unzip.outputs.exists == '0' # i.e. true
+      - if: steps.unzip.outputs.sha
        name: Add annotations
        uses: pytorch/add-annotations-github-action@master
        with:
          check_name: ${{ matrix.name }}
          linter_output_path: annotations.json
-          commit_sha: ${{ steps.unzip.outputs.commit-sha }}
+          commit_sha: ${{ steps.unzip.outputs.sha }}
          mode: json
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -31,6 +31,8 @@ jobs:
        run: |
          pip install ruamel.yaml==0.17.4
          .github/scripts/lint_native_functions.py
+      - name: Extract scripts from GitHub Actions workflows
+        run: tools/extract_scripts.py --out=.extracted_scripts
      - name: ShellCheck
        # https://github.com/koalaman/shellcheck/tree/v0.7.1#installing-a-pre-compiled-binary
        run: |
@ -40,9 +42,7 @@ jobs:
          sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
          rm -r "shellcheck-${scversion}"
          shellcheck --version
-          EXTRACT_DIR=.shellcheck_generated
-          tools/extract_scripts.py --out=$EXTRACT_DIR
-          tools/run_shellcheck.sh .jenkins/pytorch $EXTRACT_DIR
+          tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
      - name: Ensure correct trailing newlines
        run: |
          (! git grep -Il '' -- . ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)tools/clang_format_hash' | tools/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false))
--- a/.gitignore
+++ b/.gitignore
@ -14,7 +14,7 @@ coverage.xml
 .gradle
 .hypothesis
 .mypy_cache
-/.shellcheck_generated/
+/.extracted_scripts/
 **/.pytorch-test-times
 **/.pytorch-slow-tests
 */*.pyc
--- a/mypy-strict.ini
+++ b/mypy-strict.ini
@ -46,6 +46,7 @@ files =
    tools/pyi/*.py,
    tools/stats_utils/*.py,
    tools/test_history.py,
+    tools/test/test_extract_scripts.py,
    tools/test/test_mypy_wrapper.py,
    tools/test/test_test_history.py,
    tools/test/test_trailing_newlines.py,
--- a/tools/README.md
+++ b/tools/README.md
@ -39,11 +39,14 @@ Developer tools which you might find useful:

 * [clang_tidy.py](clang_tidy.py) - Script for running clang-tidy
  on lines of your script which you changed.
-* [extract_scripts.py](extract_scripts.py) - Extract shell scripts from
-  `.github/workflows/*.yml` into a specified dir, on which
-  [run_shellcheck.sh](run_shellcheck.sh) can be run. Exits with nonzero status
-  if any of the extracted scripts contain [GitHub Actions expressions][]: `${{
-  <expression> }}`
+* [extract_scripts.py](extract_scripts.py) - Extract scripts from
+  `.github/workflows/*.yml` into a specified dir, on which linters such as
+  [run_shellcheck.sh](run_shellcheck.sh) can be run. Assumes that every `run`
+  script has `shell: bash` unless a different shell is explicitly listed on that
+  specific step (so `defaults` doesn't currently work), but also has some rules
+  for other situations such as [actions/github-script][]. Exits with nonzero
+  status if any of the extracted scripts contain [GitHub Actions expressions][]:
+  `${{ <expression> }}`
 * [git_add_generated_dirs.sh](git_add_generated_dirs.sh) and
  [git_reset_generated_dirs.sh](git_reset_generated_dirs.sh) -
  Use this to force add generated files to your Git index, so that you
@ -85,6 +88,7 @@ Tools which are only situationally useful:
 * [run-clang-tidy-in-ci.sh](run-clang-tidy-in-ci.sh) - Responsible
  for checking that C++ code is clang-tidy clean in CI on Travis

+[actions/github-script]: https://github.com/actions/github-script
 [clang-tidy]: https://clang.llvm.org/extra/clang-tidy/
 [flake8]: https://flake8.pycqa.org/en/latest/
 [github actions expressions]: https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#about-contexts-and-expressions
--- a/tools/extract_scripts.py
+++ b/tools/extract_scripts.py
@ -1,24 +1,49 @@
 #!/usr/bin/env python3

-# these two pages have some relevant information:
-# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions
-# https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners
-
 import argparse
 import re
 import sys
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, Optional

 import yaml
+from typing_extensions import TypedDict

-Job = Dict[str, Any]
-
-windows_labels = {'windows-latest', 'windows-2019'}
+Step = Dict[str, Any]


-def get_default_shell(job: Job) -> str:
-    return 'pwsh' if job['runs-on'] in windows_labels else 'bash'
+class Script(TypedDict):
+    extension: str
+    script: str
+
+
+def extract(step: Step) -> Optional[Script]:
+    run = step.get('run')
+
+    # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#using-a-specific-shell
+    shell = step.get('shell', 'bash')
+    extension = {
+        'bash': '.sh',
+        'pwsh': '.ps1',
+        'python': '.py',
+        'sh': '.sh',
+        'cmd': '.cmd',
+        'powershell': '.ps1',
+    }.get(shell)
+
+    is_gh_script = step.get('uses', '').startswith('actions/github-script@')
+    gh_script = step.get('with', {}).get('script')
+
+    if run is not None and extension is not None:
+        script = {
+            'bash': f'#!/usr/bin/env bash\nset -eo pipefail\n{run}',
+            'sh': f'#!/usr/bin/env sh\nset -e\n{run}',
+        }.get(shell, run)
+        return {'extension': extension, 'script': script}
+    elif is_gh_script and gh_script is not None:
+        return {'extension': '.js', 'script': gh_script}
+    else:
+        return None


 def main() -> None:
@ -38,13 +63,13 @@ def main() -> None:

        for job_name, job in workflow['jobs'].items():
            job_dir = out / p / job_name
-            default_shell = get_default_shell(job)
            steps = job['steps']
            index_chars = len(str(len(steps) - 1))
            for i, step in enumerate(steps, start=1):
-                script = step.get('run')
-                if script:
-                    step_name = step['name']
+                extracted = extract(step)
+                if extracted:
+                    script = extracted['script']
+                    step_name = step.get('name', '')
                    if '${{' in script:
                        gha_expressions_found = True
                        print(
@ -52,17 +77,15 @@ def main() -> None:
                            file=sys.stderr
                        )

-                    if step.get('shell', default_shell) == 'bash':
-                        job_dir.mkdir(parents=True, exist_ok=True)
+                    job_dir.mkdir(parents=True, exist_ok=True)

-                        sanitized = re.sub(
-                            '[^a-zA-Z_]+', '_',
-                            f'_{step_name}',
-                        ).rstrip('_')
-                        filename = f'{i:0{index_chars}}{sanitized}.sh'
-                        (job_dir / filename).write_text(
-                            f'#!/usr/bin/env bash\nset -eo pipefail\n{script}'
-                        )
+                    sanitized = re.sub(
+                        '[^a-zA-Z_]+', '_',
+                        f'_{step_name}',
+                    ).rstrip('_')
+                    extension = extracted['extension']
+                    filename = f'{i:0{index_chars}}{sanitized}{extension}'
+                    (job_dir / filename).write_text(script)

    if gha_expressions_found:
        sys.exit(
--- a/tools/test/test_extract_scripts.py
+++ b/tools/test/test_extract_scripts.py
@ -0,0 +1,85 @@
+import unittest
+
+from tools import extract_scripts
+
+requirements_sh = '''
+#!/usr/bin/env bash
+set -eo pipefail
+pip install -r requirements.txt
+'''.strip()
+
+hello_sh = '''
+#!/usr/bin/env sh
+set -e
+echo hello world
+'''.strip()
+
+
+class TestExtractScripts(unittest.TestCase):
+    def test_extract_none(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Checkout PyTorch',
+                'uses': 'actions/checkout@v2',
+            }),
+            None,
+        )
+
+    def test_extract_run_default_bash(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Install requirements',
+                'run': 'pip install -r requirements.txt',
+            }),
+            {
+                'extension': '.sh',
+                'script': requirements_sh,
+            },
+        )
+
+    def test_extract_run_sh(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Hello world',
+                'run': 'echo hello world',
+                'shell': 'sh',
+            }),
+            {
+                'extension': '.sh',
+                'script': hello_sh,
+            },
+        )
+
+    def test_extract_run_py(self) -> None:
+        self.assertEqual(
+            extract_scripts.extract({
+                'name': 'Hello world',
+                'run': 'print("Hello!")',
+                'shell': 'python',
+            }),
+            {
+                'extension': '.py',
+                'script': 'print("Hello!")',
+            },
+        )
+
+    def test_extract_github_script(self) -> None:
+        self.assertEqual(
+            # https://github.com/actions/github-script/tree/v3.1.1#reading-step-results
+            extract_scripts.extract({
+                'uses': 'actions/github-script@v3',
+                'id': 'set-result',
+                'with': {
+                    'script': 'return "Hello!"',
+                    'result-encoding': 'string',
+                },
+            }),
+            {
+                'extension': '.js',
+                'script': 'return "Hello!"',
+            },
+        )
+
+
+if __name__ == '__main__':
+    unittest.main()