From 2f547ae6132f0e32b86fbd0c787823f46e188125 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 22 Feb 2023 04:39:19 +0000 Subject: [PATCH] Remove SHA checksum for bazel http_archive from GitHub (#95039) An action item from https://github.com/pytorch/pytorch/issues/94346 Although the security practice of setting the checksum is good, it doesn't work when the archive is downloaded from some sites like GitHub because it can change. Specifically, GitHub gives no guarantee to keep the same value forever https://github.com/community/community/discussions/46034. This also adds a new linter to make sure that SHA checksum from GitHub can be removed quickly. The WORKSPACE file is actually updated using the new linter: ``` >>> Lint for WORKSPACE: Advice (BAZEL_LINTER) format Redundant SHA checksum. Run `lintrunner -a` to apply this patch. You can run `lintrunner -a` to apply this patch. 5 5 | 6 6 | http_archive( 7 7 | name = "rules_cuda", 7 |- sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333", 9 8 | strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda", 10 9 | urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"], 11 10 | ) -------------------------------------------------------------------------------- 29 28 | name = "pybind11_bazel", 30 29 | strip_prefix = "pybind11_bazel-992381ced716ae12122360b0fbadbc3dda436dbf", 31 30 | urls = ["https://github.com/pybind/pybind11_bazel/archive/992381ced716ae12122360b0fbadbc3dda436dbf.zip"], 31 |- sha256 = "3dc6435bd41c058453efe102995ef084d0a86b0176fd6a67a6b7100a2e9a940e", 33 31 | ) 34 32 | 35 33 | new_local_repository( -------------------------------------------------------------------------------- 52 50 | urls = [ 53 51 | "https://github.com/gflags/gflags/archive/v2.2.2.tar.gz", 54 52 | ], 54 |- sha256 = "34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf", 56 53 | ) 57 54 | 58 55 | new_local_repository( ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/95039 Approved by: https://github.com/ZainRizvi --- .lintrunner.toml | 21 +++ WORKSPACE | 3 - tools/linter/adapters/bazel_linter.py | 175 ++++++++++++++++++++++ tools/linter/adapters/s3_init_config.json | 10 ++ 4 files changed, 206 insertions(+), 3 deletions(-) create mode 100644 tools/linter/adapters/bazel_linter.py diff --git a/.lintrunner.toml b/.lintrunner.toml index 89d817b90de1..dd94aae4a1d3 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -895,3 +895,24 @@ command = [ '--', '@{{PATHSFILE}}' ] + +[[linter]] +code = 'BAZEL_LINTER' +include_patterns = ['WORKSPACE'] +command = [ + 'python3', + 'tools/linter/adapters/bazel_linter.py', + '--binary=.lintbin/bazel', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + 'tools/linter/adapters/s3_init.py', + '--config-json=tools/linter/adapters/s3_init_config.json', + '--linter=bazel', + '--dry-run={{DRYRUN}}', + '--output-dir=.lintbin', + '--output-name=bazel', +] +is_formatter = true diff --git a/WORKSPACE b/WORKSPACE index 29badf579543..5d2a0b78fd63 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -5,7 +5,6 @@ load("//tools/rules:workspace.bzl", "new_patched_local_repository") http_archive( name = "rules_cuda", - sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333", strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda", urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"], ) @@ -29,7 +28,6 @@ http_archive( name = "pybind11_bazel", strip_prefix = "pybind11_bazel-992381ced716ae12122360b0fbadbc3dda436dbf", urls = ["https://github.com/pybind/pybind11_bazel/archive/992381ced716ae12122360b0fbadbc3dda436dbf.zip"], - sha256 = "3dc6435bd41c058453efe102995ef084d0a86b0176fd6a67a6b7100a2e9a940e", ) new_local_repository( @@ -52,7 +50,6 @@ http_archive( urls = [ "https://github.com/gflags/gflags/archive/v2.2.2.tar.gz", ], - sha256 = "34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf", ) new_local_repository( diff --git a/tools/linter/adapters/bazel_linter.py b/tools/linter/adapters/bazel_linter.py new file mode 100644 index 000000000000..fd8eddea4841 --- /dev/null +++ b/tools/linter/adapters/bazel_linter.py @@ -0,0 +1,175 @@ +""" +This linter ensures that users don't set a SHA hash checksum in Bazel for the http_archive. +Although the security practice of setting the checksum is good, it doesn't work when the +archive is downloaded from some sites like GitHub because it can change. Specifically, +GitHub gives no guarantee to keep the same value forever. Check for more details at +https://github.com/community/community/discussions/46034. +""" +import argparse +import json +import re +import subprocess +import xml.etree.ElementTree as ET +from enum import Enum +from typing import List, NamedTuple, Optional, Set +from urllib.parse import urlparse + + +LINTER_CODE = "BAZEL_LINTER" +SHA256_REGEX = re.compile(r"\s*sha256\s*=\s*['\"](?P[a-zA-Z0-9]{64})['\"]\s*,") +DOMAINS_WITH_UNSTABLE_CHECKSUM = {"github.com"} + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def is_required_checksum(urls: List[Optional[str]]) -> bool: + if not urls: + return False + + for url in urls: + if not url: + continue + + parsed_url = urlparse(url) + if parsed_url.hostname in DOMAINS_WITH_UNSTABLE_CHECKSUM: + return False + + return True + + +def get_disallowed_checksums( + binary: str, +) -> Set[str]: + """ + Return the set of disallowed checksums from all http_archive rules + """ + try: + # Use bazel to get the list of external dependencies in XML format + proc = subprocess.run( + [binary, "query", "kind(http_archive, //external:*)", "--output=xml"], + capture_output=True, + ) + except OSError: + raise + + stdout = str(proc.stdout, "utf-8").strip() + root = ET.fromstring(stdout) + + disallowed_checksums = set() + # Parse all the http_archive rules in the XML output + for rule in root.findall('.//rule[@class="http_archive"]'): + urls_node = rule.find('.//list[@name="urls"]') + if urls_node is None: + continue + urls = [n.get("value") for n in urls_node.findall(".//string")] + + checksum_node = rule.find('.//string[@name="sha256"]') + if checksum_node is None: + continue + checksum = checksum_node.get("value") + + if not checksum: + continue + + if not is_required_checksum(urls): + disallowed_checksums.add(checksum) + + return disallowed_checksums + + +def check_bazel( + filename: str, + disallowed_checksums: Set[str], +) -> List[LintMessage]: + original = "" + replacement = "" + + with open(filename) as f: + for line in f: + original += f"{line}" + + m = SHA256_REGEX.match(line) + if m: + sha256 = m.group("sha256") + + if sha256 in disallowed_checksums: + continue + + replacement += f"{line}" + + if original == replacement: + return [] + + return [ + LintMessage( + path=filename, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ADVICE, + name="format", + original=original, + replacement=replacement, + description="Found redundant SHA checksums. Run `lintrunner -a` to apply this patch.", + ) + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description="A custom linter to detect redundant SHA checksums in Bazel", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--binary", + required=True, + help="bazel binary path", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + args = parser.parse_args() + + try: + disallowed_checksums = get_disallowed_checksums(args.binary) + except Exception as e: + err_msg = LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=(f"Failed due to {e.__class__.__name__}:\n{e}"), + ) + print(json.dumps(err_msg._asdict()), flush=True) + exit(0) + + for filename in args.filenames: + for lint_message in check_bazel(filename, disallowed_checksums): + print(json.dumps(lint_message._asdict()), flush=True) + + +if __name__ == "__main__": + main() diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index a6362a12922b..dbb20e2ed7a0 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -39,5 +39,15 @@ "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint", "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76" } + }, + "bazel": { + "Darwin": { + "download_url": "https://ossci-macos.s3.amazonaws.com/bazel-4.2.1-darwin-x86_64", + "hash": "74d93848f0c9d592e341e48341c53c87e3cb304a54a2a1ee9cff3df422f0b23c" + }, + "Linux": { + "download_url": "https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64", + "hash": "1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c" + } } }