From 11a43b6cad8d6f1f52738af49ca5307cd5b1b1be Mon Sep 17 00:00:00 2001 From: Slim Frikha Date: Mon, 25 Aug 2025 12:50:15 +0400 Subject: [PATCH] [env] fix: Improve License Check Hook Flexibility (#3202) ### What does this PR do? Solve #3201 #### Problem The existing license check hook scans all directories recursively from a single root directory, which causes issues in local development environments: * Virtual environments (`.venv`, `venv/`) get scanned and fail license checks * No easy way to exclude common build/cache directories without hardcoding exclusions * Different behavior between local development (with venvs) and CI/CD (clean environment) #### Solution Modified the `check_license.py` script to accept multiple target directories instead of a single root directory with exclusions. ### Design & Code Changes Changed argument from `--directory` to `--directories` * Now accepts multiple `Path` arguments using `nargs="+"` * Allows specifying exactly which directories to scan * in local mode: `--directories examples recipe scripts tests verl setup.py` * in github workflow: `--directories .` --- .github/workflows/sanity.yml | 4 ++-- .pre-commit-config.yaml | 2 +- tests/special_sanity/check_license.py | 31 ++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sanity.yml b/.github/workflows/sanity.yml index ce759b826..39eaf0e31 100644 --- a/.github/workflows/sanity.yml +++ b/.github/workflows/sanity.yml @@ -12,7 +12,7 @@ # - `special_sanity`: a suite of quick sanity tests # - `special_standalone`: a set of test that are designed to run in dedicated environments -# Accelerators for tests +# Accelerators for tests # - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`. # - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment. @@ -78,7 +78,7 @@ jobs: pytest -s -x tests/special_sanity - name: Run license test run: | - python3 tests/special_sanity/check_license.py --directory . + python3 tests/special_sanity/check_license.py --directories . - name: Assert naming convention run: | if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b4c7b843..bd77c3620 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,6 @@ repos: hooks: - id: check-license name: Check license - entry: python3 tests/special_sanity/check_license.py --directory . + entry: python3 tests/special_sanity/check_license.py --directories examples recipe scripts tests verl setup.py language: python pass_filenames: false diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index 1a2073e6b..a4ade0244 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -13,6 +13,7 @@ # limitations under the License. from argparse import ArgumentParser from pathlib import Path +from typing import Iterable license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates" license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates" @@ -35,13 +36,37 @@ license_headers = [ ] +def get_py_files(path_arg: Path) -> Iterable[Path]: + """get py files under a dir. if already py file return it + + Args: + path_arg (Path): path to scan for py files + + Returns: + Iterable[Path]: list of py files + """ + if path_arg.is_dir(): + return path_arg.glob("**/*.py") + elif path_arg.is_file() and path_arg.suffix == ".py": + return [path_arg] + return [] + + if __name__ == "__main__": parser = ArgumentParser() - parser.add_argument("--directory", "-d", required=True, type=str) + parser.add_argument( + "--directories", + "-d", + required=True, + type=Path, + nargs="+", + help="List of directories to check for license headers", + ) args = parser.parse_args() - directory_in_str = args.directory - pathlist = Path(directory_in_str).glob("**/*.py") + # Collect all Python files from specified directories + pathlist = set(path for path_arg in args.directories for path in get_py_files(path_arg)) + for path in pathlist: # because path is object not string path_in_str = str(path.absolute())