Enable ruff on benchmark and scripts (#40634)

* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
Authored by Yuanyuan Chen on 2025-09-10 17:38:06 +08:00, committed by GitHub
parent 08edec9f7d
commit a5ecd94a3f
11 changed files with 661 additions and 649 deletions


@@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src
check_dirs := examples tests src utils
check_dirs := examples tests src utils scripts benchmark benchmark_v2
exclude_folders := ""
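The Makefile's lint targets operate on check_dirs, so adding scripts, benchmark, and benchmark_v2 here is what brings those directories under ruff. The same checks can also be run on them directly; a sketch of the equivalent commands, assuming the repo's standard ruff configuration:

    ruff check scripts benchmark benchmark_v2           # lint
    ruff format --check scripts benchmark benchmark_v2  # verify formatting without modifying files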


@@ -11,25 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import Logger
import os
import sys
from logging import Logger
from threading import Event, Thread
from time import perf_counter, sleep
from typing import Optional
import sys
# Add the parent directory to Python path to import benchmarks_entrypoint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks_entrypoint import MetricsRecorder
import gpustat
import psutil
import psycopg2
from benchmarks_entrypoint import MetricsRecorder
# Optional heavy ML dependencies - only required when actually running the benchmark
try:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
@@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
):
# Check if required ML dependencies are available
if not TRANSFORMERS_AVAILABLE:
@@ -71,11 +80,11 @@ def run_benchmark(
logger.error("pip install torch transformers")
logger.error("Skipping LLaMA benchmark due to missing dependencies.")
return
continue_metric_collection = Event()
metrics_thread = None
model_id = "meta-llama/Llama-2-7b-hf"
# If no metrics_recorder is provided, create one for backward compatibility
if metrics_recorder is None:
try:
@@ -154,7 +163,7 @@ def run_benchmark(
# First eager forward pass
logger.info("running first eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
first_eager_fwd_pass_time = end - start
@@ -163,7 +172,7 @@ def run_benchmark(
# Second eager forward pass (should be faster)
logger.info("running second eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
second_eager_fwd_pass_time = end - start
@@ -339,7 +348,7 @@ def run_benchmark(
continue_metric_collection.set()
if metrics_thread is not None:
metrics_thread.join()
# Only close the recorder if we created it locally
if should_close_recorder:
metrics_recorder.close()
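The measurement pattern in this file - perf_counter() around the forward pass followed by torch.cuda.synchronize() - is what makes the reported GPU timings meaningful: CUDA kernels launch asynchronously, so without the synchronize the clock would stop at kernel launch rather than completion. A minimal standalone sketch of the same idea (an illustrative helper, not part of this commit):

    from time import perf_counter

    import torch

    def time_cuda(fn) -> float:
        torch.cuda.synchronize()  # drain pending work before starting the clock
        start = perf_counter()
        fn()  # e.g. lambda: model(**inputs)
        torch.cuda.synchronize()  # wait for the launched kernels to finish
        return perf_counter() - start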


@@ -31,9 +31,7 @@ from contextlib import contextmanager
from pathlib import Path
from git import Repo
from huggingface_hub import HfApi
from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main


@@ -13,19 +13,20 @@
# limitations under the License.
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
import uuid
from datetime import datetime
from typing import Dict, Tuple, Optional, List
import pandas as pd
try:
from psycopg2.extensions import register_adapter
from psycopg2.extras import Json
register_adapter(dict, Json)
PSYCOPG2_AVAILABLE = True
except ImportError:
@@ -38,8 +39,14 @@ class ImportModuleException(Exception):
class MetricsRecorder:
def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
collect_csv_data: bool = True
self,
connection,
logger: logging.Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
collect_csv_data: bool = True,
):
self.conn = connection
self.use_database = connection is not None
@@ -51,27 +58,43 @@ class MetricsRecorder:
self.commit_id = commit_id
self.commit_msg = commit_msg
self.collect_csv_data = collect_csv_data
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
if self.collect_csv_data:
# Initialize empty DataFrames with proper schemas
self.benchmarks_df = pd.DataFrame(columns=[
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
'metadata', 'created_at'
])
self.device_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
'gpu_mem_megabytes', 'time'
])
self.model_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
'second_eager_generate_time_secs', 'time_to_first_token_secs',
'time_to_second_token_secs', 'time_to_third_token_secs',
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
'fourth_compile_generate_time_secs'
])
self.benchmarks_df = pd.DataFrame(
columns=[
"benchmark_id",
"repository",
"branch",
"commit_id",
"commit_message",
"metadata",
"created_at",
]
)
self.device_measurements_df = pd.DataFrame(
columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
)
self.model_measurements_df = pd.DataFrame(
columns=[
"benchmark_id",
"time",
"model_load_time",
"first_eager_forward_pass_time_secs",
"second_eager_forward_pass_time_secs",
"first_eager_generate_time_secs",
"second_eager_generate_time_secs",
"time_to_first_token_secs",
"time_to_second_token_secs",
"time_to_third_token_secs",
"time_to_next_token_mean_secs",
"first_compile_generate_time_secs",
"second_compile_generate_time_secs",
"third_compile_generate_time_secs",
"fourth_compile_generate_time_secs",
]
)
else:
self.benchmarks_df = None
self.device_measurements_df = None
@@ -83,7 +106,7 @@ class MetricsRecorder:
"""
# Generate a unique UUID for this benchmark
benchmark_id = str(uuid.uuid4())
if self.use_database:
with self.conn.cursor() as cur:
cur.execute(
@@ -91,28 +114,32 @@ class MetricsRecorder:
(benchmark_id, self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
)
self.logger.debug(f"initialised benchmark #{benchmark_id}")
# Store benchmark data for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'repository': self.repository,
'branch': self.branch,
'commit_id': self.commit_id,
'commit_message': self.commit_msg,
'metadata': json.dumps(metadata),
'created_at': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"repository": self.repository,
"branch": self.branch,
"commit_id": self.commit_id,
"commit_message": self.commit_msg,
"metadata": json.dumps(metadata),
"created_at": datetime.utcnow().isoformat(),
}
]
)
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
mode_info = []
if self.use_database:
mode_info.append("database")
if self.collect_csv_data:
mode_info.append("CSV")
mode_str = " + ".join(mode_info) if mode_info else "no storage"
self.logger.debug(f"initialised benchmark #{benchmark_id} ({mode_str} mode)")
return benchmark_id
@@ -123,16 +150,20 @@ class MetricsRecorder:
# Store device measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'cpu_util': cpu_util,
'mem_megabytes': mem_megabytes,
'gpu_util': gpu_util,
'gpu_mem_megabytes': gpu_mem_megabytes,
'time': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"cpu_util": cpu_util,
"mem_megabytes": mem_megabytes,
"gpu_util": gpu_util,
"gpu_mem_megabytes": gpu_mem_megabytes,
"time": datetime.utcnow().isoformat(),
}
]
)
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
# Store in database if available
if self.use_database:
with self.conn.cursor() as cur:
@@ -140,7 +171,7 @@ class MetricsRecorder:
"INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
)
self.logger.debug(
f"collected device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
)
@@ -149,16 +180,13 @@ class MetricsRecorder:
# Store model measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame with flattened measurements
row_data = {
'benchmark_id': benchmark_id,
'time': datetime.utcnow().isoformat()
}
row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
# Flatten the measurements dict into the row
row_data.update(measurements)
new_row = pd.DataFrame([row_data])
self.model_measurements_df = pd.concat([self.model_measurements_df, new_row], ignore_index=True)
# Store in database if available
if self.use_database:
with self.conn.cursor() as cur:
@@ -174,7 +202,7 @@ class MetricsRecorder:
measurements,
),
)
self.logger.debug(f"collected model measurements for benchmark #{benchmark_id}: {measurements}")
def export_to_csv(self, output_dir: str = "benchmark_results"):
@@ -184,19 +212,19 @@ class MetricsRecorder:
if not self.collect_csv_data:
self.logger.warning("CSV data collection is disabled - no CSV files will be generated")
return
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.logger.info(f"Created output directory: {output_dir}")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
files_created = []
# Export using pandas DataFrames
self._export_pandas_data(output_dir, timestamp, files_created)
self.logger.info(f"CSV export complete! Created {len(files_created)} files in {output_dir}")
def _export_pandas_data(self, output_dir: str, timestamp: str, files_created: list):
"""
Export CSV files using pandas DataFrames
@@ -206,24 +234,24 @@ class MetricsRecorder:
self.benchmarks_df.to_csv(benchmarks_file, index=False)
files_created.append(benchmarks_file)
self.logger.info(f"Exported {len(self.benchmarks_df)} benchmark records to {benchmarks_file}")
# Export device measurements
device_file = os.path.join(output_dir, f"device_measurements_{timestamp}.csv")
self.device_measurements_df.to_csv(device_file, index=False)
files_created.append(device_file)
self.logger.info(f"Exported {len(self.device_measurements_df)} device measurement records to {device_file}")
# Export model measurements (already flattened)
model_file = os.path.join(output_dir, f"model_measurements_{timestamp}.csv")
self.model_measurements_df.to_csv(model_file, index=False)
files_created.append(model_file)
self.logger.info(f"Exported {len(self.model_measurements_df)} model measurement records to {model_file}")
# Create comprehensive summary using pandas operations
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.csv")
self._create_summary(summary_file)
files_created.append(summary_file)
def _create_summary(self, summary_file: str):
"""
Create a comprehensive summary CSV using pandas operations
@@ -234,36 +262,42 @@ class MetricsRecorder:
summary_df.to_csv(summary_file, index=False)
self.logger.info(f"Created empty benchmark summary at {summary_file}")
return
# Start with benchmarks as the base
summary_df = self.benchmarks_df.copy()
# Add model measurements (join on benchmark_id)
if len(self.model_measurements_df) > 0:
# Drop 'time' column from model measurements to avoid conflicts
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
# Calculate device measurement aggregates using pandas groupby
if len(self.device_measurements_df) > 0:
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
'cpu_util': ['mean', 'max', 'std', 'count'],
'mem_megabytes': ['mean', 'max', 'std'],
'gpu_util': ['mean', 'max', 'std'],
'gpu_mem_megabytes': ['mean', 'max', 'std']
}).round(3)
device_agg = (
self.device_measurements_df.groupby("benchmark_id")
.agg(
{
"cpu_util": ["mean", "max", "std", "count"],
"mem_megabytes": ["mean", "max", "std"],
"gpu_util": ["mean", "max", "std"],
"gpu_mem_megabytes": ["mean", "max", "std"],
}
)
.round(3)
)
# Flatten column names
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
device_agg = device_agg.reset_index()
# Rename count column to be more descriptive
if 'cpu_util_count' in device_agg.columns:
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
if "cpu_util_count" in device_agg.columns:
device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
# Merge with summary
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
# Export the comprehensive summary
summary_df.to_csv(summary_file, index=False)
self.logger.info(f"Created comprehensive benchmark summary with {len(summary_df)} records at {summary_file}")
@@ -312,23 +346,18 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
type=str,
help="The commit message associated with the commit, truncated to 70 characters.",
)
parser.add_argument(
"--csv",
action="store_true",
default=False,
help="Enable CSV output files generation."
)
parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
parser.add_argument(
"--csv-output-dir",
type=str,
default="benchmark_results",
help="Directory for CSV output files (default: benchmark_results)."
help="Directory for CSV output files (default: benchmark_results).",
)
args = parser.parse_args()
# CSV is disabled by default, only enabled when --csv is used
generate_csv = args.csv
@@ -353,9 +382,10 @@ def create_database_connection():
if not PSYCOPG2_AVAILABLE:
logger.warning("psycopg2 not available - running in CSV-only mode")
return None
try:
import psycopg2
conn = psycopg2.connect("dbname=metrics")
logger.info("Successfully connected to database")
return conn
@@ -364,27 +394,28 @@ def create_database_connection():
return None
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
generate_csv: bool = False) -> MetricsRecorder:
def create_global_metrics_recorder(
repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
) -> MetricsRecorder:
"""
Create a global metrics recorder that will be used across all benchmarks.
"""
connection = create_database_connection()
recorder = MetricsRecorder(connection, logger, repository, branch, commit_id, commit_msg, generate_csv)
# Log the storage mode
storage_modes = []
if connection is not None:
storage_modes.append("database")
if generate_csv:
storage_modes.append("CSV")
if not storage_modes:
logger.warning("Running benchmarks with NO data storage (no database connection, CSV disabled)")
logger.warning("Use --csv flag to enable CSV output when database is unavailable")
else:
logger.info(f"Running benchmarks with: {' + '.join(storage_modes)} storage")
return recorder
@@ -393,16 +424,16 @@ if __name__ == "__main__":
benches_folder_path = os.path.join(benchmarks_folder_path, "benches")
repository, branch, commit_id, commit_msg, generate_csv, csv_output_dir = parse_arguments()
# Create a global metrics recorder
global_metrics_recorder = create_global_metrics_recorder(repository, branch, commit_id, commit_msg, generate_csv)
successful_benchmarks = 0
failed_benchmarks = 0
# Automatically discover all benchmark modules in benches/ folder
benchmark_modules = []
if os.path.exists(benches_folder_path):
logger.debug(f"Scanning for benchmarks in: {benches_folder_path}")
for entry in os.scandir(benches_folder_path):
@@ -410,12 +441,12 @@ if __name__ == "__main__":
continue
if entry.name.startswith("__"): # Skip __init__.py, __pycache__, etc.
continue
# Check if the file has a run_benchmark function
try:
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
module = import_from_path(entry.name.split(".")[0], entry.path)
if hasattr(module, 'run_benchmark'):
if hasattr(module, "run_benchmark"):
benchmark_modules.append(entry.name)
logger.debug(f"discovered benchmark: {entry.name}")
else:
@@ -436,16 +467,18 @@ if __name__ == "__main__":
logger.debug(f"loading: {module_name}")
module = import_from_path(module_name.split(".")[0], module_path)
logger.info(f"running benchmarks in: {module_name}")
# Check if the module has an updated run_benchmark function that accepts metrics_recorder
try:
# Try the new signature first
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
except TypeError:
# Fall back to the old signature for backward compatibility
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
logger.warning(
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
)
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
successful_benchmarks += 1
except ImportModuleException as e:
logger.error(e)
@@ -461,7 +494,7 @@ if __name__ == "__main__":
logger.info(f"CSV reports have been generated and saved to the {csv_output_dir} directory")
else:
logger.info("CSV generation disabled - no CSV files created (use --csv to enable)")
logger.info(f"Benchmark run completed. Successful: {successful_benchmarks}, Failed: {failed_benchmarks}")
except Exception as e:
logger.error(f"Failed to export CSV results: {e}")
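The try/except TypeError around module.run_benchmark above is a lightweight way to support two generations of a plugin signature at once. A standalone sketch of the pattern (names are illustrative, not from this repo):

    import logging

    def dispatch(run_benchmark, logger: logging.Logger, recorder) -> None:
        try:
            # Prefer the newer signature that accepts a shared metrics recorder.
            run_benchmark(logger, recorder)
        except TypeError:
            # Legacy modules only take a logger. Caveat: this also catches a
            # TypeError raised *inside* a new-style function; inspecting
            # inspect.signature(run_benchmark).parameters is the stricter check.
            run_benchmark(logger)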


@@ -3,7 +3,11 @@ import subprocess
def main(config_dir, config_name, args):
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)
subprocess.run(
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+ args
)
if __name__ == "__main__":


@@ -1 +1 @@
# Benchmark implementations directory


@@ -12,55 +12,63 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from typing import Dict, Any, List
from benchmark_framework import ModelBenchmark
import os
from typing import Any
import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")
class LLaMABenchmark(ModelBenchmark):
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
def __init__(self, logger: logging.Logger):
super().__init__(logger)
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
def get_scenario_configs(self) -> List[Dict[str, Any]]:
def get_scenario_configs(self) -> list[dict[str, Any]]:
"""
Get LLaMA-specific scenario configurations.
Returns:
List of scenario configuration dictionaries
"""
return [
# Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
{
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
{
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
]
def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available for LLaMA."""
try:
from kernels import Mode, kernelize
from kernels import Mode, kernelize # noqa: F401
return True
except ImportError:
self.logger.debug("Kernelization not available: kernels module not found")
return False
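The # noqa: F401 added above is directly tied to this commit: ruff's F401 rule flags unused imports, but here the import exists only to probe whether the optional kernels package is installed. A minimal sketch of the same probe, with a hypothetical optional dependency:

    try:
        import flash_attn  # noqa: F401 - imported only to test availability
        HAS_FLASH_ATTN = True
    except ImportError:
        HAS_FLASH_ATTN = False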
def get_default_generation_config(self) -> Dict[str, Any]:
def get_default_generation_config(self) -> dict[str, Any]:
"""Get LLaMA-specific generation configuration."""
return {
"do_sample": False,
@@ -69,20 +77,19 @@ class LLaMABenchmark(ModelBenchmark):
"repetition_penalty": 1.0,
"max_new_tokens": None, # Will be set per scenario
}
def get_model_init_kwargs(self, config) -> Dict[str, Any]:
def get_model_init_kwargs(self, config) -> dict[str, Any]:
"""Get LLaMA-specific model initialization kwargs."""
from benchmark_framework import BenchmarkConfig
return {
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation,
"use_cache": True,
}
def get_default_torch_dtype(self) -> str:
"""Get default torch dtype for LLaMA."""
return "float16" # LLaMA works well with float16
def get_default_device(self) -> str:
"""Get default device for LLaMA."""
return "cuda" # LLaMA prefers CUDA
@@ -91,35 +98,37 @@ class LLaMABenchmark(ModelBenchmark):
def run_llama(logger, output_dir, **kwargs):
"""
Run LLaMA benchmark with the given configuration.
Args:
logger: Logger instance
output_dir: Output directory for results
**kwargs: Additional configuration options
Returns:
Path to output file if successful
"""
from benchmark_framework import BenchmarkRunner
# Extract parameters with defaults
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
warmup_iterations = kwargs.get('warmup_iterations', 3)
measurement_iterations = kwargs.get('measurement_iterations', 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
device = kwargs.get('device', 'cuda')
torch_dtype = kwargs.get('torch_dtype', 'float16')
batch_size = kwargs.get('batch_size', 1)
commit_id = kwargs.get('commit_id', None)
model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get("device", "cuda")
torch_dtype = kwargs.get("torch_dtype", "float16")
batch_size = kwargs.get("batch_size", 1)
commit_id = kwargs.get("commit_id")
logger.info(f"Starting LLaMA benchmark for model: {model_id}")
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")
logger.info(
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
)
try:
# Create benchmark instance
benchmark = LLaMABenchmark(logger)
# Create scenarios
scenarios = benchmark.create_scenarios(
model_id=model_id,
@@ -129,28 +138,29 @@ def run_llama(logger, output_dir, **kwargs):
include_sdpa_variants=include_sdpa_variants,
device=device,
torch_dtype=torch_dtype,
batch_size=batch_size
batch_size=batch_size,
)
logger.info(f"Created {len(scenarios)} benchmark scenarios")
# Create runner and execute benchmarks
runner = BenchmarkRunner(logger, output_dir)
results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)
if not results:
logger.warning("No successful benchmark results")
return None
# Save results
model_name = model_id.split('/')[-1] # Extract model name from ID
model_name = model_id.split("/")[-1] # Extract model name from ID
output_file = runner.save_results(model_name, results)
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
return output_file
except Exception as e:
logger.error(f"LLaMA benchmark failed: {e}")
import traceback
logger.debug(traceback.format_exc())
raise

File diff suppressed because it is too large.


@@ -14,350 +14,304 @@
# limitations under the License.
"""
Top-level benchmarking script that automatically discovers and runs all benchmarks
in the ./benches directory, organizing outputs into model-specific subfolders.
"""
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any, Optional
def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
"""Setup logging configuration."""
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError(f'Invalid log level: {log_level}')
raise ValueError(f"Invalid log level: {log_level}")
handlers = [logging.StreamHandler(sys.stdout)]
if enable_file_logging:
handlers.append(
logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
)
handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
logging.basicConfig(
level=numeric_level,
format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
handlers=handlers
level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
)
return logging.getLogger(__name__)
def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
"""
Discover all benchmark modules in the benches directory.
Returns:
List of dictionaries containing benchmark module info
"""
benchmarks = []
benches_path = Path(benches_dir)
if not benches_path.exists():
raise FileNotFoundError(f"Benches directory not found: {benches_dir}")
for py_file in benches_path.glob("*.py"):
if py_file.name.startswith("__"):
continue
module_name = py_file.stem
try:
# Import the module
spec = importlib.util.spec_from_file_location(module_name, py_file)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
# Check if it has a benchmark runner function
if hasattr(module, f'run_{module_name}'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, f'run_{module_name}')
})
elif hasattr(module, 'run_benchmark'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, 'run_benchmark')
})
if hasattr(module, f"run_{module_name}"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, f"run_{module_name}"),
}
)
elif hasattr(module, "run_benchmark"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, "run_benchmark"),
}
)
else:
logging.warning(f"No runner function found in {py_file}")
except Exception as e:
logging.error(f"Failed to import {py_file}: {e}")
return benchmarks
def run_single_benchmark(
benchmark_info: Dict[str, Any],
output_dir: str,
logger: logging.Logger,
**kwargs
benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
) -> Optional[str]:
"""
Run a single benchmark and return the output file path.
Args:
benchmark_info: Dictionary containing benchmark module info
output_dir: Base output directory
logger: Logger instance
**kwargs: Additional arguments to pass to the benchmark
Returns:
Path to the output file if successful, None otherwise
"""
benchmark_name = benchmark_info['name']
runner_func = benchmark_info['runner_function']
benchmark_name = benchmark_info["name"]
runner_func = benchmark_info["runner_function"]
logger.info(f"Running benchmark: {benchmark_name}")
try:
# Check function signature to determine what arguments to pass
import inspect
sig = inspect.signature(runner_func)
# Prepare arguments based on function signature
func_kwargs = {
'logger': logger,
'output_dir': output_dir
}
func_kwargs = {"logger": logger, "output_dir": output_dir}
# Add other kwargs if the function accepts them
for param_name in sig.parameters:
if param_name in kwargs:
func_kwargs[param_name] = kwargs[param_name]
# Filter kwargs to only include parameters the function accepts
# If function has **kwargs, include all provided kwargs
has_var_kwargs = any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values())
if has_var_kwargs:
valid_kwargs = {**func_kwargs, **kwargs}
else:
valid_kwargs = {k: v for k, v in func_kwargs.items()
if k in sig.parameters}
valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
# Run the benchmark
result = runner_func(**valid_kwargs)
if isinstance(result, str):
# Function returned a file path
return result
else:
logger.info(f"Benchmark {benchmark_name} completed successfully")
return "completed"
except Exception as e:
logger.error(f"Benchmark {benchmark_name} failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return None
def generate_summary_report(
output_dir: str,
benchmark_results: Dict[str, Any],
logger: logging.Logger
) -> str:
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
"""Generate a summary report of all benchmark runs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
summary_data = {
"run_metadata": {
"timestamp": datetime.utcnow().isoformat(),
"total_benchmarks": len(benchmark_results),
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
},
"benchmark_results": benchmark_results,
"output_directory": output_dir
"output_directory": output_dir,
}
with open(summary_file, 'w') as f:
with open(summary_file, "w") as f:
json.dump(summary_data, f, indent=2, default=str)
logger.info(f"Summary report saved to: {summary_file}")
return summary_file
def main():
"""Main entry point for the benchmarking script."""
parser = argparse.ArgumentParser(
description="Run all benchmarks in the ./benches directory"
)
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
parser.add_argument(
"--output-dir",
type=str,
default="benchmark_results",
help="Base output directory for benchmark results (default: benchmark_results)"
help="Base output directory for benchmark results (default: benchmark_results)",
)
parser.add_argument(
"--benches-dir",
type=str,
default="./benches",
help="Directory containing benchmark implementations (default: ./benches)"
help="Directory containing benchmark implementations (default: ./benches)",
)
parser.add_argument(
"--log-level",
type=str,
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
default="INFO",
help="Logging level (default: INFO)"
help="Logging level (default: INFO)",
)
parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
parser.add_argument(
"--model-id",
type=str,
help="Specific model ID to benchmark (if supported by benchmarks)"
"--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
)
parser.add_argument(
"--warmup-iterations",
type=int,
default=3,
help="Number of warmup iterations (default: 3)"
)
parser.add_argument(
"--measurement-iterations",
type=int,
default=5,
help="Number of measurement iterations (default: 5)"
)
parser.add_argument(
"--num-tokens-to-generate",
type=int,
default=100,
help="Number of tokens to generate in benchmarks (default: 100)"
help="Number of tokens to generate in benchmarks (default: 100)",
)
parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
parser.add_argument(
"--include",
type=str,
nargs="*",
help="Only run benchmarks matching these names"
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
parser.add_argument(
"--exclude",
type=str,
nargs="*",
help="Exclude benchmarks matching these names"
)
parser.add_argument(
"--enable-mock",
action="store_true",
help="Enable mock benchmark (skipped by default)"
)
parser.add_argument(
"--enable-file-logging",
action="store_true",
help="Enable file logging (disabled by default)"
)
parser.add_argument(
"--commit-id",
type=str,
help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
args = parser.parse_args()
# Setup logging
logger = setup_logging(args.log_level, args.enable_file_logging)
logger.info("Starting benchmark discovery and execution")
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Benches directory: {args.benches_dir}")
# Create output directory
os.makedirs(args.output_dir, exist_ok=True)
try:
# Discover benchmarks
benchmarks = discover_benchmarks(args.benches_dir)
logger.info(f"Discovered {len(benchmarks)} benchmark(s): {[b['name'] for b in benchmarks]}")
if not benchmarks:
logger.warning("No benchmarks found!")
return 1
# Filter benchmarks based on include/exclude
filtered_benchmarks = benchmarks
if args.include:
filtered_benchmarks = [b for b in filtered_benchmarks
if any(pattern in b['name'] for pattern in args.include)]
filtered_benchmarks = [
b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
]
logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
if args.exclude:
filtered_benchmarks = [b for b in filtered_benchmarks
if not any(pattern in b['name'] for pattern in args.exclude)]
filtered_benchmarks = [
b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
]
logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
if not filtered_benchmarks:
logger.warning("No benchmarks remaining after filtering!")
return 1
# Prepare common kwargs for benchmarks
benchmark_kwargs = {
'warmup_iterations': args.warmup_iterations,
'measurement_iterations': args.measurement_iterations,
'num_tokens_to_generate': args.num_tokens_to_generate
"warmup_iterations": args.warmup_iterations,
"measurement_iterations": args.measurement_iterations,
"num_tokens_to_generate": args.num_tokens_to_generate,
}
if args.model_id:
benchmark_kwargs['model_id'] = args.model_id
benchmark_kwargs["model_id"] = args.model_id
# Add enable_mock flag for mock benchmark
benchmark_kwargs['enable_mock'] = args.enable_mock
benchmark_kwargs["enable_mock"] = args.enable_mock
# Add commit_id if provided
if args.commit_id:
benchmark_kwargs['commit_id'] = args.commit_id
benchmark_kwargs["commit_id"] = args.commit_id
# Run benchmarks
benchmark_results = {}
successful_count = 0
for benchmark_info in filtered_benchmarks:
result = run_single_benchmark(
benchmark_info,
args.output_dir,
logger,
**benchmark_kwargs
)
benchmark_results[benchmark_info['name']] = result
result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
benchmark_results[benchmark_info["name"]] = result
if result is not None:
successful_count += 1
# Generate summary report
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)
# Final summary
total_benchmarks = len(filtered_benchmarks)
failed_count = total_benchmarks - successful_count
logger.info("=" * 60)
logger.info("BENCHMARK RUN SUMMARY")
logger.info("=" * 60)
@@ -366,20 +320,21 @@ def main():
logger.info(f"Failed: {failed_count}")
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Summary report: {summary_file}")
if failed_count > 0:
logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
return 1
else:
logger.info("All benchmarks completed successfully!")
return 0
except Exception as e:
logger.error(f"Benchmark run failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return 1
if __name__ == "__main__":
sys.exit(main())


@@ -4,8 +4,8 @@ import datasets
import transformers
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
from transformers.utils import logging
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import logging
logging.set_verbosity_info()
@@ -22,7 +22,9 @@
wrong = 0
def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_diff(
spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
@@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
return False
def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_details(
line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
]
for j in possible_matches:
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
) and check_details(
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
@@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
if skip_assert:
return
assert (
slow_ids == fast_ids
), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
assert slow_ids == fast_ids, (
f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
)
def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:


@@ -15,6 +15,7 @@
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
import os
from datetime import datetime as dt
@@ -39,10 +40,11 @@ def main():
for i, issue in enumerate(open_issues):
print(i, issue)
comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
last_comment = comments[0] if len(comments) > 0 else None
if (
last_comment is not None and last_comment.user.login == "github-actions[bot]"
last_comment is not None
and last_comment.user.login == "github-actions[bot]"
and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())