Enable ruff on benchmark and scripts (#40634)

* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
Author: Yuanyuan Chen
Date: 2025-09-10 17:38:06 +08:00 (committed by GitHub)
Parent: 08edec9f7d
Commit: a5ecd94a3f
11 changed files with 661 additions and 649 deletions

View File

@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src
check_dirs := examples tests src utils
check_dirs := examples tests src utils scripts benchmark benchmark_v2
exclude_folders := ""
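
The expanded check_dirs above is what brings scripts, benchmark and benchmark_v2 under the repository's ruff targets. As a rough illustration only (the Makefile recipes themselves are not part of this hunk), the same coverage can be exercised directly with ruff's CLI; the paths simply mirror the newly added entries, and ruff is assumed to be installed and configured via the repository's pyproject.toml:

# Sketch: run ruff linting and a formatting check over the directories newly added to check_dirs.
import subprocess

new_dirs = ["scripts", "benchmark", "benchmark_v2"]
subprocess.run(["ruff", "check", *new_dirs], check=True)           # lint
subprocess.run(["ruff", "format", "--check", *new_dirs], check=True)  # formatting check only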

View File

@ -11,25 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import Logger
import os
import sys
from logging import Logger
from threading import Event, Thread
from time import perf_counter, sleep
from typing import Optional
import sys
# Add the parent directory to Python path to import benchmarks_entrypoint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks_entrypoint import MetricsRecorder
import gpustat
import psutil
import psycopg2
from benchmarks_entrypoint import MetricsRecorder
# Optional heavy ML dependencies - only required when actually running the benchmark
try:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
):
# Check if required ML dependencies are available
if not TRANSFORMERS_AVAILABLE:
@ -154,7 +163,7 @@ def run_benchmark(
# First eager forward pass
logger.info("running first eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
first_eager_fwd_pass_time = end - start
@ -163,7 +172,7 @@ def run_benchmark(
# Second eager forward pass (should be faster)
logger.info("running second eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
second_eager_fwd_pass_time = end - start
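
The two hunks above only rename the throwaway variable to `_`; the measurement pattern is unchanged. For readers skimming the diff, a minimal sketch of that pattern as shown here (model and inputs stand in for the objects built earlier in run_benchmark):

# Sketch of the eager forward-pass timing used above: synchronize the GPU before
# reading the clock so the measurement covers all queued CUDA work.
from time import perf_counter

import torch


def time_forward_pass(model, inputs) -> float:
    start = perf_counter()
    _ = model(**inputs)          # output discarded; only the latency matters
    torch.cuda.synchronize()     # wait for the kernels to finish before stopping the clock
    return perf_counter() - start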

View File

@ -31,9 +31,7 @@ from contextlib import contextmanager
from pathlib import Path
from git import Repo
from huggingface_hub import HfApi
from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main

View File

@ -13,19 +13,20 @@
# limitations under the License.
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
import uuid
from datetime import datetime
from typing import Dict, Tuple, Optional, List
import pandas as pd
try:
from psycopg2.extensions import register_adapter
from psycopg2.extras import Json
register_adapter(dict, Json)
PSYCOPG2_AVAILABLE = True
except ImportError:
@ -38,8 +39,14 @@ class ImportModuleException(Exception):
class MetricsRecorder:
def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
collect_csv_data: bool = True
self,
connection,
logger: logging.Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
collect_csv_data: bool = True,
):
self.conn = connection
self.use_database = connection is not None
@ -55,23 +62,39 @@ class MetricsRecorder:
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
if self.collect_csv_data:
# Initialize empty DataFrames with proper schemas
self.benchmarks_df = pd.DataFrame(columns=[
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
'metadata', 'created_at'
])
self.device_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
'gpu_mem_megabytes', 'time'
])
self.model_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
'second_eager_generate_time_secs', 'time_to_first_token_secs',
'time_to_second_token_secs', 'time_to_third_token_secs',
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
'fourth_compile_generate_time_secs'
])
self.benchmarks_df = pd.DataFrame(
columns=[
"benchmark_id",
"repository",
"branch",
"commit_id",
"commit_message",
"metadata",
"created_at",
]
)
self.device_measurements_df = pd.DataFrame(
columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
)
self.model_measurements_df = pd.DataFrame(
columns=[
"benchmark_id",
"time",
"model_load_time",
"first_eager_forward_pass_time_secs",
"second_eager_forward_pass_time_secs",
"first_eager_generate_time_secs",
"second_eager_generate_time_secs",
"time_to_first_token_secs",
"time_to_second_token_secs",
"time_to_third_token_secs",
"time_to_next_token_mean_secs",
"first_compile_generate_time_secs",
"second_compile_generate_time_secs",
"third_compile_generate_time_secs",
"fourth_compile_generate_time_secs",
]
)
else:
self.benchmarks_df = None
self.device_measurements_df = None
@ -95,15 +118,19 @@ class MetricsRecorder:
# Store benchmark data for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'repository': self.repository,
'branch': self.branch,
'commit_id': self.commit_id,
'commit_message': self.commit_msg,
'metadata': json.dumps(metadata),
'created_at': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"repository": self.repository,
"branch": self.branch,
"commit_id": self.commit_id,
"commit_message": self.commit_msg,
"metadata": json.dumps(metadata),
"created_at": datetime.utcnow().isoformat(),
}
]
)
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
mode_info = []
@ -123,14 +150,18 @@ class MetricsRecorder:
# Store device measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'cpu_util': cpu_util,
'mem_megabytes': mem_megabytes,
'gpu_util': gpu_util,
'gpu_mem_megabytes': gpu_mem_megabytes,
'time': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"cpu_util": cpu_util,
"mem_megabytes": mem_megabytes,
"gpu_util": gpu_util,
"gpu_mem_megabytes": gpu_mem_megabytes,
"time": datetime.utcnow().isoformat(),
}
]
)
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
# Store in database if available
@ -149,10 +180,7 @@ class MetricsRecorder:
# Store model measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame with flattened measurements
row_data = {
'benchmark_id': benchmark_id,
'time': datetime.utcnow().isoformat()
}
row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
# Flatten the measurements dict into the row
row_data.update(measurements)
@ -241,28 +269,34 @@ class MetricsRecorder:
# Add model measurements (join on benchmark_id)
if len(self.model_measurements_df) > 0:
# Drop 'time' column from model measurements to avoid conflicts
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
# Calculate device measurement aggregates using pandas groupby
if len(self.device_measurements_df) > 0:
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
'cpu_util': ['mean', 'max', 'std', 'count'],
'mem_megabytes': ['mean', 'max', 'std'],
'gpu_util': ['mean', 'max', 'std'],
'gpu_mem_megabytes': ['mean', 'max', 'std']
}).round(3)
device_agg = (
self.device_measurements_df.groupby("benchmark_id")
.agg(
{
"cpu_util": ["mean", "max", "std", "count"],
"mem_megabytes": ["mean", "max", "std"],
"gpu_util": ["mean", "max", "std"],
"gpu_mem_megabytes": ["mean", "max", "std"],
}
)
.round(3)
)
# Flatten column names
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
device_agg = device_agg.reset_index()
# Rename count column to be more descriptive
if 'cpu_util_count' in device_agg.columns:
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
if "cpu_util_count" in device_agg.columns:
device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
# Merge with summary
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
# Export the comprehensive summary
summary_df.to_csv(summary_file, index=False)
@ -313,18 +347,13 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
help="The commit message associated with the commit, truncated to 70 characters.",
)
parser.add_argument(
"--csv",
action="store_true",
default=False,
help="Enable CSV output files generation."
)
parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
parser.add_argument(
"--csv-output-dir",
type=str,
default="benchmark_results",
help="Directory for CSV output files (default: benchmark_results)."
help="Directory for CSV output files (default: benchmark_results).",
)
args = parser.parse_args()
@ -356,6 +385,7 @@ def create_database_connection():
try:
import psycopg2
conn = psycopg2.connect("dbname=metrics")
logger.info("Successfully connected to database")
return conn
@ -364,8 +394,9 @@ def create_database_connection():
return None
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
generate_csv: bool = False) -> MetricsRecorder:
def create_global_metrics_recorder(
repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
) -> MetricsRecorder:
"""
Create a global metrics recorder that will be used across all benchmarks.
"""
@ -415,7 +446,7 @@ if __name__ == "__main__":
try:
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
module = import_from_path(entry.name.split(".")[0], entry.path)
if hasattr(module, 'run_benchmark'):
if hasattr(module, "run_benchmark"):
benchmark_modules.append(entry.name)
logger.debug(f"discovered benchmark: {entry.name}")
else:
@ -443,7 +474,9 @@ if __name__ == "__main__":
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
except TypeError:
# Fall back to the old signature for backward compatibility
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
logger.warning(
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
)
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
successful_benchmarks += 1
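
The reflowed __init__ and create_global_metrics_recorder signatures above define the recorder's public interface. A minimal usage sketch based only on what this diff shows, running without a database connection so that only CSV collection is active (the repository, branch and commit values below are placeholders):

import logging

from benchmarks_entrypoint import MetricsRecorder

logger = logging.getLogger("benchmarks")
recorder = MetricsRecorder(
    connection=None,                        # no PostgreSQL connection; use_database stays False
    logger=logger,
    repository="huggingface/transformers",  # placeholder values for illustration
    branch="main",
    commit_id="a5ecd94a3f",
    commit_msg="Enable ruff on benchmark and scripts",
    collect_csv_data=True,                  # keep the pandas DataFrames for CSV export
)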

View File

@ -3,7 +3,11 @@ import subprocess
def main(config_dir, config_name, args):
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)
subprocess.run(
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+ args
)
if __name__ == "__main__":
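
The reformatted subprocess call above only changes layout; behaviour is identical. For reference, a hedged example of calling the wrapper, where the config directory, config name and override key are hypothetical and depend on the local optimum-benchmark configs:

from optimum_benchmark_wrapper import main

# Hypothetical config location/name; the trailing list is forwarded as extra Hydra overrides.
main("benchmark/config", "generation", ["backend.model=openai-community/gpt2"])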

View File

@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from typing import Dict, Any, List
from benchmark_framework import ModelBenchmark
import os
from typing import Any
import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")
class LLaMABenchmark(ModelBenchmark):
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
@ -31,9 +32,7 @@ class LLaMABenchmark(ModelBenchmark):
super().__init__(logger)
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
def get_scenario_configs(self) -> List[Dict[str, Any]]:
def get_scenario_configs(self) -> list[dict[str, Any]]:
"""
Get LLaMA-specific scenario configurations.
@ -43,24 +42,33 @@ class LLaMABenchmark(ModelBenchmark):
return [
# Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
{
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
{
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
]
def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available for LLaMA."""
try:
from kernels import Mode, kernelize
from kernels import Mode, kernelize # noqa: F401
return True
except ImportError:
self.logger.debug("Kernelization not available: kernels module not found")
return False
def get_default_generation_config(self) -> Dict[str, Any]:
def get_default_generation_config(self) -> dict[str, Any]:
"""Get LLaMA-specific generation configuration."""
return {
"do_sample": False,
@ -70,9 +78,8 @@ class LLaMABenchmark(ModelBenchmark):
"max_new_tokens": None, # Will be set per scenario
}
def get_model_init_kwargs(self, config) -> Dict[str, Any]:
def get_model_init_kwargs(self, config) -> dict[str, Any]:
"""Get LLaMA-specific model initialization kwargs."""
from benchmark_framework import BenchmarkConfig
return {
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation,
@ -103,18 +110,20 @@ def run_llama(logger, output_dir, **kwargs):
from benchmark_framework import BenchmarkRunner
# Extract parameters with defaults
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
warmup_iterations = kwargs.get('warmup_iterations', 3)
measurement_iterations = kwargs.get('measurement_iterations', 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
device = kwargs.get('device', 'cuda')
torch_dtype = kwargs.get('torch_dtype', 'float16')
batch_size = kwargs.get('batch_size', 1)
commit_id = kwargs.get('commit_id', None)
model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get("device", "cuda")
torch_dtype = kwargs.get("torch_dtype", "float16")
batch_size = kwargs.get("batch_size", 1)
commit_id = kwargs.get("commit_id")
logger.info(f"Starting LLaMA benchmark for model: {model_id}")
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")
logger.info(
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
)
try:
# Create benchmark instance
@ -129,7 +138,7 @@ def run_llama(logger, output_dir, **kwargs):
include_sdpa_variants=include_sdpa_variants,
device=device,
torch_dtype=torch_dtype,
batch_size=batch_size
batch_size=batch_size,
)
logger.info(f"Created {len(scenarios)} benchmark scenarios")
@ -143,7 +152,7 @@ def run_llama(logger, output_dir, **kwargs):
return None
# Save results
model_name = model_id.split('/')[-1] # Extract model name from ID
model_name = model_id.split("/")[-1] # Extract model name from ID
output_file = runner.save_results(model_name, results)
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
@ -152,5 +161,6 @@ def run_llama(logger, output_dir, **kwargs):
except Exception as e:
logger.error(f"LLaMA benchmark failed: {e}")
import traceback
logger.debug(traceback.format_exc())
raise
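
The kwargs.get defaults rewritten above double as documentation for run_llama's options. A minimal invocation sketch using those same parameter names (the import path and output directory are assumptions based on this benchmark_v2 layout):

import logging

from benches.llama import run_llama  # import path assumed from the benchmark_v2 layout

logger = logging.getLogger("llama_benchmark")
run_llama(
    logger,
    output_dir="benchmark_results",       # placeholder output directory
    model_id="meta-llama/Llama-2-7b-hf",  # same default the function falls back to
    warmup_iterations=3,
    measurement_iterations=5,
    num_tokens_to_generate=100,
)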

View File

@ -14,28 +14,26 @@
import gc
import json
import os
import subprocess
import sys
import time
import statistics
import threading
from abc import ABC, abstractmethod
from contextlib import nullcontext
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
import logging
import os
import statistics
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any, Optional, TypedDict, Union
import gpustat
import numpy as np
import psutil
import gpustat
import torch
class GPUMetrics(TypedDict):
"""GPU monitoring result with GPU metrics."""
gpu_utilization_mean: float
gpu_utilization_max: float
gpu_utilization_min: float
@ -48,6 +46,7 @@ class GPUMetrics(TypedDict):
class NoGPU(TypedDict):
"""GPU monitoring result without GPU metrics."""
gpu_monitoring_status: str
gpu_monitoring_reason: str
@ -134,6 +133,7 @@ class ArchAwareTimer:
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark scenario."""
name: str
model_id: str
variant: str = "eager" # "eager", "compiled", "kernelized"
@ -143,13 +143,13 @@ class BenchmarkConfig:
device: str = "cuda"
torch_dtype: str = "float16"
compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune"
compile_options: Dict[str, Any] = field(default_factory=dict)
compile_options: dict[str, Any] = field(default_factory=dict)
use_cache: bool = True
batch_size: int = 1
sequence_length: Optional[int] = None
attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2"
sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
custom_params: Dict[str, Any] = field(default_factory=dict)
custom_params: dict[str, Any] = field(default_factory=dict)
class BenchmarkScenario:
@ -195,24 +195,24 @@ class BenchmarkScenario:
return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"
@dataclass
class TimingResult:
"""Result from a timing measurement."""
time_to_first_token_seconds: Optional[float] = None
latency_seconds: float = 0.0
tokens_per_second: Optional[float] = None
time_per_output_token_seconds: Optional[float] = None
total_tokens_generated: int = 0
metadata: Dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class BenchmarkStatistics:
"""Statistical analysis of benchmark measurements."""
name: str
measurements: List[float]
measurements: list[float]
mean: float
median: float
std: float
@ -226,7 +226,7 @@ class BenchmarkStatistics:
unit: str = "seconds"
@classmethod
def from_measurements(cls, name: str, measurements: List[float], unit: str = "seconds") -> 'BenchmarkStatistics':
def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
"""Create statistics from a list of measurements."""
if not measurements:
raise ValueError("Cannot create statistics from empty measurements")
@ -246,13 +246,14 @@ class BenchmarkStatistics:
p90=float(np.percentile(measurements_array, 90)),
p95=float(np.percentile(measurements_array, 95)),
p99=float(np.percentile(measurements_array, 99)),
unit=unit
unit=unit,
)
@dataclass
class HardwareInfo:
"""Hardware information collected during benchmarking."""
gpu_name: str
gpu_memory_total_mb: int
cpu_count: int
@ -265,6 +266,7 @@ class HardwareInfo:
@dataclass
class BenchmarkMetadata:
"""Metadata collected for each benchmark run."""
timestamp: str
commit_id: str
hardware_info: HardwareInfo
@ -274,7 +276,7 @@ class BenchmarkMetadata:
class GPUMonitor:
"""Monitor GPU utilization during benchmark execution."""
def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None):
def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
self.sample_interval = sample_interval
self.logger = logger or logging.getLogger(__name__)
self.stop_event = threading.Event()
@ -321,10 +323,7 @@ class GPUMonitor:
def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
"""Stop monitoring and return collected metrics."""
if not self.gpu_available:
return NoGPU(
gpu_monitoring_status="disabled",
gpu_monitoring_reason="no_gpus_available"
)
return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")
# Signal the monitoring thread to stop
self.stop_event.set()
@ -340,15 +339,12 @@ class GPUMonitor:
gpu_memory_used_max=max(self.gpu_memory_used),
gpu_memory_used_min=min(self.gpu_memory_used),
sample_count=len(self.gpu_utilization),
gpu_monitoring_status="success"
gpu_monitoring_status="success",
)
self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
return metrics
else:
return NoGPU(
gpu_monitoring_status="failed",
gpu_monitoring_reason="no_samples_collected"
)
return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")
def _monitor_loop(self):
"""Background monitoring loop using threading.Event for communication."""
@ -400,7 +396,7 @@ def get_hardware_info() -> HardwareInfo:
torch_version = torch.__version__
cuda_version = None
if hasattr(torch, 'cuda') and torch.cuda.is_available():
if hasattr(torch, "cuda") and torch.cuda.is_available():
cuda_version = torch.version.cuda
return HardwareInfo(
@ -410,14 +406,14 @@ def get_hardware_info() -> HardwareInfo:
memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
python_version=f"{sys.version.split()[0]}",
torch_version=torch_version,
cuda_version=cuda_version
cuda_version=cuda_version,
)
def flush_memory():
"""Flush GPU memory and run garbage collection."""
gc.collect()
if hasattr(torch, 'cuda') and torch.cuda.is_available():
if hasattr(torch, "cuda") and torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
@ -442,13 +438,10 @@ def get_sdpa_backend(backend_name: Optional[str]):
return None
class SDPAContext:
"""Context manager for SDPA kernel selection."""
def __init__(self, backend_name: Optional[str], logger: logging.Logger = None):
def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
self.backend_name = backend_name
self.logger = logger or logging.getLogger(__name__)
self.backend = get_sdpa_backend(backend_name) if backend_name else None
@ -466,7 +459,9 @@ class SDPAContext:
self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
self.context = None
elif self.backend_name and self.logger:
self.logger.debug(f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})")
self.logger.debug(
f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
@ -490,7 +485,7 @@ class AbstractModelBenchmark(ABC):
self.scenarios = {} # Map of scenario_name -> BenchmarkScenario
@abstractmethod
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create and return a dictionary of benchmark scenarios."""
pass
@ -518,7 +513,7 @@ class AbstractModelBenchmark(ABC):
"""Prepare inputs for the model. Override if needed."""
return None
def get_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Get benchmark scenarios. Creates them if they don't exist."""
if not self.scenarios:
self.scenarios = self.create_scenarios(**kwargs)
@ -547,9 +542,7 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Default prompt for text generation. Override in subclasses if needed."""
return self._default_prompt
def get_attention_configs(self, include_sdpa_variants: bool = True) -> List[Dict[str, Any]]:
def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
"""
Get attention implementation configurations.
@ -565,15 +558,17 @@ class ModelBenchmark(AbstractModelBenchmark):
# Add SDPA variants if requested
if include_sdpa_variants:
attention_configs.append({
"attn_implementation": "sdpa",
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
"desc_suffix": ""
})
attention_configs.append(
{
"attn_implementation": "sdpa",
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
"desc_suffix": "",
}
)
return attention_configs
def get_scenario_configs(self) -> List[Dict[str, Any]]:
def get_scenario_configs(self) -> list[dict[str, Any]]:
"""
Get base scenario configurations. Override in subclasses to customize.
@ -583,36 +578,38 @@ class ModelBenchmark(AbstractModelBenchmark):
return [
# Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
{
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
{
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
]
def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available. Override in subclasses."""
try:
from kernels import Mode, kernelize
from kernels import Mode, kernelize # noqa: F401
return True
except ImportError:
return False
def get_default_generation_config(self) -> Dict[str, Any]:
def get_default_generation_config(self) -> dict[str, Any]:
"""Get default generation configuration. Override in subclasses for model-specific defaults."""
return {
"do_sample": False,
"top_p": 1.0,
"temperature": 1.0
}
return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}
def get_model_init_kwargs(self, config: BenchmarkConfig) -> Dict[str, Any]:
def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
"""Get model initialization kwargs. Override in subclasses for model-specific parameters."""
return {
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation
}
return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}
def get_default_torch_dtype(self) -> str:
"""Get default torch dtype. Override in subclasses."""
@ -622,19 +619,19 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Get default device. Override in subclasses."""
return "cuda"
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create benchmark scenarios for HuggingFace models."""
scenarios = {}
# Extract parameters with model-specific defaults
model_id = kwargs.get('model_id', 'microsoft/DialoGPT-medium')
warmup_iterations = kwargs.get('warmup_iterations', 3)
measurement_iterations = kwargs.get('measurement_iterations', 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
device = kwargs.get('device', self.get_default_device())
torch_dtype = kwargs.get('torch_dtype', self.get_default_torch_dtype())
batch_size = kwargs.get('batch_size', 1)
model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get("device", self.get_default_device())
torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
batch_size = kwargs.get("batch_size", 1)
# Get configurations
attention_configs = self.get_attention_configs(include_sdpa_variants)
@ -654,7 +651,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Create unique config for this scenario
config = BenchmarkConfig(
name=scenario_config['variant'],
name=scenario_config["variant"],
model_id=model_id,
variant=scenario_config["variant"],
compile_mode=scenario_config["compile_mode"],
@ -666,7 +663,7 @@ class ModelBenchmark(AbstractModelBenchmark):
torch_dtype=torch_dtype,
batch_size=batch_size,
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
)
# Create scenario name
@ -695,11 +692,7 @@ class ModelBenchmark(AbstractModelBenchmark):
description += desc_suffix
# Create scenario
scenario = BenchmarkScenario(
name=scenario_name,
config=config,
description=description
)
scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)
# Add setup callbacks based on variant
if scenario_config["variant"] == "compiled":
@ -718,16 +711,12 @@ class ModelBenchmark(AbstractModelBenchmark):
# Perform torch.compile
if config.compile_mode is not None:
self.compiled_model = torch.compile(
model,
mode=config.compile_mode,
**config.compile_options
)
self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
else:
self.compiled_model = torch.compile(model, **config.compile_options)
# Setup static cache for compiled mode if needed
if config.use_cache and hasattr(self, 'inputs') and self.inputs is not None:
if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
self._setup_static_cache(config)
def _setup_kernelization_callback(self, model, tokenizer, config, logger):
@ -737,10 +726,8 @@ class ModelBenchmark(AbstractModelBenchmark):
try:
from kernels import Mode, kernelize
self.compiled_model = kernelize(
model,
mode=Mode.INFERENCE
)
self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
except Exception as e:
if logger:
logger.warning(f"Failed to setup kernelized mode: {e}")
@ -749,13 +736,14 @@ class ModelBenchmark(AbstractModelBenchmark):
def _setup_static_cache(self, config: BenchmarkConfig):
"""Setup static cache for compiled models. Override if needed."""
if hasattr(self, 'inputs') and self.inputs is not None:
if hasattr(self, "inputs") and self.inputs is not None:
try:
from transformers import StaticCache
seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
cache_device = self.model.device
else:
cache_device = self.device
@ -765,7 +753,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size,
max_cache_len=seq_length + config.num_tokens_to_generate,
device=cache_device,
dtype=getattr(torch, config.torch_dtype)
dtype=getattr(torch, config.torch_dtype),
)
self.logger.debug(f"StaticCache created on device: {cache_device}")
except (ImportError, TypeError) as e:
@ -794,7 +782,6 @@ class ModelBenchmark(AbstractModelBenchmark):
def _load_model_and_tokenizer(self, config: BenchmarkConfig):
"""Load the model and tokenizer. Override in subclasses for custom loading."""
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Load tokenizer
@ -812,14 +799,9 @@ class ModelBenchmark(AbstractModelBenchmark):
target_device = config.device
# Get model initialization kwargs
model_init_kwargs = self.get_model_init_kwargs(config)
model_init_kwargs.update({
"generation_config": gen_config
})
model_init_kwargs.update({"generation_config": gen_config})
self.model = AutoModelForCausalLM.from_pretrained(
config.model_id,
**model_init_kwargs
).eval()
self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()
# Move model to target device
self.logger.info(f"Moving model to device: {target_device}")
@ -832,7 +814,7 @@ class ModelBenchmark(AbstractModelBenchmark):
self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")
# Move inputs to the same device as the model
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
# Model is on a single device
model_device = self.model.device
else:
@ -849,16 +831,16 @@ class ModelBenchmark(AbstractModelBenchmark):
def cleanup_model(self) -> None:
"""Cleanup model resources."""
if hasattr(self, 'model') and self.model is not None:
if hasattr(self, "model") and self.model is not None:
del self.model
self.model = None
if hasattr(self, 'compiled_model') and self.compiled_model is not None:
if hasattr(self, "compiled_model") and self.compiled_model is not None:
del self.compiled_model
self.compiled_model = None
if hasattr(self, 'tokenizer') and self.tokenizer is not None:
if hasattr(self, "tokenizer") and self.tokenizer is not None:
del self.tokenizer
self.tokenizer = None
if hasattr(self, 'past_key_values') and self.past_key_values is not None:
if hasattr(self, "past_key_values") and self.past_key_values is not None:
del self.past_key_values
self.past_key_values = None
@ -877,7 +859,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Use SDPA context if specified
with SDPAContext(config.sdpa_backend, self.logger):
with torch.no_grad():
outputs = model_to_use.generate(**generation_kwargs)
_ = model_to_use.generate(**generation_kwargs)
return timer.elapsed_time()
@ -915,11 +897,11 @@ class ModelBenchmark(AbstractModelBenchmark):
"variant": config.variant,
"compile_mode": config.compile_mode,
"attn_implementation": config.attn_implementation,
"sdpa_backend": config.sdpa_backend
}
"sdpa_backend": config.sdpa_backend,
},
)
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> Dict[str, Any]:
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
"""Get generation kwargs. Override in subclasses for custom generation."""
generation_config_dict = self.get_default_generation_config()
generation_kwargs = {
@ -935,11 +917,12 @@ class ModelBenchmark(AbstractModelBenchmark):
if self.past_key_values is not None and config.variant == "compiled":
try:
from transformers import StaticCache
# Reset cache for each measurement
seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
cache_device = self.model.device
else:
cache_device = self.device
@ -949,7 +932,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size,
max_cache_len=seq_length + max_new_tokens,
device=cache_device,
dtype=getattr(torch, config.torch_dtype)
dtype=getattr(torch, config.torch_dtype),
)
generation_kwargs["past_key_values"] = fresh_cache
except (ImportError, TypeError) as e:
@ -967,14 +950,13 @@ class BenchmarkRunner:
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
def run_benchmark(
self,
benchmark: ModelBenchmark,
scenarios: Dict[str, BenchmarkScenario],
scenarios: dict[str, BenchmarkScenario],
collect_gpu_metrics: bool = True,
commit_id: Optional[str] = None
) -> Dict[str, Dict[str, Any]]:
commit_id: Optional[str] = None,
) -> dict[str, dict[str, Any]]:
"""
Run benchmarks using scenarios.
@ -1021,7 +1003,7 @@ class BenchmarkRunner:
timestamp=datetime.utcnow().isoformat(),
commit_id=commit_id,
hardware_info=get_hardware_info(),
config=config
config=config,
)
# Initialize GPU monitor
@ -1037,11 +1019,13 @@ class BenchmarkRunner:
_ = benchmark.measure_latency(config)
except Exception as e:
warmup_failures += 1
self.logger.warning(f"Warmup iteration {i+1} failed: {e}")
self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")
# If more than half the warmup iterations failed, skip this scenario
if warmup_failures > config.warmup_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})")
self.logger.warning(
f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
)
try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model()
@ -1077,12 +1061,18 @@ class BenchmarkRunner:
if timing_result.time_per_output_token_seconds is not None:
itl_measurements.append(timing_result.time_per_output_token_seconds)
itl_str = f", itl={timing_result.time_per_output_token_seconds:.4f}s/token" if timing_result.time_per_output_token_seconds else ""
self.logger.debug(f"Iteration {i+1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}")
itl_str = (
f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
if timing_result.time_per_output_token_seconds
else ""
)
self.logger.debug(
f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
)
except Exception as e:
measurement_failures += 1
self.logger.warning(f"Measurement iteration {i+1} failed: {e}")
self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")
# Stop GPU monitoring
gpu_metrics = {}
@ -1091,7 +1081,9 @@ class BenchmarkRunner:
# If we don't have enough successful measurements, skip this scenario
if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})")
self.logger.warning(
f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
)
try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model()
@ -1104,7 +1096,7 @@ class BenchmarkRunner:
"metadata": asdict(metadata),
"measurements": {},
"gpu_metrics": gpu_metrics,
"scenario_description": scenario.description
"scenario_description": scenario.description,
}
if latency_measurements:
@ -1112,15 +1104,21 @@ class BenchmarkRunner:
scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)
if ttft_measurements:
ttft_stats = BenchmarkStatistics.from_measurements("time_to_first_token_seconds", ttft_measurements)
ttft_stats = BenchmarkStatistics.from_measurements(
"time_to_first_token_seconds", ttft_measurements
)
scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)
if tokens_per_sec_measurements:
tps_stats = BenchmarkStatistics.from_measurements("tokens_per_second", tokens_per_sec_measurements, "tokens/sec")
tps_stats = BenchmarkStatistics.from_measurements(
"tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
)
scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)
if itl_measurements:
itl_stats = BenchmarkStatistics.from_measurements("time_per_output_token_seconds", itl_measurements, "seconds/token")
itl_stats = BenchmarkStatistics.from_measurements(
"time_per_output_token_seconds", itl_measurements, "seconds/token"
)
scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)
# Log summary
@ -1149,6 +1147,7 @@ class BenchmarkRunner:
except Exception as e:
self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
import traceback
self.logger.debug(traceback.format_exc())
# Try to clean up if possible
@ -1169,7 +1168,7 @@ class BenchmarkRunner:
return all_results
def save_results(self, model_name: str, results: Dict[str, Dict[str, Any]]) -> str:
def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
"""Save benchmark results to JSON file."""
# Create model-specific subdirectory
model_dir = os.path.join(self.output_dir, model_name)
@ -1181,24 +1180,20 @@ class BenchmarkRunner:
filepath = os.path.join(model_dir, filename)
# Prepare output structure
output_data = {
"model_name": model_name,
"benchmark_scenarios": []
}
output_data = {"model_name": model_name, "benchmark_scenarios": []}
for config_name, config_results in results.items():
scenario = {
"scenario_name": config_name,
"metadata": config_results["metadata"],
"measurements": config_results["measurements"],
"gpu_metrics": config_results.get("gpu_metrics", {})
"gpu_metrics": config_results.get("gpu_metrics", {}),
}
output_data["benchmark_scenarios"].append(scenario)
# Save to JSON file
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(output_data, f, indent=2, default=str)
self.logger.info(f"Results saved to {filepath}")
return filepath
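
Most of the changes in this file are mechanical (typing.Dict/List replaced by builtin generics, quote style, call reflowing). As a quick orientation for the statistics helper touched above, a small sketch with made-up latency numbers, using only fields visible in this diff:

from benchmark_framework import BenchmarkStatistics

# Made-up measurements purely for illustration.
latencies = [0.41, 0.39, 0.40, 0.43, 0.38]
stats = BenchmarkStatistics.from_measurements("latency_seconds", latencies)
print(f"{stats.name}: mean={stats.mean:.3f} p95={stats.p95:.3f} {stats.unit}")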

View File

@ -20,38 +20,34 @@ in the ./benches directory, organizing outputs into model-specific subfolders.
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any, Optional
def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
"""Setup logging configuration."""
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError(f'Invalid log level: {log_level}')
raise ValueError(f"Invalid log level: {log_level}")
handlers = [logging.StreamHandler(sys.stdout)]
if enable_file_logging:
handlers.append(
logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
)
handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
logging.basicConfig(
level=numeric_level,
format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
handlers=handlers
level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
)
return logging.getLogger(__name__)
def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
"""
Discover all benchmark modules in the benches directory.
@ -77,20 +73,24 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
spec.loader.exec_module(module)
# Check if it has a benchmark runner function
if hasattr(module, f'run_{module_name}'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, f'run_{module_name}')
})
elif hasattr(module, 'run_benchmark'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, 'run_benchmark')
})
if hasattr(module, f"run_{module_name}"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, f"run_{module_name}"),
}
)
elif hasattr(module, "run_benchmark"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, "run_benchmark"),
}
)
else:
logging.warning(f"No runner function found in {py_file}")
@ -101,10 +101,7 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
def run_single_benchmark(
benchmark_info: Dict[str, Any],
output_dir: str,
logger: logging.Logger,
**kwargs
benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
) -> Optional[str]:
"""
Run a single benchmark and return the output file path.
@ -118,21 +115,19 @@ def run_single_benchmark(
Returns:
Path to the output file if successful, None otherwise
"""
benchmark_name = benchmark_info['name']
runner_func = benchmark_info['runner_function']
benchmark_name = benchmark_info["name"]
runner_func = benchmark_info["runner_function"]
logger.info(f"Running benchmark: {benchmark_name}")
try:
# Check function signature to determine what arguments to pass
import inspect
sig = inspect.signature(runner_func)
# Prepare arguments based on function signature
func_kwargs = {
'logger': logger,
'output_dir': output_dir
}
func_kwargs = {"logger": logger, "output_dir": output_dir}
# Add other kwargs if the function accepts them
for param_name in sig.parameters:
@ -145,8 +140,7 @@ def run_single_benchmark(
if has_var_kwargs:
valid_kwargs = {**func_kwargs, **kwargs}
else:
valid_kwargs = {k: v for k, v in func_kwargs.items()
if k in sig.parameters}
valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
# Run the benchmark
result = runner_func(**valid_kwargs)
@ -161,15 +155,12 @@ def run_single_benchmark(
except Exception as e:
logger.error(f"Benchmark {benchmark_name} failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return None
def generate_summary_report(
output_dir: str,
benchmark_results: Dict[str, Any],
logger: logging.Logger
) -> str:
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
"""Generate a summary report of all benchmark runs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@ -179,13 +170,13 @@ def generate_summary_report(
"timestamp": datetime.utcnow().isoformat(),
"total_benchmarks": len(benchmark_results),
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
},
"benchmark_results": benchmark_results,
"output_directory": output_dir
"output_directory": output_dir,
}
with open(summary_file, 'w') as f:
with open(summary_file, "w") as f:
json.dump(summary_data, f, indent=2, default=str)
logger.info(f"Summary report saved to: {summary_file}")
@ -194,22 +185,20 @@ def generate_summary_report(
def main():
"""Main entry point for the benchmarking script."""
parser = argparse.ArgumentParser(
description="Run all benchmarks in the ./benches directory"
)
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
parser.add_argument(
"--output-dir",
type=str,
default="benchmark_results",
help="Base output directory for benchmark results (default: benchmark_results)"
help="Base output directory for benchmark results (default: benchmark_results)",
)
parser.add_argument(
"--benches-dir",
type=str,
default="./benches",
help="Directory containing benchmark implementations (default: ./benches)"
help="Directory containing benchmark implementations (default: ./benches)",
)
parser.add_argument(
@ -217,66 +206,34 @@ def main():
type=str,
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
default="INFO",
help="Logging level (default: INFO)"
help="Logging level (default: INFO)",
)
parser.add_argument(
"--model-id",
type=str,
help="Specific model ID to benchmark (if supported by benchmarks)"
)
parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
parser.add_argument(
"--warmup-iterations",
type=int,
default=3,
help="Number of warmup iterations (default: 3)"
)
parser.add_argument(
"--measurement-iterations",
type=int,
default=5,
help="Number of measurement iterations (default: 5)"
"--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
)
parser.add_argument(
"--num-tokens-to-generate",
type=int,
default=100,
help="Number of tokens to generate in benchmarks (default: 100)"
help="Number of tokens to generate in benchmarks (default: 100)",
)
parser.add_argument(
"--include",
type=str,
nargs="*",
help="Only run benchmarks matching these names"
)
parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
parser.add_argument(
"--exclude",
type=str,
nargs="*",
help="Exclude benchmarks matching these names"
)
parser.add_argument(
"--enable-mock",
action="store_true",
help="Enable mock benchmark (skipped by default)"
)
parser.add_argument(
"--enable-file-logging",
action="store_true",
help="Enable file logging (disabled by default)"
)
parser.add_argument(
"--commit-id",
type=str,
help="Git commit ID for metadata (if not provided, will auto-detect from git)"
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
args = parser.parse_args()
@ -304,13 +261,15 @@ def main():
filtered_benchmarks = benchmarks
if args.include:
filtered_benchmarks = [b for b in filtered_benchmarks
if any(pattern in b['name'] for pattern in args.include)]
filtered_benchmarks = [
b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
]
logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
if args.exclude:
filtered_benchmarks = [b for b in filtered_benchmarks
if not any(pattern in b['name'] for pattern in args.exclude)]
filtered_benchmarks = [
b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
]
logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
if not filtered_benchmarks:
@ -319,34 +278,29 @@ def main():
# Prepare common kwargs for benchmarks
benchmark_kwargs = {
'warmup_iterations': args.warmup_iterations,
'measurement_iterations': args.measurement_iterations,
'num_tokens_to_generate': args.num_tokens_to_generate
"warmup_iterations": args.warmup_iterations,
"measurement_iterations": args.measurement_iterations,
"num_tokens_to_generate": args.num_tokens_to_generate,
}
if args.model_id:
benchmark_kwargs['model_id'] = args.model_id
benchmark_kwargs["model_id"] = args.model_id
# Add enable_mock flag for mock benchmark
benchmark_kwargs['enable_mock'] = args.enable_mock
benchmark_kwargs["enable_mock"] = args.enable_mock
# Add commit_id if provided
if args.commit_id:
benchmark_kwargs['commit_id'] = args.commit_id
benchmark_kwargs["commit_id"] = args.commit_id
# Run benchmarks
benchmark_results = {}
successful_count = 0
for benchmark_info in filtered_benchmarks:
result = run_single_benchmark(
benchmark_info,
args.output_dir,
logger,
**benchmark_kwargs
)
result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
benchmark_results[benchmark_info['name']] = result
benchmark_results[benchmark_info["name"]] = result
if result is not None:
successful_count += 1
@ -377,6 +331,7 @@ def main():
except Exception as e:
logger.error(f"Benchmark run failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return 1
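
discover_benchmarks above accepts a module if it exposes either run_<module_name> or run_benchmark, and run_single_benchmark calls that function with logger and output_dir plus any accepted kwargs. A minimal sketch of a module that would be picked up by that convention (file name and contents are hypothetical):

# benches/mymodel.py -- hypothetical example; discovered because it defines run_mymodel.
import logging


def run_mymodel(logger: logging.Logger, output_dir: str, **kwargs):
    """Run the benchmark and return the path of the results file, or None on failure."""
    logger.info("running mymodel benchmark into %s", output_dir)
    # ... perform measurements and write results under output_dir ...
    return None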

View File

@ -4,8 +4,8 @@ import datasets
import transformers
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
from transformers.utils import logging
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import logging
logging.set_verbosity_info()
@ -22,7 +22,9 @@ imperfect = 0
wrong = 0
def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_diff(
spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
return False
def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_details(
line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
]
for j in possible_matches:
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
) and check_details(
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
if skip_assert:
return
assert (
slow_ids == fast_ids
), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
assert slow_ids == fast_ids, (
f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
)
def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:

View File

@ -15,6 +15,7 @@
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
import os
from datetime import datetime as dt
@ -39,10 +40,11 @@ def main():
for i, issue in enumerate(open_issues):
print(i, issue)
comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
last_comment = comments[0] if len(comments) > 0 else None
if (
last_comment is not None and last_comment.user.login == "github-actions[bot]"
last_comment is not None
and last_comment.user.login == "github-actions[bot]"
and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())