Enable ruff on benchmark and scripts (#40634)

* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
Yuanyuan Chen
2025-09-10 17:38:06 +08:00
committed by GitHub
parent 08edec9f7d
commit a5ecd94a3f
11 changed files with 661 additions and 649 deletions

View File

@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src export PYTHONPATH = src
check_dirs := examples tests src utils check_dirs := examples tests src utils scripts benchmark benchmark_v2
exclude_folders := "" exclude_folders := ""

View File

@ -11,25 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from logging import Logger
import os import os
import sys
from logging import Logger
from threading import Event, Thread from threading import Event, Thread
from time import perf_counter, sleep from time import perf_counter, sleep
from typing import Optional from typing import Optional
import sys
# Add the parent directory to Python path to import benchmarks_entrypoint # Add the parent directory to Python path to import benchmarks_entrypoint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks_entrypoint import MetricsRecorder
import gpustat import gpustat
import psutil import psutil
import psycopg2 import psycopg2
from benchmarks_entrypoint import MetricsRecorder
# Optional heavy ML dependencies - only required when actually running the benchmark # Optional heavy ML dependencies - only required when actually running the benchmark
try: try:
import torch import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
TRANSFORMERS_AVAILABLE = True TRANSFORMERS_AVAILABLE = True
except ImportError: except ImportError:
TRANSFORMERS_AVAILABLE = False TRANSFORMERS_AVAILABLE = False
@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
def run_benchmark( def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100 logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
): ):
# Check if required ML dependencies are available # Check if required ML dependencies are available
if not TRANSFORMERS_AVAILABLE: if not TRANSFORMERS_AVAILABLE:
@ -154,7 +163,7 @@ def run_benchmark(
# First eager forward pass # First eager forward pass
logger.info("running first eager forward pass") logger.info("running first eager forward pass")
start = perf_counter() start = perf_counter()
outputs = model(**inputs) _ = model(**inputs)
torch.cuda.synchronize() torch.cuda.synchronize()
end = perf_counter() end = perf_counter()
first_eager_fwd_pass_time = end - start first_eager_fwd_pass_time = end - start
@ -163,7 +172,7 @@ def run_benchmark(
# Second eager forward pass (should be faster) # Second eager forward pass (should be faster)
logger.info("running second eager forward pass") logger.info("running second eager forward pass")
start = perf_counter() start = perf_counter()
outputs = model(**inputs) _ = model(**inputs)
torch.cuda.synchronize() torch.cuda.synchronize()
end = perf_counter() end = perf_counter()
second_eager_fwd_pass_time = end - start second_eager_fwd_pass_time = end - start

View File

@ -31,9 +31,7 @@ from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from git import Repo from git import Repo
from huggingface_hub import HfApi from huggingface_hub import HfApi
from optimum_benchmark import Benchmark from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main from optimum_benchmark_wrapper import main

View File

@ -13,19 +13,20 @@
# limitations under the License. # limitations under the License.
import argparse import argparse
import importlib.util import importlib.util
import json
import logging import logging
import os import os
import sys import sys
import json
import uuid import uuid
from datetime import datetime from datetime import datetime
from typing import Dict, Tuple, Optional, List
import pandas as pd import pandas as pd
try: try:
from psycopg2.extensions import register_adapter from psycopg2.extensions import register_adapter
from psycopg2.extras import Json from psycopg2.extras import Json
register_adapter(dict, Json) register_adapter(dict, Json)
PSYCOPG2_AVAILABLE = True PSYCOPG2_AVAILABLE = True
except ImportError: except ImportError:
@ -38,8 +39,14 @@ class ImportModuleException(Exception):
class MetricsRecorder: class MetricsRecorder:
def __init__( def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str, self,
collect_csv_data: bool = True connection,
logger: logging.Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
collect_csv_data: bool = True,
): ):
self.conn = connection self.conn = connection
self.use_database = connection is not None self.use_database = connection is not None
@ -55,23 +62,39 @@ class MetricsRecorder:
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled) # For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Initialize empty DataFrames with proper schemas # Initialize empty DataFrames with proper schemas
self.benchmarks_df = pd.DataFrame(columns=[ self.benchmarks_df = pd.DataFrame(
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message', columns=[
'metadata', 'created_at' "benchmark_id",
]) "repository",
self.device_measurements_df = pd.DataFrame(columns=[ "branch",
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util', "commit_id",
'gpu_mem_megabytes', 'time' "commit_message",
]) "metadata",
self.model_measurements_df = pd.DataFrame(columns=[ "created_at",
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs', ]
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs', )
'second_eager_generate_time_secs', 'time_to_first_token_secs', self.device_measurements_df = pd.DataFrame(
'time_to_second_token_secs', 'time_to_third_token_secs', columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs', )
'second_compile_generate_time_secs', 'third_compile_generate_time_secs', self.model_measurements_df = pd.DataFrame(
'fourth_compile_generate_time_secs' columns=[
]) "benchmark_id",
"time",
"model_load_time",
"first_eager_forward_pass_time_secs",
"second_eager_forward_pass_time_secs",
"first_eager_generate_time_secs",
"second_eager_generate_time_secs",
"time_to_first_token_secs",
"time_to_second_token_secs",
"time_to_third_token_secs",
"time_to_next_token_mean_secs",
"first_compile_generate_time_secs",
"second_compile_generate_time_secs",
"third_compile_generate_time_secs",
"fourth_compile_generate_time_secs",
]
)
else: else:
self.benchmarks_df = None self.benchmarks_df = None
self.device_measurements_df = None self.device_measurements_df = None
@ -95,15 +118,19 @@ class MetricsRecorder:
# Store benchmark data for CSV export (if enabled) # Store benchmark data for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame # Add row to pandas DataFrame
new_row = pd.DataFrame([{ new_row = pd.DataFrame(
'benchmark_id': benchmark_id, [
'repository': self.repository, {
'branch': self.branch, "benchmark_id": benchmark_id,
'commit_id': self.commit_id, "repository": self.repository,
'commit_message': self.commit_msg, "branch": self.branch,
'metadata': json.dumps(metadata), "commit_id": self.commit_id,
'created_at': datetime.utcnow().isoformat() "commit_message": self.commit_msg,
}]) "metadata": json.dumps(metadata),
"created_at": datetime.utcnow().isoformat(),
}
]
)
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True) self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
mode_info = [] mode_info = []
@ -123,14 +150,18 @@ class MetricsRecorder:
# Store device measurements for CSV export (if enabled) # Store device measurements for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame # Add row to pandas DataFrame
new_row = pd.DataFrame([{ new_row = pd.DataFrame(
'benchmark_id': benchmark_id, [
'cpu_util': cpu_util, {
'mem_megabytes': mem_megabytes, "benchmark_id": benchmark_id,
'gpu_util': gpu_util, "cpu_util": cpu_util,
'gpu_mem_megabytes': gpu_mem_megabytes, "mem_megabytes": mem_megabytes,
'time': datetime.utcnow().isoformat() "gpu_util": gpu_util,
}]) "gpu_mem_megabytes": gpu_mem_megabytes,
"time": datetime.utcnow().isoformat(),
}
]
)
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True) self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
# Store in database if available # Store in database if available
@ -149,10 +180,7 @@ class MetricsRecorder:
# Store model measurements for CSV export (if enabled) # Store model measurements for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame with flattened measurements # Add row to pandas DataFrame with flattened measurements
row_data = { row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
'benchmark_id': benchmark_id,
'time': datetime.utcnow().isoformat()
}
# Flatten the measurements dict into the row # Flatten the measurements dict into the row
row_data.update(measurements) row_data.update(measurements)
@ -241,28 +269,34 @@ class MetricsRecorder:
# Add model measurements (join on benchmark_id) # Add model measurements (join on benchmark_id)
if len(self.model_measurements_df) > 0: if len(self.model_measurements_df) > 0:
# Drop 'time' column from model measurements to avoid conflicts # Drop 'time' column from model measurements to avoid conflicts
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore') model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left') summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
# Calculate device measurement aggregates using pandas groupby # Calculate device measurement aggregates using pandas groupby
if len(self.device_measurements_df) > 0: if len(self.device_measurements_df) > 0:
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({ device_agg = (
'cpu_util': ['mean', 'max', 'std', 'count'], self.device_measurements_df.groupby("benchmark_id")
'mem_megabytes': ['mean', 'max', 'std'], .agg(
'gpu_util': ['mean', 'max', 'std'], {
'gpu_mem_megabytes': ['mean', 'max', 'std'] "cpu_util": ["mean", "max", "std", "count"],
}).round(3) "mem_megabytes": ["mean", "max", "std"],
"gpu_util": ["mean", "max", "std"],
"gpu_mem_megabytes": ["mean", "max", "std"],
}
)
.round(3)
)
# Flatten column names # Flatten column names
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns] device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
device_agg = device_agg.reset_index() device_agg = device_agg.reset_index()
# Rename count column to be more descriptive # Rename count column to be more descriptive
if 'cpu_util_count' in device_agg.columns: if "cpu_util_count" in device_agg.columns:
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'}) device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
# Merge with summary # Merge with summary
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left') summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
# Export the comprehensive summary # Export the comprehensive summary
summary_df.to_csv(summary_file, index=False) summary_df.to_csv(summary_file, index=False)
@ -313,18 +347,13 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
help="The commit message associated with the commit, truncated to 70 characters.", help="The commit message associated with the commit, truncated to 70 characters.",
) )
parser.add_argument( parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
"--csv",
action="store_true",
default=False,
help="Enable CSV output files generation."
)
parser.add_argument( parser.add_argument(
"--csv-output-dir", "--csv-output-dir",
type=str, type=str,
default="benchmark_results", default="benchmark_results",
help="Directory for CSV output files (default: benchmark_results)." help="Directory for CSV output files (default: benchmark_results).",
) )
args = parser.parse_args() args = parser.parse_args()
@ -356,6 +385,7 @@ def create_database_connection():
try: try:
import psycopg2 import psycopg2
conn = psycopg2.connect("dbname=metrics") conn = psycopg2.connect("dbname=metrics")
logger.info("Successfully connected to database") logger.info("Successfully connected to database")
return conn return conn
@ -364,8 +394,9 @@ def create_database_connection():
return None return None
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str, def create_global_metrics_recorder(
generate_csv: bool = False) -> MetricsRecorder: repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
) -> MetricsRecorder:
""" """
Create a global metrics recorder that will be used across all benchmarks. Create a global metrics recorder that will be used across all benchmarks.
""" """
@ -415,7 +446,7 @@ if __name__ == "__main__":
try: try:
logger.debug(f"checking if benches/{entry.name} has run_benchmark function") logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
module = import_from_path(entry.name.split(".")[0], entry.path) module = import_from_path(entry.name.split(".")[0], entry.path)
if hasattr(module, 'run_benchmark'): if hasattr(module, "run_benchmark"):
benchmark_modules.append(entry.name) benchmark_modules.append(entry.name)
logger.debug(f"discovered benchmark: {entry.name}") logger.debug(f"discovered benchmark: {entry.name}")
else: else:
@ -443,7 +474,9 @@ if __name__ == "__main__":
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder) module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
except TypeError: except TypeError:
# Fall back to the old signature for backward compatibility # Fall back to the old signature for backward compatibility
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module") logger.warning(
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
)
module.run_benchmark(logger, repository, branch, commit_id, commit_msg) module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
successful_benchmarks += 1 successful_benchmarks += 1

View File

@ -3,7 +3,11 @@ import subprocess
def main(config_dir, config_name, args): def main(config_dir, config_name, args):
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args) subprocess.run(
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+ args
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import logging import logging
from typing import Dict, Any, List import os
from typing import Any
from benchmark_framework import ModelBenchmark
import torch import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1" os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high") torch.set_float32_matmul_precision("high")
class LLaMABenchmark(ModelBenchmark): class LLaMABenchmark(ModelBenchmark):
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class.""" """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
@ -31,9 +32,7 @@ class LLaMABenchmark(ModelBenchmark):
super().__init__(logger) super().__init__(logger)
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
def get_scenario_configs(self) -> list[dict[str, Any]]:
def get_scenario_configs(self) -> List[Dict[str, Any]]:
""" """
Get LLaMA-specific scenario configurations. Get LLaMA-specific scenario configurations.
@ -43,24 +42,33 @@ class LLaMABenchmark(ModelBenchmark):
return [ return [
# Eager variants # Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"}, {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants # Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"}, {
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available) # Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"}, {
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
] ]
def _is_kernelization_available(self) -> bool: def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available for LLaMA.""" """Check if kernelization is available for LLaMA."""
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize # noqa: F401
return True return True
except ImportError: except ImportError:
self.logger.debug("Kernelization not available: kernels module not found") self.logger.debug("Kernelization not available: kernels module not found")
return False return False
def get_default_generation_config(self) -> Dict[str, Any]: def get_default_generation_config(self) -> dict[str, Any]:
"""Get LLaMA-specific generation configuration.""" """Get LLaMA-specific generation configuration."""
return { return {
"do_sample": False, "do_sample": False,
@ -70,9 +78,8 @@ class LLaMABenchmark(ModelBenchmark):
"max_new_tokens": None, # Will be set per scenario "max_new_tokens": None, # Will be set per scenario
} }
def get_model_init_kwargs(self, config) -> Dict[str, Any]: def get_model_init_kwargs(self, config) -> dict[str, Any]:
"""Get LLaMA-specific model initialization kwargs.""" """Get LLaMA-specific model initialization kwargs."""
from benchmark_framework import BenchmarkConfig
return { return {
"torch_dtype": getattr(torch, config.torch_dtype), "torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation, "attn_implementation": config.attn_implementation,
@ -103,18 +110,20 @@ def run_llama(logger, output_dir, **kwargs):
from benchmark_framework import BenchmarkRunner from benchmark_framework import BenchmarkRunner
# Extract parameters with defaults # Extract parameters with defaults
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf') model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
warmup_iterations = kwargs.get('warmup_iterations', 3) warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get('measurement_iterations', 5) measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100) num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True) include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get('device', 'cuda') device = kwargs.get("device", "cuda")
torch_dtype = kwargs.get('torch_dtype', 'float16') torch_dtype = kwargs.get("torch_dtype", "float16")
batch_size = kwargs.get('batch_size', 1) batch_size = kwargs.get("batch_size", 1)
commit_id = kwargs.get('commit_id', None) commit_id = kwargs.get("commit_id")
logger.info(f"Starting LLaMA benchmark for model: {model_id}") logger.info(f"Starting LLaMA benchmark for model: {model_id}")
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}") logger.info(
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
)
try: try:
# Create benchmark instance # Create benchmark instance
@ -129,7 +138,7 @@ def run_llama(logger, output_dir, **kwargs):
include_sdpa_variants=include_sdpa_variants, include_sdpa_variants=include_sdpa_variants,
device=device, device=device,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
batch_size=batch_size batch_size=batch_size,
) )
logger.info(f"Created {len(scenarios)} benchmark scenarios") logger.info(f"Created {len(scenarios)} benchmark scenarios")
@ -143,7 +152,7 @@ def run_llama(logger, output_dir, **kwargs):
return None return None
# Save results # Save results
model_name = model_id.split('/')[-1] # Extract model name from ID model_name = model_id.split("/")[-1] # Extract model name from ID
output_file = runner.save_results(model_name, results) output_file = runner.save_results(model_name, results)
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}") logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
@ -152,5 +161,6 @@ def run_llama(logger, output_dir, **kwargs):
except Exception as e: except Exception as e:
logger.error(f"LLaMA benchmark failed: {e}") logger.error(f"LLaMA benchmark failed: {e}")
import traceback import traceback
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
raise raise

View File

@ -14,28 +14,26 @@
import gc import gc
import json import json
import os
import subprocess
import sys
import time
import statistics
import threading
from abc import ABC, abstractmethod
from contextlib import nullcontext
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
import logging import logging
import os
import statistics
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any, Optional, TypedDict, Union
import gpustat
import numpy as np import numpy as np
import psutil import psutil
import gpustat
import torch import torch
class GPUMetrics(TypedDict): class GPUMetrics(TypedDict):
"""GPU monitoring result with GPU metrics.""" """GPU monitoring result with GPU metrics."""
gpu_utilization_mean: float gpu_utilization_mean: float
gpu_utilization_max: float gpu_utilization_max: float
gpu_utilization_min: float gpu_utilization_min: float
@ -48,6 +46,7 @@ class GPUMetrics(TypedDict):
class NoGPU(TypedDict): class NoGPU(TypedDict):
"""GPU monitoring result without GPU metrics.""" """GPU monitoring result without GPU metrics."""
gpu_monitoring_status: str gpu_monitoring_status: str
gpu_monitoring_reason: str gpu_monitoring_reason: str
@ -134,6 +133,7 @@ class ArchAwareTimer:
@dataclass @dataclass
class BenchmarkConfig: class BenchmarkConfig:
"""Configuration for a single benchmark scenario.""" """Configuration for a single benchmark scenario."""
name: str name: str
model_id: str model_id: str
variant: str = "eager" # "eager", "compiled", "kernelized" variant: str = "eager" # "eager", "compiled", "kernelized"
@ -143,13 +143,13 @@ class BenchmarkConfig:
device: str = "cuda" device: str = "cuda"
torch_dtype: str = "float16" torch_dtype: str = "float16"
compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune" compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune"
compile_options: Dict[str, Any] = field(default_factory=dict) compile_options: dict[str, Any] = field(default_factory=dict)
use_cache: bool = True use_cache: bool = True
batch_size: int = 1 batch_size: int = 1
sequence_length: Optional[int] = None sequence_length: Optional[int] = None
attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2" attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2"
sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention" sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
custom_params: Dict[str, Any] = field(default_factory=dict) custom_params: dict[str, Any] = field(default_factory=dict)
class BenchmarkScenario: class BenchmarkScenario:
@ -195,24 +195,24 @@ class BenchmarkScenario:
return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')" return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"
@dataclass @dataclass
class TimingResult: class TimingResult:
"""Result from a timing measurement.""" """Result from a timing measurement."""
time_to_first_token_seconds: Optional[float] = None time_to_first_token_seconds: Optional[float] = None
latency_seconds: float = 0.0 latency_seconds: float = 0.0
tokens_per_second: Optional[float] = None tokens_per_second: Optional[float] = None
time_per_output_token_seconds: Optional[float] = None time_per_output_token_seconds: Optional[float] = None
total_tokens_generated: int = 0 total_tokens_generated: int = 0
metadata: Dict[str, Any] = field(default_factory=dict) metadata: dict[str, Any] = field(default_factory=dict)
@dataclass @dataclass
class BenchmarkStatistics: class BenchmarkStatistics:
"""Statistical analysis of benchmark measurements.""" """Statistical analysis of benchmark measurements."""
name: str name: str
measurements: List[float] measurements: list[float]
mean: float mean: float
median: float median: float
std: float std: float
@ -226,7 +226,7 @@ class BenchmarkStatistics:
unit: str = "seconds" unit: str = "seconds"
@classmethod @classmethod
def from_measurements(cls, name: str, measurements: List[float], unit: str = "seconds") -> 'BenchmarkStatistics': def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
"""Create statistics from a list of measurements.""" """Create statistics from a list of measurements."""
if not measurements: if not measurements:
raise ValueError("Cannot create statistics from empty measurements") raise ValueError("Cannot create statistics from empty measurements")
@ -246,13 +246,14 @@ class BenchmarkStatistics:
p90=float(np.percentile(measurements_array, 90)), p90=float(np.percentile(measurements_array, 90)),
p95=float(np.percentile(measurements_array, 95)), p95=float(np.percentile(measurements_array, 95)),
p99=float(np.percentile(measurements_array, 99)), p99=float(np.percentile(measurements_array, 99)),
unit=unit unit=unit,
) )
@dataclass @dataclass
class HardwareInfo: class HardwareInfo:
"""Hardware information collected during benchmarking.""" """Hardware information collected during benchmarking."""
gpu_name: str gpu_name: str
gpu_memory_total_mb: int gpu_memory_total_mb: int
cpu_count: int cpu_count: int
@ -265,6 +266,7 @@ class HardwareInfo:
@dataclass @dataclass
class BenchmarkMetadata: class BenchmarkMetadata:
"""Metadata collected for each benchmark run.""" """Metadata collected for each benchmark run."""
timestamp: str timestamp: str
commit_id: str commit_id: str
hardware_info: HardwareInfo hardware_info: HardwareInfo
@ -274,7 +276,7 @@ class BenchmarkMetadata:
class GPUMonitor: class GPUMonitor:
"""Monitor GPU utilization during benchmark execution.""" """Monitor GPU utilization during benchmark execution."""
def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None): def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
self.sample_interval = sample_interval self.sample_interval = sample_interval
self.logger = logger or logging.getLogger(__name__) self.logger = logger or logging.getLogger(__name__)
self.stop_event = threading.Event() self.stop_event = threading.Event()
@ -321,10 +323,7 @@ class GPUMonitor:
def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]: def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
"""Stop monitoring and return collected metrics.""" """Stop monitoring and return collected metrics."""
if not self.gpu_available: if not self.gpu_available:
return NoGPU( return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")
gpu_monitoring_status="disabled",
gpu_monitoring_reason="no_gpus_available"
)
# Signal the monitoring thread to stop # Signal the monitoring thread to stop
self.stop_event.set() self.stop_event.set()
@ -340,15 +339,12 @@ class GPUMonitor:
gpu_memory_used_max=max(self.gpu_memory_used), gpu_memory_used_max=max(self.gpu_memory_used),
gpu_memory_used_min=min(self.gpu_memory_used), gpu_memory_used_min=min(self.gpu_memory_used),
sample_count=len(self.gpu_utilization), sample_count=len(self.gpu_utilization),
gpu_monitoring_status="success" gpu_monitoring_status="success",
) )
self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected") self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
return metrics return metrics
else: else:
return NoGPU( return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")
gpu_monitoring_status="failed",
gpu_monitoring_reason="no_samples_collected"
)
def _monitor_loop(self): def _monitor_loop(self):
"""Background monitoring loop using threading.Event for communication.""" """Background monitoring loop using threading.Event for communication."""
@ -400,7 +396,7 @@ def get_hardware_info() -> HardwareInfo:
torch_version = torch.__version__ torch_version = torch.__version__
cuda_version = None cuda_version = None
if hasattr(torch, 'cuda') and torch.cuda.is_available(): if hasattr(torch, "cuda") and torch.cuda.is_available():
cuda_version = torch.version.cuda cuda_version = torch.version.cuda
return HardwareInfo( return HardwareInfo(
@ -410,14 +406,14 @@ def get_hardware_info() -> HardwareInfo:
memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)), memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
python_version=f"{sys.version.split()[0]}", python_version=f"{sys.version.split()[0]}",
torch_version=torch_version, torch_version=torch_version,
cuda_version=cuda_version cuda_version=cuda_version,
) )
def flush_memory(): def flush_memory():
"""Flush GPU memory and run garbage collection.""" """Flush GPU memory and run garbage collection."""
gc.collect() gc.collect()
if hasattr(torch, 'cuda') and torch.cuda.is_available(): if hasattr(torch, "cuda") and torch.cuda.is_available():
torch.cuda.empty_cache() torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated() torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats() torch.cuda.reset_peak_memory_stats()
@ -442,13 +438,10 @@ def get_sdpa_backend(backend_name: Optional[str]):
return None return None
class SDPAContext: class SDPAContext:
"""Context manager for SDPA kernel selection.""" """Context manager for SDPA kernel selection."""
def __init__(self, backend_name: Optional[str], logger: logging.Logger = None): def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
self.backend_name = backend_name self.backend_name = backend_name
self.logger = logger or logging.getLogger(__name__) self.logger = logger or logging.getLogger(__name__)
self.backend = get_sdpa_backend(backend_name) if backend_name else None self.backend = get_sdpa_backend(backend_name) if backend_name else None
@ -466,7 +459,9 @@ class SDPAContext:
self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}") self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
self.context = None self.context = None
elif self.backend_name and self.logger: elif self.backend_name and self.logger:
self.logger.debug(f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})") self.logger.debug(
f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
)
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
@ -490,7 +485,7 @@ class AbstractModelBenchmark(ABC):
self.scenarios = {} # Map of scenario_name -> BenchmarkScenario self.scenarios = {} # Map of scenario_name -> BenchmarkScenario
@abstractmethod @abstractmethod
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create and return a dictionary of benchmark scenarios.""" """Create and return a dictionary of benchmark scenarios."""
pass pass
@ -518,7 +513,7 @@ class AbstractModelBenchmark(ABC):
"""Prepare inputs for the model. Override if needed.""" """Prepare inputs for the model. Override if needed."""
return None return None
def get_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Get benchmark scenarios. Creates them if they don't exist.""" """Get benchmark scenarios. Creates them if they don't exist."""
if not self.scenarios: if not self.scenarios:
self.scenarios = self.create_scenarios(**kwargs) self.scenarios = self.create_scenarios(**kwargs)
@ -547,9 +542,7 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Default prompt for text generation. Override in subclasses if needed.""" """Default prompt for text generation. Override in subclasses if needed."""
return self._default_prompt return self._default_prompt
def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
def get_attention_configs(self, include_sdpa_variants: bool = True) -> List[Dict[str, Any]]:
""" """
Get attention implementation configurations. Get attention implementation configurations.
@ -565,15 +558,17 @@ class ModelBenchmark(AbstractModelBenchmark):
# Add SDPA variants if requested # Add SDPA variants if requested
if include_sdpa_variants: if include_sdpa_variants:
attention_configs.append({ attention_configs.append(
"attn_implementation": "sdpa", {
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"], "attn_implementation": "sdpa",
"desc_suffix": "" "sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
}) "desc_suffix": "",
}
)
return attention_configs return attention_configs
def get_scenario_configs(self) -> List[Dict[str, Any]]: def get_scenario_configs(self) -> list[dict[str, Any]]:
""" """
Get base scenario configurations. Override in subclasses to customize. Get base scenario configurations. Override in subclasses to customize.
@ -583,36 +578,38 @@ class ModelBenchmark(AbstractModelBenchmark):
return [ return [
# Eager variants # Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"}, {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants # Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"}, {
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available) # Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"}, {
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
] ]
def _is_kernelization_available(self) -> bool: def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available. Override in subclasses.""" """Check if kernelization is available. Override in subclasses."""
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize # noqa: F401
return True return True
except ImportError: except ImportError:
return False return False
def get_default_generation_config(self) -> Dict[str, Any]: def get_default_generation_config(self) -> dict[str, Any]:
"""Get default generation configuration. Override in subclasses for model-specific defaults.""" """Get default generation configuration. Override in subclasses for model-specific defaults."""
return { return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}
"do_sample": False,
"top_p": 1.0,
"temperature": 1.0
}
def get_model_init_kwargs(self, config: BenchmarkConfig) -> Dict[str, Any]: def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
"""Get model initialization kwargs. Override in subclasses for model-specific parameters.""" """Get model initialization kwargs. Override in subclasses for model-specific parameters."""
return { return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation
}
def get_default_torch_dtype(self) -> str: def get_default_torch_dtype(self) -> str:
"""Get default torch dtype. Override in subclasses.""" """Get default torch dtype. Override in subclasses."""
@ -622,19 +619,19 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Get default device. Override in subclasses.""" """Get default device. Override in subclasses."""
return "cuda" return "cuda"
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create benchmark scenarios for HuggingFace models.""" """Create benchmark scenarios for HuggingFace models."""
scenarios = {} scenarios = {}
# Extract parameters with model-specific defaults # Extract parameters with model-specific defaults
model_id = kwargs.get('model_id', 'microsoft/DialoGPT-medium') model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
warmup_iterations = kwargs.get('warmup_iterations', 3) warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get('measurement_iterations', 5) measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100) num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True) include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get('device', self.get_default_device()) device = kwargs.get("device", self.get_default_device())
torch_dtype = kwargs.get('torch_dtype', self.get_default_torch_dtype()) torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
batch_size = kwargs.get('batch_size', 1) batch_size = kwargs.get("batch_size", 1)
# Get configurations # Get configurations
attention_configs = self.get_attention_configs(include_sdpa_variants) attention_configs = self.get_attention_configs(include_sdpa_variants)
@ -654,7 +651,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Create unique config for this scenario # Create unique config for this scenario
config = BenchmarkConfig( config = BenchmarkConfig(
name=scenario_config['variant'], name=scenario_config["variant"],
model_id=model_id, model_id=model_id,
variant=scenario_config["variant"], variant=scenario_config["variant"],
compile_mode=scenario_config["compile_mode"], compile_mode=scenario_config["compile_mode"],
@ -666,7 +663,7 @@ class ModelBenchmark(AbstractModelBenchmark):
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
batch_size=batch_size, batch_size=batch_size,
attn_implementation=attn_implementation, attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
) )
# Create scenario name # Create scenario name
@ -695,11 +692,7 @@ class ModelBenchmark(AbstractModelBenchmark):
description += desc_suffix description += desc_suffix
# Create scenario # Create scenario
scenario = BenchmarkScenario( scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)
name=scenario_name,
config=config,
description=description
)
# Add setup callbacks based on variant # Add setup callbacks based on variant
if scenario_config["variant"] == "compiled": if scenario_config["variant"] == "compiled":
@ -718,16 +711,12 @@ class ModelBenchmark(AbstractModelBenchmark):
# Perform torch.compile # Perform torch.compile
if config.compile_mode is not None: if config.compile_mode is not None:
self.compiled_model = torch.compile( self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
model,
mode=config.compile_mode,
**config.compile_options
)
else: else:
self.compiled_model = torch.compile(model, **config.compile_options) self.compiled_model = torch.compile(model, **config.compile_options)
# Setup static cache for compiled mode if needed # Setup static cache for compiled mode if needed
if config.use_cache and hasattr(self, 'inputs') and self.inputs is not None: if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
self._setup_static_cache(config) self._setup_static_cache(config)
def _setup_kernelization_callback(self, model, tokenizer, config, logger): def _setup_kernelization_callback(self, model, tokenizer, config, logger):
@ -737,10 +726,8 @@ class ModelBenchmark(AbstractModelBenchmark):
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize
self.compiled_model = kernelize(
model, self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
mode=Mode.INFERENCE
)
except Exception as e: except Exception as e:
if logger: if logger:
logger.warning(f"Failed to setup kernelized mode: {e}") logger.warning(f"Failed to setup kernelized mode: {e}")
@ -749,13 +736,14 @@ class ModelBenchmark(AbstractModelBenchmark):
def _setup_static_cache(self, config: BenchmarkConfig): def _setup_static_cache(self, config: BenchmarkConfig):
"""Setup static cache for compiled models. Override if needed.""" """Setup static cache for compiled models. Override if needed."""
if hasattr(self, 'inputs') and self.inputs is not None: if hasattr(self, "inputs") and self.inputs is not None:
try: try:
from transformers import StaticCache from transformers import StaticCache
seq_length = self.inputs["input_ids"].shape[1] seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on # Get the actual device the model is on
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
cache_device = self.model.device cache_device = self.model.device
else: else:
cache_device = self.device cache_device = self.device
@ -765,7 +753,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size, max_batch_size=config.batch_size,
max_cache_len=seq_length + config.num_tokens_to_generate, max_cache_len=seq_length + config.num_tokens_to_generate,
device=cache_device, device=cache_device,
dtype=getattr(torch, config.torch_dtype) dtype=getattr(torch, config.torch_dtype),
) )
self.logger.debug(f"StaticCache created on device: {cache_device}") self.logger.debug(f"StaticCache created on device: {cache_device}")
except (ImportError, TypeError) as e: except (ImportError, TypeError) as e:
@ -794,7 +782,6 @@ class ModelBenchmark(AbstractModelBenchmark):
def _load_model_and_tokenizer(self, config: BenchmarkConfig): def _load_model_and_tokenizer(self, config: BenchmarkConfig):
"""Load the model and tokenizer. Override in subclasses for custom loading.""" """Load the model and tokenizer. Override in subclasses for custom loading."""
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Load tokenizer # Load tokenizer
@ -812,14 +799,9 @@ class ModelBenchmark(AbstractModelBenchmark):
target_device = config.device target_device = config.device
# Get model initialization kwargs # Get model initialization kwargs
model_init_kwargs = self.get_model_init_kwargs(config) model_init_kwargs = self.get_model_init_kwargs(config)
model_init_kwargs.update({ model_init_kwargs.update({"generation_config": gen_config})
"generation_config": gen_config
})
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()
config.model_id,
**model_init_kwargs
).eval()
# Move model to target device # Move model to target device
self.logger.info(f"Moving model to device: {target_device}") self.logger.info(f"Moving model to device: {target_device}")
@ -832,7 +814,7 @@ class ModelBenchmark(AbstractModelBenchmark):
self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt") self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")
# Move inputs to the same device as the model # Move inputs to the same device as the model
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
# Model is on a single device # Model is on a single device
model_device = self.model.device model_device = self.model.device
else: else:
@ -849,16 +831,16 @@ class ModelBenchmark(AbstractModelBenchmark):
def cleanup_model(self) -> None: def cleanup_model(self) -> None:
"""Cleanup model resources.""" """Cleanup model resources."""
if hasattr(self, 'model') and self.model is not None: if hasattr(self, "model") and self.model is not None:
del self.model del self.model
self.model = None self.model = None
if hasattr(self, 'compiled_model') and self.compiled_model is not None: if hasattr(self, "compiled_model") and self.compiled_model is not None:
del self.compiled_model del self.compiled_model
self.compiled_model = None self.compiled_model = None
if hasattr(self, 'tokenizer') and self.tokenizer is not None: if hasattr(self, "tokenizer") and self.tokenizer is not None:
del self.tokenizer del self.tokenizer
self.tokenizer = None self.tokenizer = None
if hasattr(self, 'past_key_values') and self.past_key_values is not None: if hasattr(self, "past_key_values") and self.past_key_values is not None:
del self.past_key_values del self.past_key_values
self.past_key_values = None self.past_key_values = None
@ -877,7 +859,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Use SDPA context if specified # Use SDPA context if specified
with SDPAContext(config.sdpa_backend, self.logger): with SDPAContext(config.sdpa_backend, self.logger):
with torch.no_grad(): with torch.no_grad():
outputs = model_to_use.generate(**generation_kwargs) _ = model_to_use.generate(**generation_kwargs)
return timer.elapsed_time() return timer.elapsed_time()
@ -915,11 +897,11 @@ class ModelBenchmark(AbstractModelBenchmark):
"variant": config.variant, "variant": config.variant,
"compile_mode": config.compile_mode, "compile_mode": config.compile_mode,
"attn_implementation": config.attn_implementation, "attn_implementation": config.attn_implementation,
"sdpa_backend": config.sdpa_backend "sdpa_backend": config.sdpa_backend,
} },
) )
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> Dict[str, Any]: def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
"""Get generation kwargs. Override in subclasses for custom generation.""" """Get generation kwargs. Override in subclasses for custom generation."""
generation_config_dict = self.get_default_generation_config() generation_config_dict = self.get_default_generation_config()
generation_kwargs = { generation_kwargs = {
@ -935,11 +917,12 @@ class ModelBenchmark(AbstractModelBenchmark):
if self.past_key_values is not None and config.variant == "compiled": if self.past_key_values is not None and config.variant == "compiled":
try: try:
from transformers import StaticCache from transformers import StaticCache
# Reset cache for each measurement # Reset cache for each measurement
seq_length = self.inputs["input_ids"].shape[1] seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on # Get the actual device the model is on
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
cache_device = self.model.device cache_device = self.model.device
else: else:
cache_device = self.device cache_device = self.device
@ -949,7 +932,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size, max_batch_size=config.batch_size,
max_cache_len=seq_length + max_new_tokens, max_cache_len=seq_length + max_new_tokens,
device=cache_device, device=cache_device,
dtype=getattr(torch, config.torch_dtype) dtype=getattr(torch, config.torch_dtype),
) )
generation_kwargs["past_key_values"] = fresh_cache generation_kwargs["past_key_values"] = fresh_cache
except (ImportError, TypeError) as e: except (ImportError, TypeError) as e:
@ -967,14 +950,13 @@ class BenchmarkRunner:
self.output_dir = output_dir self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
def run_benchmark( def run_benchmark(
self, self,
benchmark: ModelBenchmark, benchmark: ModelBenchmark,
scenarios: Dict[str, BenchmarkScenario], scenarios: dict[str, BenchmarkScenario],
collect_gpu_metrics: bool = True, collect_gpu_metrics: bool = True,
commit_id: Optional[str] = None commit_id: Optional[str] = None,
) -> Dict[str, Dict[str, Any]]: ) -> dict[str, dict[str, Any]]:
""" """
Run benchmarks using scenarios. Run benchmarks using scenarios.
@ -1021,7 +1003,7 @@ class BenchmarkRunner:
timestamp=datetime.utcnow().isoformat(), timestamp=datetime.utcnow().isoformat(),
commit_id=commit_id, commit_id=commit_id,
hardware_info=get_hardware_info(), hardware_info=get_hardware_info(),
config=config config=config,
) )
# Initialize GPU monitor # Initialize GPU monitor
@ -1037,11 +1019,13 @@ class BenchmarkRunner:
_ = benchmark.measure_latency(config) _ = benchmark.measure_latency(config)
except Exception as e: except Exception as e:
warmup_failures += 1 warmup_failures += 1
self.logger.warning(f"Warmup iteration {i+1} failed: {e}") self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")
# If more than half the warmup iterations failed, skip this scenario # If more than half the warmup iterations failed, skip this scenario
if warmup_failures > config.warmup_iterations // 2: if warmup_failures > config.warmup_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})") self.logger.warning(
f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
)
try: try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger) scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model() benchmark.cleanup_model()
@ -1077,12 +1061,18 @@ class BenchmarkRunner:
if timing_result.time_per_output_token_seconds is not None: if timing_result.time_per_output_token_seconds is not None:
itl_measurements.append(timing_result.time_per_output_token_seconds) itl_measurements.append(timing_result.time_per_output_token_seconds)
itl_str = f", itl={timing_result.time_per_output_token_seconds:.4f}s/token" if timing_result.time_per_output_token_seconds else "" itl_str = (
self.logger.debug(f"Iteration {i+1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}") f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
if timing_result.time_per_output_token_seconds
else ""
)
self.logger.debug(
f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
)
         except Exception as e:
             measurement_failures += 1
-            self.logger.warning(f"Measurement iteration {i+1} failed: {e}")
+            self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")
 
         # Stop GPU monitoring
         gpu_metrics = {}
@@ -1091,7 +1081,9 @@ class BenchmarkRunner:
         # If we don't have enough successful measurements, skip this scenario
         if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
-            self.logger.warning(f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})")
+            self.logger.warning(
+                f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
+            )
 
             try:
                 scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                 benchmark.cleanup_model()
@@ -1104,7 +1096,7 @@ class BenchmarkRunner:
             "metadata": asdict(metadata),
             "measurements": {},
             "gpu_metrics": gpu_metrics,
-            "scenario_description": scenario.description
+            "scenario_description": scenario.description,
         }
 
         if latency_measurements:
@@ -1112,15 +1104,21 @@ class BenchmarkRunner:
             scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)
 
         if ttft_measurements:
-            ttft_stats = BenchmarkStatistics.from_measurements("time_to_first_token_seconds", ttft_measurements)
+            ttft_stats = BenchmarkStatistics.from_measurements(
+                "time_to_first_token_seconds", ttft_measurements
+            )
             scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)
 
         if tokens_per_sec_measurements:
-            tps_stats = BenchmarkStatistics.from_measurements("tokens_per_second", tokens_per_sec_measurements, "tokens/sec")
+            tps_stats = BenchmarkStatistics.from_measurements(
+                "tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
+            )
             scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)
 
         if itl_measurements:
-            itl_stats = BenchmarkStatistics.from_measurements("time_per_output_token_seconds", itl_measurements, "seconds/token")
+            itl_stats = BenchmarkStatistics.from_measurements(
+                "time_per_output_token_seconds", itl_measurements, "seconds/token"
+            )
             scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)
 
         # Log summary
@@ -1149,6 +1147,7 @@ class BenchmarkRunner:
         except Exception as e:
             self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
             import traceback
+
             self.logger.debug(traceback.format_exc())
             # Try to clean up if possible
 
@@ -1169,7 +1168,7 @@ class BenchmarkRunner:
         return all_results
 
-    def save_results(self, model_name: str, results: Dict[str, Dict[str, Any]]) -> str:
+    def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
         """Save benchmark results to JSON file."""
         # Create model-specific subdirectory
         model_dir = os.path.join(self.output_dir, model_name)
 
@@ -1181,24 +1180,20 @@ class BenchmarkRunner:
         filepath = os.path.join(model_dir, filename)
 
         # Prepare output structure
-        output_data = {
-            "model_name": model_name,
-            "benchmark_scenarios": []
-        }
+        output_data = {"model_name": model_name, "benchmark_scenarios": []}
 
         for config_name, config_results in results.items():
             scenario = {
                 "scenario_name": config_name,
                 "metadata": config_results["metadata"],
                 "measurements": config_results["measurements"],
-                "gpu_metrics": config_results.get("gpu_metrics", {})
+                "gpu_metrics": config_results.get("gpu_metrics", {}),
             }
             output_data["benchmark_scenarios"].append(scenario)
 
         # Save to JSON file
-        with open(filepath, 'w') as f:
+        with open(filepath, "w") as f:
             json.dump(output_data, f, indent=2, default=str)
 
         self.logger.info(f"Results saved to {filepath}")
         return filepath
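
Note that the save_results change above is formatting-only; the JSON layout it writes is unchanged. A minimal sketch of the expected file shape, with made-up values and illustrative per-metric statistics keys (the exact fields of BenchmarkStatistics are not shown in this diff):

    # Illustrative only: shape of the file written by save_results.
    # The statistics keys ("mean", "p50") and all values are assumptions,
    # not taken from this diff; the top-level keys mirror output_data above.
    example_output = {
        "model_name": "example-model",
        "benchmark_scenarios": [
            {
                "scenario_name": "eager_decode",  # hypothetical scenario name
                "metadata": {"commit_id": "abc1234"},  # asdict(metadata)
                "measurements": {
                    "latency_seconds": {"mean": 0.42, "p50": 0.41},
                    "time_to_first_token_seconds": {"mean": 0.05, "p50": 0.05},
                    "tokens_per_second": {"mean": 95.0, "p50": 96.2},
                    "time_per_output_token_seconds": {"mean": 0.0105, "p50": 0.0104},
                },
                "gpu_metrics": {},
            }
        ],
    }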

View File

@@ -20,38 +20,34 @@ in the ./benches directory, organizing outputs into model-specific subfolders.
 import argparse
 import importlib.util
+import json
 import logging
 import os
 import sys
-import json
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, List, Any, Optional
+from typing import Any, Optional
 
 
 def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
     """Setup logging configuration."""
     numeric_level = getattr(logging, log_level.upper(), None)
     if not isinstance(numeric_level, int):
-        raise ValueError(f'Invalid log level: {log_level}')
+        raise ValueError(f"Invalid log level: {log_level}")
 
     handlers = [logging.StreamHandler(sys.stdout)]
     if enable_file_logging:
-        handlers.append(
-            logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
-        )
+        handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
 
     logging.basicConfig(
-        level=numeric_level,
-        format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
-        handlers=handlers
+        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
     )
     return logging.getLogger(__name__)
 
 
-def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
+def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
     """
     Discover all benchmark modules in the benches directory.
 
@@ -77,20 +73,24 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
             spec.loader.exec_module(module)
 
             # Check if it has a benchmark runner function
-            if hasattr(module, f'run_{module_name}'):
-                benchmarks.append({
-                    'name': module_name,
-                    'path': str(py_file),
-                    'module': module,
-                    'runner_function': getattr(module, f'run_{module_name}')
-                })
-            elif hasattr(module, 'run_benchmark'):
-                benchmarks.append({
-                    'name': module_name,
-                    'path': str(py_file),
-                    'module': module,
-                    'runner_function': getattr(module, 'run_benchmark')
-                })
+            if hasattr(module, f"run_{module_name}"):
+                benchmarks.append(
+                    {
+                        "name": module_name,
+                        "path": str(py_file),
+                        "module": module,
+                        "runner_function": getattr(module, f"run_{module_name}"),
+                    }
+                )
+            elif hasattr(module, "run_benchmark"):
+                benchmarks.append(
+                    {
+                        "name": module_name,
+                        "path": str(py_file),
+                        "module": module,
+                        "runner_function": getattr(module, "run_benchmark"),
+                    }
+                )
             else:
                 logging.warning(f"No runner function found in {py_file}")
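
For context on the convention reformatted above: discover_benchmarks registers a module from ./benches only if it exposes run_<module_name> or a generic run_benchmark, and run_single_benchmark later calls that function with logger, output_dir, and whatever extra kwargs its signature accepts. A minimal sketch of a conforming module, assuming a hypothetical file benches/my_bench.py (the body is illustrative; only the entry-point naming and the call contract come from this diff):

    # benches/my_bench.py -- hypothetical example module
    import json
    import logging
    import os


    def run_my_bench(logger: logging.Logger, output_dir: str, **kwargs) -> str:
        """Picked up because the function name matches run_<module_name>."""
        num_tokens = kwargs.get("num_tokens_to_generate", 100)
        logger.info(f"my_bench: generating {num_tokens} tokens")

        os.makedirs(output_dir, exist_ok=True)
        result_path = os.path.join(output_dir, "my_bench_results.json")
        with open(result_path, "w") as f:
            json.dump({"num_tokens_to_generate": num_tokens}, f, indent=2)

        # run_single_benchmark treats the return value as the output file path
        return result_path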
@@ -101,10 +101,7 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
 
 
 def run_single_benchmark(
-    benchmark_info: Dict[str, Any],
-    output_dir: str,
-    logger: logging.Logger,
-    **kwargs
+    benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
 ) -> Optional[str]:
     """
     Run a single benchmark and return the output file path.
@@ -118,21 +115,19 @@ def run_single_benchmark(
     Returns:
         Path to the output file if successful, None otherwise
     """
-    benchmark_name = benchmark_info['name']
-    runner_func = benchmark_info['runner_function']
+    benchmark_name = benchmark_info["name"]
+    runner_func = benchmark_info["runner_function"]
 
     logger.info(f"Running benchmark: {benchmark_name}")
 
     try:
         # Check function signature to determine what arguments to pass
         import inspect
+
         sig = inspect.signature(runner_func)
 
         # Prepare arguments based on function signature
-        func_kwargs = {
-            'logger': logger,
-            'output_dir': output_dir
-        }
+        func_kwargs = {"logger": logger, "output_dir": output_dir}
 
         # Add other kwargs if the function accepts them
         for param_name in sig.parameters:
@@ -145,8 +140,7 @@ def run_single_benchmark(
         if has_var_kwargs:
             valid_kwargs = {**func_kwargs, **kwargs}
         else:
-            valid_kwargs = {k: v for k, v in func_kwargs.items()
-                            if k in sig.parameters}
+            valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
 
         # Run the benchmark
         result = runner_func(**valid_kwargs)
@@ -161,15 +155,12 @@ def run_single_benchmark(
     except Exception as e:
         logger.error(f"Benchmark {benchmark_name} failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return None
 
 
-def generate_summary_report(
-    output_dir: str,
-    benchmark_results: Dict[str, Any],
-    logger: logging.Logger
-) -> str:
+def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
     """Generate a summary report of all benchmark runs."""
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -179,13 +170,13 @@ def generate_summary_report(
             "timestamp": datetime.utcnow().isoformat(),
             "total_benchmarks": len(benchmark_results),
             "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
-            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
+            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
         },
         "benchmark_results": benchmark_results,
-        "output_directory": output_dir
+        "output_directory": output_dir,
     }
 
-    with open(summary_file, 'w') as f:
+    with open(summary_file, "w") as f:
         json.dump(summary_data, f, indent=2, default=str)
 
     logger.info(f"Summary report saved to: {summary_file}")
@@ -194,22 +185,20 @@ def generate_summary_report(
 
 def main():
     """Main entry point for the benchmarking script."""
-    parser = argparse.ArgumentParser(
-        description="Run all benchmarks in the ./benches directory"
-    )
+    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
 
     parser.add_argument(
         "--output-dir",
         type=str,
         default="benchmark_results",
-        help="Base output directory for benchmark results (default: benchmark_results)"
+        help="Base output directory for benchmark results (default: benchmark_results)",
     )
 
     parser.add_argument(
         "--benches-dir",
         type=str,
         default="./benches",
-        help="Directory containing benchmark implementations (default: ./benches)"
+        help="Directory containing benchmark implementations (default: ./benches)",
    )
 
     parser.add_argument(
@@ -217,66 +206,34 @@ def main():
         type=str,
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         default="INFO",
-        help="Logging level (default: INFO)"
+        help="Logging level (default: INFO)",
     )
 
-    parser.add_argument(
-        "--model-id",
-        type=str,
-        help="Specific model ID to benchmark (if supported by benchmarks)"
-    )
+    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
 
-    parser.add_argument(
-        "--warmup-iterations",
-        type=int,
-        default=3,
-        help="Number of warmup iterations (default: 3)"
-    )
+    parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
 
-    parser.add_argument(
-        "--measurement-iterations",
-        type=int,
-        default=5,
-        help="Number of measurement iterations (default: 5)"
-    )
+    parser.add_argument(
+        "--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
+    )
 
     parser.add_argument(
         "--num-tokens-to-generate",
         type=int,
         default=100,
-        help="Number of tokens to generate in benchmarks (default: 100)"
+        help="Number of tokens to generate in benchmarks (default: 100)",
     )
 
-    parser.add_argument(
-        "--include",
-        type=str,
-        nargs="*",
-        help="Only run benchmarks matching these names"
-    )
+    parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
 
-    parser.add_argument(
-        "--exclude",
-        type=str,
-        nargs="*",
-        help="Exclude benchmarks matching these names"
-    )
+    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
 
-    parser.add_argument(
-        "--enable-mock",
-        action="store_true",
-        help="Enable mock benchmark (skipped by default)"
-    )
+    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
 
-    parser.add_argument(
-        "--enable-file-logging",
-        action="store_true",
-        help="Enable file logging (disabled by default)"
-    )
+    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
 
-    parser.add_argument(
-        "--commit-id",
-        type=str,
-        help="Git commit ID for metadata (if not provided, will auto-detect from git)"
-    )
+    parser.add_argument(
+        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
    )
 
     args = parser.parse_args()
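
The collapsed add_argument calls above change formatting only; flags, defaults, and help strings are unchanged. A quick self-contained way to sanity-check that surface, re-declaring a subset of the parser by hand rather than importing the script (whose module path is not shown in this diff):

    # Hedged sketch: rebuilds a few of the flags above to confirm names and defaults.
    import argparse

    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
    parser.add_argument("--output-dir", type=str, default="benchmark_results")
    parser.add_argument("--warmup-iterations", type=int, default=3)
    parser.add_argument("--measurement-iterations", type=int, default=5)
    parser.add_argument("--num-tokens-to-generate", type=int, default=100)
    parser.add_argument("--include", type=str, nargs="*")
    parser.add_argument("--enable-mock", action="store_true")

    args = parser.parse_args(["--measurement-iterations", "10", "--include", "llama"])
    assert args.measurement_iterations == 10
    assert args.output_dir == "benchmark_results"  # default unchanged by the reformat
    assert args.include == ["llama"]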
@@ -304,13 +261,15 @@ def main():
     filtered_benchmarks = benchmarks
 
     if args.include:
-        filtered_benchmarks = [b for b in filtered_benchmarks
-                               if any(pattern in b['name'] for pattern in args.include)]
+        filtered_benchmarks = [
+            b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
+        ]
         logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
 
     if args.exclude:
-        filtered_benchmarks = [b for b in filtered_benchmarks
-                               if not any(pattern in b['name'] for pattern in args.exclude)]
+        filtered_benchmarks = [
+            b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
+        ]
         logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
 
     if not filtered_benchmarks:
@@ -319,34 +278,29 @@ def main():
     # Prepare common kwargs for benchmarks
     benchmark_kwargs = {
-        'warmup_iterations': args.warmup_iterations,
-        'measurement_iterations': args.measurement_iterations,
-        'num_tokens_to_generate': args.num_tokens_to_generate
+        "warmup_iterations": args.warmup_iterations,
+        "measurement_iterations": args.measurement_iterations,
+        "num_tokens_to_generate": args.num_tokens_to_generate,
     }
 
     if args.model_id:
-        benchmark_kwargs['model_id'] = args.model_id
+        benchmark_kwargs["model_id"] = args.model_id
 
     # Add enable_mock flag for mock benchmark
-    benchmark_kwargs['enable_mock'] = args.enable_mock
+    benchmark_kwargs["enable_mock"] = args.enable_mock
 
     # Add commit_id if provided
     if args.commit_id:
-        benchmark_kwargs['commit_id'] = args.commit_id
+        benchmark_kwargs["commit_id"] = args.commit_id
 
     # Run benchmarks
     benchmark_results = {}
     successful_count = 0
 
     for benchmark_info in filtered_benchmarks:
-        result = run_single_benchmark(
-            benchmark_info,
-            args.output_dir,
-            logger,
-            **benchmark_kwargs
-        )
+        result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
 
-        benchmark_results[benchmark_info['name']] = result
+        benchmark_results[benchmark_info["name"]] = result
         if result is not None:
             successful_count += 1
@@ -377,6 +331,7 @@ def main():
     except Exception as e:
         logger.error(f"Benchmark run failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return 1

View File

@@ -4,8 +4,8 @@ import datasets
 import transformers
 from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
-from transformers.utils import logging
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.utils import logging
 
 
 logging.set_verbosity_info()
@@ -22,7 +22,9 @@ imperfect = 0
 wrong = 0
 
 
-def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_diff(
+    spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     if spm_diff == list(reversed(tok_diff)):
         # AAA -> AA+A vs A+AA case.
         return True
@@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
     return False
 
 
-def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_details(
+    line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     # Encoding can be the same with same result AAA -> A + AA vs AA + A
     # We can check that we use at least exactly the same number of tokens.
     for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
                 if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
             ]
             for j in possible_matches:
-                if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
+                if check_diff(
+                    spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
+                ) and check_details(
                     line,
                     spm_ids[first + i : last],
                     tok_ids[first + j : last],
@@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
     if skip_assert:
         return
 
-    assert (
-        slow_ids == fast_ids
-    ), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    assert slow_ids == fast_ids, (
+        f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    )
 
 
 def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:

View File

@@ -15,6 +15,7 @@
 Script to close stale issue. Taken in part from the AllenNLP repository.
 https://github.com/allenai/allennlp.
 """
+
 import os
 from datetime import datetime as dt
 
@@ -39,10 +40,11 @@ def main():
     for i, issue in enumerate(open_issues):
         print(i, issue)
-        comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
+        comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
         last_comment = comments[0] if len(comments) > 0 else None
         if (
-            last_comment is not None and last_comment.user.login == "github-actions[bot]"
+            last_comment is not None
+            and last_comment.user.login == "github-actions[bot]"
             and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
             and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
             and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())