Enable ruff on benchmark and scripts (#40634)

* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
Yuanyuan Chen
2025-09-10 17:38:06 +08:00
committed by GitHub
parent 08edec9f7d
commit a5ecd94a3f
11 changed files with 661 additions and 649 deletions

View File

@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src export PYTHONPATH = src
check_dirs := examples tests src utils check_dirs := examples tests src utils scripts benchmark benchmark_v2
exclude_folders := "" exclude_folders := ""

View File

@ -11,25 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from logging import Logger
import os import os
import sys
from logging import Logger
from threading import Event, Thread from threading import Event, Thread
from time import perf_counter, sleep from time import perf_counter, sleep
from typing import Optional from typing import Optional
import sys
# Add the parent directory to Python path to import benchmarks_entrypoint # Add the parent directory to Python path to import benchmarks_entrypoint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks_entrypoint import MetricsRecorder
import gpustat import gpustat
import psutil import psutil
import psycopg2 import psycopg2
from benchmarks_entrypoint import MetricsRecorder
# Optional heavy ML dependencies - only required when actually running the benchmark # Optional heavy ML dependencies - only required when actually running the benchmark
try: try:
import torch import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
TRANSFORMERS_AVAILABLE = True TRANSFORMERS_AVAILABLE = True
except ImportError: except ImportError:
TRANSFORMERS_AVAILABLE = False TRANSFORMERS_AVAILABLE = False
@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
def run_benchmark( def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100 logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
): ):
# Check if required ML dependencies are available # Check if required ML dependencies are available
if not TRANSFORMERS_AVAILABLE: if not TRANSFORMERS_AVAILABLE:
@ -154,7 +163,7 @@ def run_benchmark(
# First eager forward pass # First eager forward pass
logger.info("running first eager forward pass") logger.info("running first eager forward pass")
start = perf_counter() start = perf_counter()
outputs = model(**inputs) _ = model(**inputs)
torch.cuda.synchronize() torch.cuda.synchronize()
end = perf_counter() end = perf_counter()
first_eager_fwd_pass_time = end - start first_eager_fwd_pass_time = end - start
@ -163,7 +172,7 @@ def run_benchmark(
# Second eager forward pass (should be faster) # Second eager forward pass (should be faster)
logger.info("running second eager forward pass") logger.info("running second eager forward pass")
start = perf_counter() start = perf_counter()
outputs = model(**inputs) _ = model(**inputs)
torch.cuda.synchronize() torch.cuda.synchronize()
end = perf_counter() end = perf_counter()
second_eager_fwd_pass_time = end - start second_eager_fwd_pass_time = end - start

View File

@ -31,9 +31,7 @@ from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from git import Repo from git import Repo
from huggingface_hub import HfApi from huggingface_hub import HfApi
from optimum_benchmark import Benchmark from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main from optimum_benchmark_wrapper import main

View File

@ -13,19 +13,20 @@
# limitations under the License. # limitations under the License.
import argparse import argparse
import importlib.util import importlib.util
import json
import logging import logging
import os import os
import sys import sys
import json
import uuid import uuid
from datetime import datetime from datetime import datetime
from typing import Dict, Tuple, Optional, List
import pandas as pd import pandas as pd
try: try:
from psycopg2.extensions import register_adapter from psycopg2.extensions import register_adapter
from psycopg2.extras import Json from psycopg2.extras import Json
register_adapter(dict, Json) register_adapter(dict, Json)
PSYCOPG2_AVAILABLE = True PSYCOPG2_AVAILABLE = True
except ImportError: except ImportError:
@ -38,8 +39,14 @@ class ImportModuleException(Exception):
class MetricsRecorder: class MetricsRecorder:
def __init__( def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str, self,
collect_csv_data: bool = True connection,
logger: logging.Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
collect_csv_data: bool = True,
): ):
self.conn = connection self.conn = connection
self.use_database = connection is not None self.use_database = connection is not None
@ -55,23 +62,39 @@ class MetricsRecorder:
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled) # For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Initialize empty DataFrames with proper schemas # Initialize empty DataFrames with proper schemas
self.benchmarks_df = pd.DataFrame(columns=[ self.benchmarks_df = pd.DataFrame(
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message', columns=[
'metadata', 'created_at' "benchmark_id",
]) "repository",
self.device_measurements_df = pd.DataFrame(columns=[ "branch",
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util', "commit_id",
'gpu_mem_megabytes', 'time' "commit_message",
]) "metadata",
self.model_measurements_df = pd.DataFrame(columns=[ "created_at",
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs', ]
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs', )
'second_eager_generate_time_secs', 'time_to_first_token_secs', self.device_measurements_df = pd.DataFrame(
'time_to_second_token_secs', 'time_to_third_token_secs', columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs', )
'second_compile_generate_time_secs', 'third_compile_generate_time_secs', self.model_measurements_df = pd.DataFrame(
'fourth_compile_generate_time_secs' columns=[
]) "benchmark_id",
"time",
"model_load_time",
"first_eager_forward_pass_time_secs",
"second_eager_forward_pass_time_secs",
"first_eager_generate_time_secs",
"second_eager_generate_time_secs",
"time_to_first_token_secs",
"time_to_second_token_secs",
"time_to_third_token_secs",
"time_to_next_token_mean_secs",
"first_compile_generate_time_secs",
"second_compile_generate_time_secs",
"third_compile_generate_time_secs",
"fourth_compile_generate_time_secs",
]
)
else: else:
self.benchmarks_df = None self.benchmarks_df = None
self.device_measurements_df = None self.device_measurements_df = None
@ -95,15 +118,19 @@ class MetricsRecorder:
# Store benchmark data for CSV export (if enabled) # Store benchmark data for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame # Add row to pandas DataFrame
new_row = pd.DataFrame([{ new_row = pd.DataFrame(
'benchmark_id': benchmark_id, [
'repository': self.repository, {
'branch': self.branch, "benchmark_id": benchmark_id,
'commit_id': self.commit_id, "repository": self.repository,
'commit_message': self.commit_msg, "branch": self.branch,
'metadata': json.dumps(metadata), "commit_id": self.commit_id,
'created_at': datetime.utcnow().isoformat() "commit_message": self.commit_msg,
}]) "metadata": json.dumps(metadata),
"created_at": datetime.utcnow().isoformat(),
}
]
)
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True) self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
mode_info = [] mode_info = []
@ -123,14 +150,18 @@ class MetricsRecorder:
# Store device measurements for CSV export (if enabled) # Store device measurements for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame # Add row to pandas DataFrame
new_row = pd.DataFrame([{ new_row = pd.DataFrame(
'benchmark_id': benchmark_id, [
'cpu_util': cpu_util, {
'mem_megabytes': mem_megabytes, "benchmark_id": benchmark_id,
'gpu_util': gpu_util, "cpu_util": cpu_util,
'gpu_mem_megabytes': gpu_mem_megabytes, "mem_megabytes": mem_megabytes,
'time': datetime.utcnow().isoformat() "gpu_util": gpu_util,
}]) "gpu_mem_megabytes": gpu_mem_megabytes,
"time": datetime.utcnow().isoformat(),
}
]
)
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True) self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
# Store in database if available # Store in database if available
@ -149,10 +180,7 @@ class MetricsRecorder:
# Store model measurements for CSV export (if enabled) # Store model measurements for CSV export (if enabled)
if self.collect_csv_data: if self.collect_csv_data:
# Add row to pandas DataFrame with flattened measurements # Add row to pandas DataFrame with flattened measurements
row_data = { row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
'benchmark_id': benchmark_id,
'time': datetime.utcnow().isoformat()
}
# Flatten the measurements dict into the row # Flatten the measurements dict into the row
row_data.update(measurements) row_data.update(measurements)
@ -241,28 +269,34 @@ class MetricsRecorder:
# Add model measurements (join on benchmark_id) # Add model measurements (join on benchmark_id)
if len(self.model_measurements_df) > 0: if len(self.model_measurements_df) > 0:
# Drop 'time' column from model measurements to avoid conflicts # Drop 'time' column from model measurements to avoid conflicts
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore') model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left') summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
# Calculate device measurement aggregates using pandas groupby # Calculate device measurement aggregates using pandas groupby
if len(self.device_measurements_df) > 0: if len(self.device_measurements_df) > 0:
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({ device_agg = (
'cpu_util': ['mean', 'max', 'std', 'count'], self.device_measurements_df.groupby("benchmark_id")
'mem_megabytes': ['mean', 'max', 'std'], .agg(
'gpu_util': ['mean', 'max', 'std'], {
'gpu_mem_megabytes': ['mean', 'max', 'std'] "cpu_util": ["mean", "max", "std", "count"],
}).round(3) "mem_megabytes": ["mean", "max", "std"],
"gpu_util": ["mean", "max", "std"],
"gpu_mem_megabytes": ["mean", "max", "std"],
}
)
.round(3)
)
# Flatten column names # Flatten column names
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns] device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
device_agg = device_agg.reset_index() device_agg = device_agg.reset_index()
# Rename count column to be more descriptive # Rename count column to be more descriptive
if 'cpu_util_count' in device_agg.columns: if "cpu_util_count" in device_agg.columns:
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'}) device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
# Merge with summary # Merge with summary
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left') summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
# Export the comprehensive summary # Export the comprehensive summary
summary_df.to_csv(summary_file, index=False) summary_df.to_csv(summary_file, index=False)
@ -313,18 +347,13 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
help="The commit message associated with the commit, truncated to 70 characters.", help="The commit message associated with the commit, truncated to 70 characters.",
) )
parser.add_argument( parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
"--csv",
action="store_true",
default=False,
help="Enable CSV output files generation."
)
parser.add_argument( parser.add_argument(
"--csv-output-dir", "--csv-output-dir",
type=str, type=str,
default="benchmark_results", default="benchmark_results",
help="Directory for CSV output files (default: benchmark_results)." help="Directory for CSV output files (default: benchmark_results).",
) )
args = parser.parse_args() args = parser.parse_args()
@ -356,6 +385,7 @@ def create_database_connection():
try: try:
import psycopg2 import psycopg2
conn = psycopg2.connect("dbname=metrics") conn = psycopg2.connect("dbname=metrics")
logger.info("Successfully connected to database") logger.info("Successfully connected to database")
return conn return conn
@ -364,8 +394,9 @@ def create_database_connection():
return None return None
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str, def create_global_metrics_recorder(
generate_csv: bool = False) -> MetricsRecorder: repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
) -> MetricsRecorder:
""" """
Create a global metrics recorder that will be used across all benchmarks. Create a global metrics recorder that will be used across all benchmarks.
""" """
@ -415,7 +446,7 @@ if __name__ == "__main__":
try: try:
logger.debug(f"checking if benches/{entry.name} has run_benchmark function") logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
module = import_from_path(entry.name.split(".")[0], entry.path) module = import_from_path(entry.name.split(".")[0], entry.path)
if hasattr(module, 'run_benchmark'): if hasattr(module, "run_benchmark"):
benchmark_modules.append(entry.name) benchmark_modules.append(entry.name)
logger.debug(f"discovered benchmark: {entry.name}") logger.debug(f"discovered benchmark: {entry.name}")
else: else:
@ -443,7 +474,9 @@ if __name__ == "__main__":
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder) module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
except TypeError: except TypeError:
# Fall back to the old signature for backward compatibility # Fall back to the old signature for backward compatibility
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module") logger.warning(
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
)
module.run_benchmark(logger, repository, branch, commit_id, commit_msg) module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
successful_benchmarks += 1 successful_benchmarks += 1

View File

@ -3,7 +3,11 @@ import subprocess
def main(config_dir, config_name, args): def main(config_dir, config_name, args):
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args) subprocess.run(
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+ args
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import logging import logging
from typing import Dict, Any, List import os
from typing import Any
from benchmark_framework import ModelBenchmark
import torch import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1" os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high") torch.set_float32_matmul_precision("high")
class LLaMABenchmark(ModelBenchmark): class LLaMABenchmark(ModelBenchmark):
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class.""" """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
@ -31,9 +32,7 @@ class LLaMABenchmark(ModelBenchmark):
super().__init__(logger) super().__init__(logger)
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
def get_scenario_configs(self) -> list[dict[str, Any]]:
def get_scenario_configs(self) -> List[Dict[str, Any]]:
""" """
Get LLaMA-specific scenario configurations. Get LLaMA-specific scenario configurations.
@ -43,24 +42,33 @@ class LLaMABenchmark(ModelBenchmark):
return [ return [
# Eager variants # Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"}, {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants # Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"}, {
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available) # Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"}, {
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
] ]
def _is_kernelization_available(self) -> bool: def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available for LLaMA.""" """Check if kernelization is available for LLaMA."""
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize # noqa: F401
return True return True
except ImportError: except ImportError:
self.logger.debug("Kernelization not available: kernels module not found") self.logger.debug("Kernelization not available: kernels module not found")
return False return False
def get_default_generation_config(self) -> Dict[str, Any]: def get_default_generation_config(self) -> dict[str, Any]:
"""Get LLaMA-specific generation configuration.""" """Get LLaMA-specific generation configuration."""
return { return {
"do_sample": False, "do_sample": False,
@ -70,9 +78,8 @@ class LLaMABenchmark(ModelBenchmark):
"max_new_tokens": None, # Will be set per scenario "max_new_tokens": None, # Will be set per scenario
} }
def get_model_init_kwargs(self, config) -> Dict[str, Any]: def get_model_init_kwargs(self, config) -> dict[str, Any]:
"""Get LLaMA-specific model initialization kwargs.""" """Get LLaMA-specific model initialization kwargs."""
from benchmark_framework import BenchmarkConfig
return { return {
"torch_dtype": getattr(torch, config.torch_dtype), "torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation, "attn_implementation": config.attn_implementation,
@ -103,18 +110,20 @@ def run_llama(logger, output_dir, **kwargs):
from benchmark_framework import BenchmarkRunner from benchmark_framework import BenchmarkRunner
# Extract parameters with defaults # Extract parameters with defaults
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf') model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
warmup_iterations = kwargs.get('warmup_iterations', 3) warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get('measurement_iterations', 5) measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100) num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True) include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get('device', 'cuda') device = kwargs.get("device", "cuda")
torch_dtype = kwargs.get('torch_dtype', 'float16') torch_dtype = kwargs.get("torch_dtype", "float16")
batch_size = kwargs.get('batch_size', 1) batch_size = kwargs.get("batch_size", 1)
commit_id = kwargs.get('commit_id', None) commit_id = kwargs.get("commit_id")
logger.info(f"Starting LLaMA benchmark for model: {model_id}") logger.info(f"Starting LLaMA benchmark for model: {model_id}")
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}") logger.info(
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
)
try: try:
# Create benchmark instance # Create benchmark instance
@ -129,7 +138,7 @@ def run_llama(logger, output_dir, **kwargs):
include_sdpa_variants=include_sdpa_variants, include_sdpa_variants=include_sdpa_variants,
device=device, device=device,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
batch_size=batch_size batch_size=batch_size,
) )
logger.info(f"Created {len(scenarios)} benchmark scenarios") logger.info(f"Created {len(scenarios)} benchmark scenarios")
@ -143,7 +152,7 @@ def run_llama(logger, output_dir, **kwargs):
return None return None
# Save results # Save results
model_name = model_id.split('/')[-1] # Extract model name from ID model_name = model_id.split("/")[-1] # Extract model name from ID
output_file = runner.save_results(model_name, results) output_file = runner.save_results(model_name, results)
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}") logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
@ -152,5 +161,6 @@ def run_llama(logger, output_dir, **kwargs):
except Exception as e: except Exception as e:
logger.error(f"LLaMA benchmark failed: {e}") logger.error(f"LLaMA benchmark failed: {e}")
import traceback import traceback
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
raise raise

View File

@ -14,28 +14,26 @@
import gc import gc
import json import json
import os
import subprocess
import sys
import time
import statistics
import threading
from abc import ABC, abstractmethod
from contextlib import nullcontext
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
import logging import logging
import os
import statistics
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any, Optional, TypedDict, Union
import gpustat
import numpy as np import numpy as np
import psutil import psutil
import gpustat
import torch import torch
class GPUMetrics(TypedDict): class GPUMetrics(TypedDict):
"""GPU monitoring result with GPU metrics.""" """GPU monitoring result with GPU metrics."""
gpu_utilization_mean: float gpu_utilization_mean: float
gpu_utilization_max: float gpu_utilization_max: float
gpu_utilization_min: float gpu_utilization_min: float
@ -48,6 +46,7 @@ class GPUMetrics(TypedDict):
class NoGPU(TypedDict): class NoGPU(TypedDict):
"""GPU monitoring result without GPU metrics.""" """GPU monitoring result without GPU metrics."""
gpu_monitoring_status: str gpu_monitoring_status: str
gpu_monitoring_reason: str gpu_monitoring_reason: str
@ -134,6 +133,7 @@ class ArchAwareTimer:
@dataclass @dataclass
class BenchmarkConfig: class BenchmarkConfig:
"""Configuration for a single benchmark scenario.""" """Configuration for a single benchmark scenario."""
name: str name: str
model_id: str model_id: str
variant: str = "eager" # "eager", "compiled", "kernelized" variant: str = "eager" # "eager", "compiled", "kernelized"
@ -143,13 +143,13 @@ class BenchmarkConfig:
device: str = "cuda" device: str = "cuda"
torch_dtype: str = "float16" torch_dtype: str = "float16"
compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune" compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune"
compile_options: Dict[str, Any] = field(default_factory=dict) compile_options: dict[str, Any] = field(default_factory=dict)
use_cache: bool = True use_cache: bool = True
batch_size: int = 1 batch_size: int = 1
sequence_length: Optional[int] = None sequence_length: Optional[int] = None
attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2" attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2"
sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention" sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
custom_params: Dict[str, Any] = field(default_factory=dict) custom_params: dict[str, Any] = field(default_factory=dict)
class BenchmarkScenario: class BenchmarkScenario:
@ -195,24 +195,24 @@ class BenchmarkScenario:
return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')" return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"
@dataclass @dataclass
class TimingResult: class TimingResult:
"""Result from a timing measurement.""" """Result from a timing measurement."""
time_to_first_token_seconds: Optional[float] = None time_to_first_token_seconds: Optional[float] = None
latency_seconds: float = 0.0 latency_seconds: float = 0.0
tokens_per_second: Optional[float] = None tokens_per_second: Optional[float] = None
time_per_output_token_seconds: Optional[float] = None time_per_output_token_seconds: Optional[float] = None
total_tokens_generated: int = 0 total_tokens_generated: int = 0
metadata: Dict[str, Any] = field(default_factory=dict) metadata: dict[str, Any] = field(default_factory=dict)
@dataclass @dataclass
class BenchmarkStatistics: class BenchmarkStatistics:
"""Statistical analysis of benchmark measurements.""" """Statistical analysis of benchmark measurements."""
name: str name: str
measurements: List[float] measurements: list[float]
mean: float mean: float
median: float median: float
std: float std: float
@ -226,7 +226,7 @@ class BenchmarkStatistics:
unit: str = "seconds" unit: str = "seconds"
@classmethod @classmethod
def from_measurements(cls, name: str, measurements: List[float], unit: str = "seconds") -> 'BenchmarkStatistics': def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
"""Create statistics from a list of measurements.""" """Create statistics from a list of measurements."""
if not measurements: if not measurements:
raise ValueError("Cannot create statistics from empty measurements") raise ValueError("Cannot create statistics from empty measurements")
@ -246,13 +246,14 @@ class BenchmarkStatistics:
p90=float(np.percentile(measurements_array, 90)), p90=float(np.percentile(measurements_array, 90)),
p95=float(np.percentile(measurements_array, 95)), p95=float(np.percentile(measurements_array, 95)),
p99=float(np.percentile(measurements_array, 99)), p99=float(np.percentile(measurements_array, 99)),
unit=unit unit=unit,
) )
@dataclass @dataclass
class HardwareInfo: class HardwareInfo:
"""Hardware information collected during benchmarking.""" """Hardware information collected during benchmarking."""
gpu_name: str gpu_name: str
gpu_memory_total_mb: int gpu_memory_total_mb: int
cpu_count: int cpu_count: int
@ -265,6 +266,7 @@ class HardwareInfo:
@dataclass @dataclass
class BenchmarkMetadata: class BenchmarkMetadata:
"""Metadata collected for each benchmark run.""" """Metadata collected for each benchmark run."""
timestamp: str timestamp: str
commit_id: str commit_id: str
hardware_info: HardwareInfo hardware_info: HardwareInfo
@ -274,7 +276,7 @@ class BenchmarkMetadata:
class GPUMonitor: class GPUMonitor:
"""Monitor GPU utilization during benchmark execution.""" """Monitor GPU utilization during benchmark execution."""
def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None): def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
self.sample_interval = sample_interval self.sample_interval = sample_interval
self.logger = logger or logging.getLogger(__name__) self.logger = logger or logging.getLogger(__name__)
self.stop_event = threading.Event() self.stop_event = threading.Event()
@ -321,10 +323,7 @@ class GPUMonitor:
def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]: def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
"""Stop monitoring and return collected metrics.""" """Stop monitoring and return collected metrics."""
if not self.gpu_available: if not self.gpu_available:
return NoGPU( return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")
gpu_monitoring_status="disabled",
gpu_monitoring_reason="no_gpus_available"
)
# Signal the monitoring thread to stop # Signal the monitoring thread to stop
self.stop_event.set() self.stop_event.set()
@ -340,15 +339,12 @@ class GPUMonitor:
gpu_memory_used_max=max(self.gpu_memory_used), gpu_memory_used_max=max(self.gpu_memory_used),
gpu_memory_used_min=min(self.gpu_memory_used), gpu_memory_used_min=min(self.gpu_memory_used),
sample_count=len(self.gpu_utilization), sample_count=len(self.gpu_utilization),
gpu_monitoring_status="success" gpu_monitoring_status="success",
) )
self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected") self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
return metrics return metrics
else: else:
return NoGPU( return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")
gpu_monitoring_status="failed",
gpu_monitoring_reason="no_samples_collected"
)
def _monitor_loop(self): def _monitor_loop(self):
"""Background monitoring loop using threading.Event for communication.""" """Background monitoring loop using threading.Event for communication."""
@ -400,7 +396,7 @@ def get_hardware_info() -> HardwareInfo:
torch_version = torch.__version__ torch_version = torch.__version__
cuda_version = None cuda_version = None
if hasattr(torch, 'cuda') and torch.cuda.is_available(): if hasattr(torch, "cuda") and torch.cuda.is_available():
cuda_version = torch.version.cuda cuda_version = torch.version.cuda
return HardwareInfo( return HardwareInfo(
@ -410,14 +406,14 @@ def get_hardware_info() -> HardwareInfo:
memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)), memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
python_version=f"{sys.version.split()[0]}", python_version=f"{sys.version.split()[0]}",
torch_version=torch_version, torch_version=torch_version,
cuda_version=cuda_version cuda_version=cuda_version,
) )
def flush_memory(): def flush_memory():
"""Flush GPU memory and run garbage collection.""" """Flush GPU memory and run garbage collection."""
gc.collect() gc.collect()
if hasattr(torch, 'cuda') and torch.cuda.is_available(): if hasattr(torch, "cuda") and torch.cuda.is_available():
torch.cuda.empty_cache() torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated() torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats() torch.cuda.reset_peak_memory_stats()
@ -442,13 +438,10 @@ def get_sdpa_backend(backend_name: Optional[str]):
return None return None
class SDPAContext: class SDPAContext:
"""Context manager for SDPA kernel selection.""" """Context manager for SDPA kernel selection."""
def __init__(self, backend_name: Optional[str], logger: logging.Logger = None): def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
self.backend_name = backend_name self.backend_name = backend_name
self.logger = logger or logging.getLogger(__name__) self.logger = logger or logging.getLogger(__name__)
self.backend = get_sdpa_backend(backend_name) if backend_name else None self.backend = get_sdpa_backend(backend_name) if backend_name else None
@ -466,7 +459,9 @@ class SDPAContext:
self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}") self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
self.context = None self.context = None
elif self.backend_name and self.logger: elif self.backend_name and self.logger:
self.logger.debug(f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})") self.logger.debug(
f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
)
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
@ -490,7 +485,7 @@ class AbstractModelBenchmark(ABC):
self.scenarios = {} # Map of scenario_name -> BenchmarkScenario self.scenarios = {} # Map of scenario_name -> BenchmarkScenario
@abstractmethod @abstractmethod
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create and return a dictionary of benchmark scenarios.""" """Create and return a dictionary of benchmark scenarios."""
pass pass
@ -518,7 +513,7 @@ class AbstractModelBenchmark(ABC):
"""Prepare inputs for the model. Override if needed.""" """Prepare inputs for the model. Override if needed."""
return None return None
def get_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Get benchmark scenarios. Creates them if they don't exist.""" """Get benchmark scenarios. Creates them if they don't exist."""
if not self.scenarios: if not self.scenarios:
self.scenarios = self.create_scenarios(**kwargs) self.scenarios = self.create_scenarios(**kwargs)
@ -547,9 +542,7 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Default prompt for text generation. Override in subclasses if needed.""" """Default prompt for text generation. Override in subclasses if needed."""
return self._default_prompt return self._default_prompt
def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
def get_attention_configs(self, include_sdpa_variants: bool = True) -> List[Dict[str, Any]]:
""" """
Get attention implementation configurations. Get attention implementation configurations.
@ -565,15 +558,17 @@ class ModelBenchmark(AbstractModelBenchmark):
# Add SDPA variants if requested # Add SDPA variants if requested
if include_sdpa_variants: if include_sdpa_variants:
attention_configs.append({ attention_configs.append(
"attn_implementation": "sdpa", {
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"], "attn_implementation": "sdpa",
"desc_suffix": "" "sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
}) "desc_suffix": "",
}
)
return attention_configs return attention_configs
def get_scenario_configs(self) -> List[Dict[str, Any]]: def get_scenario_configs(self) -> list[dict[str, Any]]:
""" """
Get base scenario configurations. Override in subclasses to customize. Get base scenario configurations. Override in subclasses to customize.
@ -583,36 +578,38 @@ class ModelBenchmark(AbstractModelBenchmark):
return [ return [
# Eager variants # Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"}, {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants # Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"}, {
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available) # Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"}, {
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
] ]
def _is_kernelization_available(self) -> bool: def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available. Override in subclasses.""" """Check if kernelization is available. Override in subclasses."""
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize # noqa: F401
return True return True
except ImportError: except ImportError:
return False return False
def get_default_generation_config(self) -> Dict[str, Any]: def get_default_generation_config(self) -> dict[str, Any]:
"""Get default generation configuration. Override in subclasses for model-specific defaults.""" """Get default generation configuration. Override in subclasses for model-specific defaults."""
return { return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}
"do_sample": False,
"top_p": 1.0,
"temperature": 1.0
}
def get_model_init_kwargs(self, config: BenchmarkConfig) -> Dict[str, Any]: def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
"""Get model initialization kwargs. Override in subclasses for model-specific parameters.""" """Get model initialization kwargs. Override in subclasses for model-specific parameters."""
return { return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation
}
def get_default_torch_dtype(self) -> str: def get_default_torch_dtype(self) -> str:
"""Get default torch dtype. Override in subclasses.""" """Get default torch dtype. Override in subclasses."""
@ -622,19 +619,19 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Get default device. Override in subclasses.""" """Get default device. Override in subclasses."""
return "cuda" return "cuda"
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']: def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create benchmark scenarios for HuggingFace models.""" """Create benchmark scenarios for HuggingFace models."""
scenarios = {} scenarios = {}
# Extract parameters with model-specific defaults # Extract parameters with model-specific defaults
model_id = kwargs.get('model_id', 'microsoft/DialoGPT-medium') model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
warmup_iterations = kwargs.get('warmup_iterations', 3) warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get('measurement_iterations', 5) measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100) num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True) include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get('device', self.get_default_device()) device = kwargs.get("device", self.get_default_device())
torch_dtype = kwargs.get('torch_dtype', self.get_default_torch_dtype()) torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
batch_size = kwargs.get('batch_size', 1) batch_size = kwargs.get("batch_size", 1)
# Get configurations # Get configurations
attention_configs = self.get_attention_configs(include_sdpa_variants) attention_configs = self.get_attention_configs(include_sdpa_variants)
@ -654,7 +651,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Create unique config for this scenario # Create unique config for this scenario
config = BenchmarkConfig( config = BenchmarkConfig(
name=scenario_config['variant'], name=scenario_config["variant"],
model_id=model_id, model_id=model_id,
variant=scenario_config["variant"], variant=scenario_config["variant"],
compile_mode=scenario_config["compile_mode"], compile_mode=scenario_config["compile_mode"],
@ -666,7 +663,7 @@ class ModelBenchmark(AbstractModelBenchmark):
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
batch_size=batch_size, batch_size=batch_size,
attn_implementation=attn_implementation, attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
) )
# Create scenario name # Create scenario name
@ -695,11 +692,7 @@ class ModelBenchmark(AbstractModelBenchmark):
description += desc_suffix description += desc_suffix
# Create scenario # Create scenario
scenario = BenchmarkScenario( scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)
name=scenario_name,
config=config,
description=description
)
# Add setup callbacks based on variant # Add setup callbacks based on variant
if scenario_config["variant"] == "compiled": if scenario_config["variant"] == "compiled":
@ -718,16 +711,12 @@ class ModelBenchmark(AbstractModelBenchmark):
# Perform torch.compile # Perform torch.compile
if config.compile_mode is not None: if config.compile_mode is not None:
self.compiled_model = torch.compile( self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
model,
mode=config.compile_mode,
**config.compile_options
)
else: else:
self.compiled_model = torch.compile(model, **config.compile_options) self.compiled_model = torch.compile(model, **config.compile_options)
# Setup static cache for compiled mode if needed # Setup static cache for compiled mode if needed
if config.use_cache and hasattr(self, 'inputs') and self.inputs is not None: if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
self._setup_static_cache(config) self._setup_static_cache(config)
def _setup_kernelization_callback(self, model, tokenizer, config, logger): def _setup_kernelization_callback(self, model, tokenizer, config, logger):
@ -737,10 +726,8 @@ class ModelBenchmark(AbstractModelBenchmark):
try: try:
from kernels import Mode, kernelize from kernels import Mode, kernelize
self.compiled_model = kernelize(
model, self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
mode=Mode.INFERENCE
)
except Exception as e: except Exception as e:
if logger: if logger:
logger.warning(f"Failed to setup kernelized mode: {e}") logger.warning(f"Failed to setup kernelized mode: {e}")
@ -749,13 +736,14 @@ class ModelBenchmark(AbstractModelBenchmark):
def _setup_static_cache(self, config: BenchmarkConfig): def _setup_static_cache(self, config: BenchmarkConfig):
"""Setup static cache for compiled models. Override if needed.""" """Setup static cache for compiled models. Override if needed."""
if hasattr(self, 'inputs') and self.inputs is not None: if hasattr(self, "inputs") and self.inputs is not None:
try: try:
from transformers import StaticCache from transformers import StaticCache
seq_length = self.inputs["input_ids"].shape[1] seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on # Get the actual device the model is on
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
cache_device = self.model.device cache_device = self.model.device
else: else:
cache_device = self.device cache_device = self.device
@ -765,7 +753,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size, max_batch_size=config.batch_size,
max_cache_len=seq_length + config.num_tokens_to_generate, max_cache_len=seq_length + config.num_tokens_to_generate,
device=cache_device, device=cache_device,
dtype=getattr(torch, config.torch_dtype) dtype=getattr(torch, config.torch_dtype),
) )
self.logger.debug(f"StaticCache created on device: {cache_device}") self.logger.debug(f"StaticCache created on device: {cache_device}")
except (ImportError, TypeError) as e: except (ImportError, TypeError) as e:
@ -794,7 +782,6 @@ class ModelBenchmark(AbstractModelBenchmark):
def _load_model_and_tokenizer(self, config: BenchmarkConfig): def _load_model_and_tokenizer(self, config: BenchmarkConfig):
"""Load the model and tokenizer. Override in subclasses for custom loading.""" """Load the model and tokenizer. Override in subclasses for custom loading."""
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Load tokenizer # Load tokenizer
@ -812,14 +799,9 @@ class ModelBenchmark(AbstractModelBenchmark):
target_device = config.device target_device = config.device
# Get model initialization kwargs # Get model initialization kwargs
model_init_kwargs = self.get_model_init_kwargs(config) model_init_kwargs = self.get_model_init_kwargs(config)
model_init_kwargs.update({ model_init_kwargs.update({"generation_config": gen_config})
"generation_config": gen_config
})
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()
config.model_id,
**model_init_kwargs
).eval()
# Move model to target device # Move model to target device
self.logger.info(f"Moving model to device: {target_device}") self.logger.info(f"Moving model to device: {target_device}")
@ -832,7 +814,7 @@ class ModelBenchmark(AbstractModelBenchmark):
self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt") self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")
# Move inputs to the same device as the model # Move inputs to the same device as the model
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
# Model is on a single device # Model is on a single device
model_device = self.model.device model_device = self.model.device
else: else:
@ -849,16 +831,16 @@ class ModelBenchmark(AbstractModelBenchmark):
def cleanup_model(self) -> None: def cleanup_model(self) -> None:
"""Cleanup model resources.""" """Cleanup model resources."""
if hasattr(self, 'model') and self.model is not None: if hasattr(self, "model") and self.model is not None:
del self.model del self.model
self.model = None self.model = None
if hasattr(self, 'compiled_model') and self.compiled_model is not None: if hasattr(self, "compiled_model") and self.compiled_model is not None:
del self.compiled_model del self.compiled_model
self.compiled_model = None self.compiled_model = None
if hasattr(self, 'tokenizer') and self.tokenizer is not None: if hasattr(self, "tokenizer") and self.tokenizer is not None:
del self.tokenizer del self.tokenizer
self.tokenizer = None self.tokenizer = None
if hasattr(self, 'past_key_values') and self.past_key_values is not None: if hasattr(self, "past_key_values") and self.past_key_values is not None:
del self.past_key_values del self.past_key_values
self.past_key_values = None self.past_key_values = None
@ -877,7 +859,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Use SDPA context if specified # Use SDPA context if specified
with SDPAContext(config.sdpa_backend, self.logger): with SDPAContext(config.sdpa_backend, self.logger):
with torch.no_grad(): with torch.no_grad():
outputs = model_to_use.generate(**generation_kwargs) _ = model_to_use.generate(**generation_kwargs)
return timer.elapsed_time() return timer.elapsed_time()
@ -915,11 +897,11 @@ class ModelBenchmark(AbstractModelBenchmark):
"variant": config.variant, "variant": config.variant,
"compile_mode": config.compile_mode, "compile_mode": config.compile_mode,
"attn_implementation": config.attn_implementation, "attn_implementation": config.attn_implementation,
"sdpa_backend": config.sdpa_backend "sdpa_backend": config.sdpa_backend,
} },
) )
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> Dict[str, Any]: def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
"""Get generation kwargs. Override in subclasses for custom generation.""" """Get generation kwargs. Override in subclasses for custom generation."""
generation_config_dict = self.get_default_generation_config() generation_config_dict = self.get_default_generation_config()
generation_kwargs = { generation_kwargs = {
@ -935,11 +917,12 @@ class ModelBenchmark(AbstractModelBenchmark):
if self.past_key_values is not None and config.variant == "compiled": if self.past_key_values is not None and config.variant == "compiled":
try: try:
from transformers import StaticCache from transformers import StaticCache
# Reset cache for each measurement # Reset cache for each measurement
seq_length = self.inputs["input_ids"].shape[1] seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on # Get the actual device the model is on
if hasattr(self.model, 'device'): if hasattr(self.model, "device"):
cache_device = self.model.device cache_device = self.model.device
else: else:
cache_device = self.device cache_device = self.device
@ -949,7 +932,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size, max_batch_size=config.batch_size,
max_cache_len=seq_length + max_new_tokens, max_cache_len=seq_length + max_new_tokens,
device=cache_device, device=cache_device,
dtype=getattr(torch, config.torch_dtype) dtype=getattr(torch, config.torch_dtype),
) )
generation_kwargs["past_key_values"] = fresh_cache generation_kwargs["past_key_values"] = fresh_cache
except (ImportError, TypeError) as e: except (ImportError, TypeError) as e:
@ -967,14 +950,13 @@ class BenchmarkRunner:
self.output_dir = output_dir self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
def run_benchmark( def run_benchmark(
self, self,
benchmark: ModelBenchmark, benchmark: ModelBenchmark,
scenarios: Dict[str, BenchmarkScenario], scenarios: dict[str, BenchmarkScenario],
collect_gpu_metrics: bool = True, collect_gpu_metrics: bool = True,
commit_id: Optional[str] = None commit_id: Optional[str] = None,
) -> Dict[str, Dict[str, Any]]: ) -> dict[str, dict[str, Any]]:
""" """
Run benchmarks using scenarios. Run benchmarks using scenarios.
@ -1021,7 +1003,7 @@ class BenchmarkRunner:
timestamp=datetime.utcnow().isoformat(), timestamp=datetime.utcnow().isoformat(),
commit_id=commit_id, commit_id=commit_id,
hardware_info=get_hardware_info(), hardware_info=get_hardware_info(),
config=config config=config,
) )
# Initialize GPU monitor # Initialize GPU monitor
@ -1037,11 +1019,13 @@ class BenchmarkRunner:
_ = benchmark.measure_latency(config) _ = benchmark.measure_latency(config)
except Exception as e: except Exception as e:
warmup_failures += 1 warmup_failures += 1
self.logger.warning(f"Warmup iteration {i+1} failed: {e}") self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")
# If more than half the warmup iterations failed, skip this scenario # If more than half the warmup iterations failed, skip this scenario
if warmup_failures > config.warmup_iterations // 2: if warmup_failures > config.warmup_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})") self.logger.warning(
f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
)
try: try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger) scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model() benchmark.cleanup_model()
@ -1077,12 +1061,18 @@ class BenchmarkRunner:
if timing_result.time_per_output_token_seconds is not None: if timing_result.time_per_output_token_seconds is not None:
itl_measurements.append(timing_result.time_per_output_token_seconds) itl_measurements.append(timing_result.time_per_output_token_seconds)
itl_str = f", itl={timing_result.time_per_output_token_seconds:.4f}s/token" if timing_result.time_per_output_token_seconds else "" itl_str = (
self.logger.debug(f"Iteration {i+1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}") f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
if timing_result.time_per_output_token_seconds
else ""
)
self.logger.debug(
f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
)
         except Exception as e:
             measurement_failures += 1
-            self.logger.warning(f"Measurement iteration {i+1} failed: {e}")
+            self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")
 
         # Stop GPU monitoring
         gpu_metrics = {}
@@ -1091,7 +1081,9 @@ class BenchmarkRunner:
         # If we don't have enough successful measurements, skip this scenario
         if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
-            self.logger.warning(f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})")
+            self.logger.warning(
+                f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
+            )
 
             try:
                 scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                 benchmark.cleanup_model()
@@ -1104,7 +1096,7 @@ class BenchmarkRunner:
             "metadata": asdict(metadata),
             "measurements": {},
             "gpu_metrics": gpu_metrics,
-            "scenario_description": scenario.description
+            "scenario_description": scenario.description,
         }
 
         if latency_measurements:
@@ -1112,15 +1104,21 @@ class BenchmarkRunner:
             scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)
 
         if ttft_measurements:
-            ttft_stats = BenchmarkStatistics.from_measurements("time_to_first_token_seconds", ttft_measurements)
+            ttft_stats = BenchmarkStatistics.from_measurements(
+                "time_to_first_token_seconds", ttft_measurements
+            )
             scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)
 
         if tokens_per_sec_measurements:
-            tps_stats = BenchmarkStatistics.from_measurements("tokens_per_second", tokens_per_sec_measurements, "tokens/sec")
+            tps_stats = BenchmarkStatistics.from_measurements(
+                "tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
+            )
             scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)
 
         if itl_measurements:
-            itl_stats = BenchmarkStatistics.from_measurements("time_per_output_token_seconds", itl_measurements, "seconds/token")
+            itl_stats = BenchmarkStatistics.from_measurements(
+                "time_per_output_token_seconds", itl_measurements, "seconds/token"
+            )
             scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)
 
         # Log summary
@@ -1149,6 +1147,7 @@ class BenchmarkRunner:
         except Exception as e:
             self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
             import traceback
+
             self.logger.debug(traceback.format_exc())
             # Try to clean up if possible
 
@@ -1169,7 +1168,7 @@ class BenchmarkRunner:
         return all_results
 
-    def save_results(self, model_name: str, results: Dict[str, Dict[str, Any]]) -> str:
+    def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
         """Save benchmark results to JSON file."""
         # Create model-specific subdirectory
         model_dir = os.path.join(self.output_dir, model_name)
 
@@ -1181,24 +1180,20 @@ class BenchmarkRunner:
         filepath = os.path.join(model_dir, filename)
 
         # Prepare output structure
-        output_data = {
-            "model_name": model_name,
-            "benchmark_scenarios": []
-        }
+        output_data = {"model_name": model_name, "benchmark_scenarios": []}
 
         for config_name, config_results in results.items():
             scenario = {
                 "scenario_name": config_name,
                 "metadata": config_results["metadata"],
                 "measurements": config_results["measurements"],
-                "gpu_metrics": config_results.get("gpu_metrics", {})
+                "gpu_metrics": config_results.get("gpu_metrics", {}),
             }
             output_data["benchmark_scenarios"].append(scenario)
 
         # Save to JSON file
-        with open(filepath, 'w') as f:
+        with open(filepath, "w") as f:
             json.dump(output_data, f, indent=2, default=str)
 
         self.logger.info(f"Results saved to {filepath}")
         return filepath
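
Note that the save_results change above is formatting-only; the JSON layout it writes is unchanged. A minimal sketch of the expected file shape, with made-up values and illustrative per-metric statistics keys (the exact fields of BenchmarkStatistics are not shown in this diff):

    # Illustrative only: shape of the file written by save_results.
    # The statistics keys ("mean", "p50") and all values are assumptions,
    # not taken from this diff; the top-level keys mirror output_data above.
    example_output = {
        "model_name": "example-model",
        "benchmark_scenarios": [
            {
                "scenario_name": "eager_decode",  # hypothetical scenario name
                "metadata": {"commit_id": "abc1234"},  # asdict(metadata)
                "measurements": {
                    "latency_seconds": {"mean": 0.42, "p50": 0.41},
                    "time_to_first_token_seconds": {"mean": 0.05, "p50": 0.05},
                    "tokens_per_second": {"mean": 95.0, "p50": 96.2},
                    "time_per_output_token_seconds": {"mean": 0.0105, "p50": 0.0104},
                },
                "gpu_metrics": {},
            }
        ],
    }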

View File

@@ -20,38 +20,34 @@ in the ./benches directory, organizing outputs into model-specific subfolders.
 import argparse
 import importlib.util
+import json
 import logging
 import os
 import sys
-import json
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, List, Any, Optional
+from typing import Any, Optional
 
 
 def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
     """Setup logging configuration."""
     numeric_level = getattr(logging, log_level.upper(), None)
     if not isinstance(numeric_level, int):
-        raise ValueError(f'Invalid log level: {log_level}')
+        raise ValueError(f"Invalid log level: {log_level}")
 
     handlers = [logging.StreamHandler(sys.stdout)]
     if enable_file_logging:
-        handlers.append(
-            logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
-        )
+        handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
 
     logging.basicConfig(
-        level=numeric_level,
-        format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
-        handlers=handlers
+        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
     )
     return logging.getLogger(__name__)
 
 
-def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
+def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
     """
     Discover all benchmark modules in the benches directory.
 
@@ -77,20 +73,24 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
             spec.loader.exec_module(module)
 
             # Check if it has a benchmark runner function
-            if hasattr(module, f'run_{module_name}'):
-                benchmarks.append({
-                    'name': module_name,
-                    'path': str(py_file),
-                    'module': module,
-                    'runner_function': getattr(module, f'run_{module_name}')
-                })
-            elif hasattr(module, 'run_benchmark'):
-                benchmarks.append({
-                    'name': module_name,
-                    'path': str(py_file),
-                    'module': module,
-                    'runner_function': getattr(module, 'run_benchmark')
-                })
+            if hasattr(module, f"run_{module_name}"):
+                benchmarks.append(
+                    {
+                        "name": module_name,
+                        "path": str(py_file),
+                        "module": module,
+                        "runner_function": getattr(module, f"run_{module_name}"),
+                    }
+                )
+            elif hasattr(module, "run_benchmark"):
+                benchmarks.append(
+                    {
+                        "name": module_name,
+                        "path": str(py_file),
+                        "module": module,
+                        "runner_function": getattr(module, "run_benchmark"),
+                    }
+                )
             else:
                 logging.warning(f"No runner function found in {py_file}")
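
For context on the convention reformatted above: discover_benchmarks registers a module from ./benches only if it exposes run_<module_name> or a generic run_benchmark, and run_single_benchmark later calls that function with logger, output_dir, and whatever extra kwargs its signature accepts. A minimal sketch of a conforming module, assuming a hypothetical file benches/my_bench.py (the body is illustrative; only the entry-point naming and the call contract come from this diff):

    # benches/my_bench.py -- hypothetical example module
    import json
    import logging
    import os


    def run_my_bench(logger: logging.Logger, output_dir: str, **kwargs) -> str:
        """Picked up because the function name matches run_<module_name>."""
        num_tokens = kwargs.get("num_tokens_to_generate", 100)
        logger.info(f"my_bench: generating {num_tokens} tokens")

        os.makedirs(output_dir, exist_ok=True)
        result_path = os.path.join(output_dir, "my_bench_results.json")
        with open(result_path, "w") as f:
            json.dump({"num_tokens_to_generate": num_tokens}, f, indent=2)

        # run_single_benchmark treats the return value as the output file path
        return result_path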
@@ -101,10 +101,7 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
 
 
 def run_single_benchmark(
-    benchmark_info: Dict[str, Any],
-    output_dir: str,
-    logger: logging.Logger,
-    **kwargs
+    benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
 ) -> Optional[str]:
     """
     Run a single benchmark and return the output file path.
@@ -118,21 +115,19 @@ def run_single_benchmark(
     Returns:
         Path to the output file if successful, None otherwise
     """
-    benchmark_name = benchmark_info['name']
-    runner_func = benchmark_info['runner_function']
+    benchmark_name = benchmark_info["name"]
+    runner_func = benchmark_info["runner_function"]
 
     logger.info(f"Running benchmark: {benchmark_name}")
 
     try:
         # Check function signature to determine what arguments to pass
         import inspect
+
         sig = inspect.signature(runner_func)
 
         # Prepare arguments based on function signature
-        func_kwargs = {
-            'logger': logger,
-            'output_dir': output_dir
-        }
+        func_kwargs = {"logger": logger, "output_dir": output_dir}
 
         # Add other kwargs if the function accepts them
         for param_name in sig.parameters:
@@ -145,8 +140,7 @@ def run_single_benchmark(
         if has_var_kwargs:
             valid_kwargs = {**func_kwargs, **kwargs}
         else:
-            valid_kwargs = {k: v for k, v in func_kwargs.items()
-                            if k in sig.parameters}
+            valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
 
         # Run the benchmark
         result = runner_func(**valid_kwargs)
@@ -161,15 +155,12 @@ def run_single_benchmark(
     except Exception as e:
         logger.error(f"Benchmark {benchmark_name} failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return None
 
 
-def generate_summary_report(
-    output_dir: str,
-    benchmark_results: Dict[str, Any],
-    logger: logging.Logger
-) -> str:
+def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
     """Generate a summary report of all benchmark runs."""
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -179,13 +170,13 @@ def generate_summary_report(
             "timestamp": datetime.utcnow().isoformat(),
             "total_benchmarks": len(benchmark_results),
             "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
-            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
+            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
         },
         "benchmark_results": benchmark_results,
-        "output_directory": output_dir
+        "output_directory": output_dir,
     }
 
-    with open(summary_file, 'w') as f:
+    with open(summary_file, "w") as f:
         json.dump(summary_data, f, indent=2, default=str)
 
     logger.info(f"Summary report saved to: {summary_file}")
@@ -194,22 +185,20 @@ def generate_summary_report(
 
 def main():
     """Main entry point for the benchmarking script."""
-    parser = argparse.ArgumentParser(
-        description="Run all benchmarks in the ./benches directory"
-    )
+    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
 
     parser.add_argument(
         "--output-dir",
         type=str,
         default="benchmark_results",
-        help="Base output directory for benchmark results (default: benchmark_results)"
+        help="Base output directory for benchmark results (default: benchmark_results)",
     )
 
     parser.add_argument(
         "--benches-dir",
         type=str,
         default="./benches",
-        help="Directory containing benchmark implementations (default: ./benches)"
+        help="Directory containing benchmark implementations (default: ./benches)",
    )
 
     parser.add_argument(
@@ -217,66 +206,34 @@ def main():
         type=str,
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         default="INFO",
-        help="Logging level (default: INFO)"
+        help="Logging level (default: INFO)",
     )
 
-    parser.add_argument(
-        "--model-id",
-        type=str,
-        help="Specific model ID to benchmark (if supported by benchmarks)"
-    )
+    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
 
-    parser.add_argument(
-        "--warmup-iterations",
-        type=int,
-        default=3,
-        help="Number of warmup iterations (default: 3)"
-    )
+    parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
 
-    parser.add_argument(
-        "--measurement-iterations",
-        type=int,
-        default=5,
-        help="Number of measurement iterations (default: 5)"
-    )
+    parser.add_argument(
+        "--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
+    )
 
     parser.add_argument(
         "--num-tokens-to-generate",
         type=int,
         default=100,
-        help="Number of tokens to generate in benchmarks (default: 100)"
+        help="Number of tokens to generate in benchmarks (default: 100)",
     )
 
-    parser.add_argument(
-        "--include",
-        type=str,
-        nargs="*",
-        help="Only run benchmarks matching these names"
-    )
+    parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
 
-    parser.add_argument(
-        "--exclude",
-        type=str,
-        nargs="*",
-        help="Exclude benchmarks matching these names"
-    )
+    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
 
-    parser.add_argument(
-        "--enable-mock",
-        action="store_true",
-        help="Enable mock benchmark (skipped by default)"
-    )
+    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
 
-    parser.add_argument(
-        "--enable-file-logging",
-        action="store_true",
-        help="Enable file logging (disabled by default)"
-    )
+    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
 
-    parser.add_argument(
-        "--commit-id",
-        type=str,
-        help="Git commit ID for metadata (if not provided, will auto-detect from git)"
-    )
+    parser.add_argument(
+        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
    )
 
     args = parser.parse_args()
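
The collapsed add_argument calls above change formatting only; flags, defaults, and help strings are unchanged. A quick self-contained way to sanity-check that surface, re-declaring a subset of the parser by hand rather than importing the script (whose module path is not shown in this diff):

    # Hedged sketch: rebuilds a few of the flags above to confirm names and defaults.
    import argparse

    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
    parser.add_argument("--output-dir", type=str, default="benchmark_results")
    parser.add_argument("--warmup-iterations", type=int, default=3)
    parser.add_argument("--measurement-iterations", type=int, default=5)
    parser.add_argument("--num-tokens-to-generate", type=int, default=100)
    parser.add_argument("--include", type=str, nargs="*")
    parser.add_argument("--enable-mock", action="store_true")

    args = parser.parse_args(["--measurement-iterations", "10", "--include", "llama"])
    assert args.measurement_iterations == 10
    assert args.output_dir == "benchmark_results"  # default unchanged by the reformat
    assert args.include == ["llama"]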
@@ -304,13 +261,15 @@ def main():
     filtered_benchmarks = benchmarks
 
     if args.include:
-        filtered_benchmarks = [b for b in filtered_benchmarks
-                               if any(pattern in b['name'] for pattern in args.include)]
+        filtered_benchmarks = [
+            b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
+        ]
         logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
 
     if args.exclude:
-        filtered_benchmarks = [b for b in filtered_benchmarks
-                               if not any(pattern in b['name'] for pattern in args.exclude)]
+        filtered_benchmarks = [
+            b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
+        ]
         logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
 
     if not filtered_benchmarks:
@@ -319,34 +278,29 @@ def main():
     # Prepare common kwargs for benchmarks
     benchmark_kwargs = {
-        'warmup_iterations': args.warmup_iterations,
-        'measurement_iterations': args.measurement_iterations,
-        'num_tokens_to_generate': args.num_tokens_to_generate
+        "warmup_iterations": args.warmup_iterations,
+        "measurement_iterations": args.measurement_iterations,
+        "num_tokens_to_generate": args.num_tokens_to_generate,
     }
 
     if args.model_id:
-        benchmark_kwargs['model_id'] = args.model_id
+        benchmark_kwargs["model_id"] = args.model_id
 
     # Add enable_mock flag for mock benchmark
-    benchmark_kwargs['enable_mock'] = args.enable_mock
+    benchmark_kwargs["enable_mock"] = args.enable_mock
 
     # Add commit_id if provided
     if args.commit_id:
-        benchmark_kwargs['commit_id'] = args.commit_id
+        benchmark_kwargs["commit_id"] = args.commit_id
 
     # Run benchmarks
     benchmark_results = {}
     successful_count = 0
 
     for benchmark_info in filtered_benchmarks:
-        result = run_single_benchmark(
-            benchmark_info,
-            args.output_dir,
-            logger,
-            **benchmark_kwargs
-        )
+        result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
 
-        benchmark_results[benchmark_info['name']] = result
+        benchmark_results[benchmark_info["name"]] = result
         if result is not None:
             successful_count += 1
@@ -377,6 +331,7 @@ def main():
     except Exception as e:
         logger.error(f"Benchmark run failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return 1

View File

@@ -4,8 +4,8 @@ import datasets
 import transformers
 from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
-from transformers.utils import logging
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.utils import logging
 
 
 logging.set_verbosity_info()
@@ -22,7 +22,9 @@ imperfect = 0
 wrong = 0
 
 
-def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_diff(
+    spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     if spm_diff == list(reversed(tok_diff)):
         # AAA -> AA+A vs A+AA case.
         return True
@@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
     return False
 
 
-def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_details(
+    line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     # Encoding can be the same with same result AAA -> A + AA vs AA + A
     # We can check that we use at least exactly the same number of tokens.
     for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
                 if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
             ]
             for j in possible_matches:
-                if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
+                if check_diff(
+                    spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
+                ) and check_details(
                     line,
                     spm_ids[first + i : last],
                     tok_ids[first + j : last],
@@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
     if skip_assert:
         return
 
-    assert (
-        slow_ids == fast_ids
-    ), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    assert slow_ids == fast_ids, (
+        f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    )
 
 
 def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:

View File

@@ -15,6 +15,7 @@
 Script to close stale issue. Taken in part from the AllenNLP repository.
 https://github.com/allenai/allennlp.
 """
+
 import os
 from datetime import datetime as dt
 
@@ -39,10 +40,11 @@ def main():
     for i, issue in enumerate(open_issues):
         print(i, issue)
-        comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
+        comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
         last_comment = comments[0] if len(comments) > 0 else None
         if (
-            last_comment is not None and last_comment.user.login == "github-actions[bot]"
+            last_comment is not None
+            and last_comment.user.login == "github-actions[bot]"
             and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
             and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
             and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())