Enable ruff on benchmark and scripts (#40634)

* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
Author: Yuanyuan Chen
Date: 2025-09-10 17:38:06 +08:00 (committed by GitHub)
Parent: 08edec9f7d
Commit: a5ecd94a3f
11 changed files with 661 additions and 649 deletions

View File

@ -3,7 +3,7 @@
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src
check_dirs := examples tests src utils
check_dirs := examples tests src utils scripts benchmark benchmark_v2
exclude_folders := ""
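
The expanded check_dirs above is what brings scripts, benchmark and benchmark_v2 under the repository's ruff targets. As a rough illustration only (the Makefile recipes themselves are not part of this hunk), the same coverage can be exercised directly with ruff's CLI; the paths simply mirror the newly added entries, and ruff is assumed to be installed and configured via the repository's pyproject.toml:

# Sketch: run ruff linting and a formatting check over the directories newly added to check_dirs.
import subprocess

new_dirs = ["scripts", "benchmark", "benchmark_v2"]
subprocess.run(["ruff", "check", *new_dirs], check=True)           # lint
subprocess.run(["ruff", "format", "--check", *new_dirs], check=True)  # formatting check only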

View File

@ -11,25 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import Logger
import os
import sys
from logging import Logger
from threading import Event, Thread
from time import perf_counter, sleep
from typing import Optional
import sys
# Add the parent directory to Python path to import benchmarks_entrypoint
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from benchmarks_entrypoint import MetricsRecorder
import gpustat
import psutil
import psycopg2
from benchmarks_entrypoint import MetricsRecorder
# Optional heavy ML dependencies - only required when actually running the benchmark
try:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
def run_benchmark(
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
):
# Check if required ML dependencies are available
if not TRANSFORMERS_AVAILABLE:
@ -154,7 +163,7 @@ def run_benchmark(
# First eager forward pass
logger.info("running first eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
first_eager_fwd_pass_time = end - start
@ -163,7 +172,7 @@ def run_benchmark(
# Second eager forward pass (should be faster)
logger.info("running second eager forward pass")
start = perf_counter()
outputs = model(**inputs)
_ = model(**inputs)
torch.cuda.synchronize()
end = perf_counter()
second_eager_fwd_pass_time = end - start
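
The two hunks above only rename the throwaway variable to `_`; the measurement pattern is unchanged. For readers skimming the diff, a minimal sketch of that pattern as shown here (model and inputs stand in for the objects built earlier in run_benchmark):

# Sketch of the eager forward-pass timing used above: synchronize the GPU before
# reading the clock so the measurement covers all queued CUDA work.
from time import perf_counter

import torch


def time_forward_pass(model, inputs) -> float:
    start = perf_counter()
    _ = model(**inputs)          # output discarded; only the latency matters
    torch.cuda.synchronize()     # wait for the kernels to finish before stopping the clock
    return perf_counter() - start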

View File

@ -31,9 +31,7 @@ from contextlib import contextmanager
from pathlib import Path
from git import Repo
from huggingface_hub import HfApi
from optimum_benchmark import Benchmark
from optimum_benchmark_wrapper import main

View File

@ -13,19 +13,20 @@
# limitations under the License.
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
import uuid
from datetime import datetime
from typing import Dict, Tuple, Optional, List
import pandas as pd
try:
from psycopg2.extensions import register_adapter
from psycopg2.extras import Json
register_adapter(dict, Json)
PSYCOPG2_AVAILABLE = True
except ImportError:
@ -38,8 +39,14 @@ class ImportModuleException(Exception):
class MetricsRecorder:
def __init__(
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
collect_csv_data: bool = True
self,
connection,
logger: logging.Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
collect_csv_data: bool = True,
):
self.conn = connection
self.use_database = connection is not None
@ -55,23 +62,39 @@ class MetricsRecorder:
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
if self.collect_csv_data:
# Initialize empty DataFrames with proper schemas
self.benchmarks_df = pd.DataFrame(columns=[
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
'metadata', 'created_at'
])
self.device_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
'gpu_mem_megabytes', 'time'
])
self.model_measurements_df = pd.DataFrame(columns=[
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
'second_eager_generate_time_secs', 'time_to_first_token_secs',
'time_to_second_token_secs', 'time_to_third_token_secs',
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
'fourth_compile_generate_time_secs'
])
self.benchmarks_df = pd.DataFrame(
columns=[
"benchmark_id",
"repository",
"branch",
"commit_id",
"commit_message",
"metadata",
"created_at",
]
)
self.device_measurements_df = pd.DataFrame(
columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
)
self.model_measurements_df = pd.DataFrame(
columns=[
"benchmark_id",
"time",
"model_load_time",
"first_eager_forward_pass_time_secs",
"second_eager_forward_pass_time_secs",
"first_eager_generate_time_secs",
"second_eager_generate_time_secs",
"time_to_first_token_secs",
"time_to_second_token_secs",
"time_to_third_token_secs",
"time_to_next_token_mean_secs",
"first_compile_generate_time_secs",
"second_compile_generate_time_secs",
"third_compile_generate_time_secs",
"fourth_compile_generate_time_secs",
]
)
else:
self.benchmarks_df = None
self.device_measurements_df = None
@ -95,15 +118,19 @@ class MetricsRecorder:
# Store benchmark data for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'repository': self.repository,
'branch': self.branch,
'commit_id': self.commit_id,
'commit_message': self.commit_msg,
'metadata': json.dumps(metadata),
'created_at': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"repository": self.repository,
"branch": self.branch,
"commit_id": self.commit_id,
"commit_message": self.commit_msg,
"metadata": json.dumps(metadata),
"created_at": datetime.utcnow().isoformat(),
}
]
)
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
mode_info = []
@ -123,14 +150,18 @@ class MetricsRecorder:
# Store device measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame
new_row = pd.DataFrame([{
'benchmark_id': benchmark_id,
'cpu_util': cpu_util,
'mem_megabytes': mem_megabytes,
'gpu_util': gpu_util,
'gpu_mem_megabytes': gpu_mem_megabytes,
'time': datetime.utcnow().isoformat()
}])
new_row = pd.DataFrame(
[
{
"benchmark_id": benchmark_id,
"cpu_util": cpu_util,
"mem_megabytes": mem_megabytes,
"gpu_util": gpu_util,
"gpu_mem_megabytes": gpu_mem_megabytes,
"time": datetime.utcnow().isoformat(),
}
]
)
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
# Store in database if available
@ -149,10 +180,7 @@ class MetricsRecorder:
# Store model measurements for CSV export (if enabled)
if self.collect_csv_data:
# Add row to pandas DataFrame with flattened measurements
row_data = {
'benchmark_id': benchmark_id,
'time': datetime.utcnow().isoformat()
}
row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
# Flatten the measurements dict into the row
row_data.update(measurements)
@ -241,28 +269,34 @@ class MetricsRecorder:
# Add model measurements (join on benchmark_id)
if len(self.model_measurements_df) > 0:
# Drop 'time' column from model measurements to avoid conflicts
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
# Calculate device measurement aggregates using pandas groupby
if len(self.device_measurements_df) > 0:
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
'cpu_util': ['mean', 'max', 'std', 'count'],
'mem_megabytes': ['mean', 'max', 'std'],
'gpu_util': ['mean', 'max', 'std'],
'gpu_mem_megabytes': ['mean', 'max', 'std']
}).round(3)
device_agg = (
self.device_measurements_df.groupby("benchmark_id")
.agg(
{
"cpu_util": ["mean", "max", "std", "count"],
"mem_megabytes": ["mean", "max", "std"],
"gpu_util": ["mean", "max", "std"],
"gpu_mem_megabytes": ["mean", "max", "std"],
}
)
.round(3)
)
# Flatten column names
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
device_agg = device_agg.reset_index()
# Rename count column to be more descriptive
if 'cpu_util_count' in device_agg.columns:
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
if "cpu_util_count" in device_agg.columns:
device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
# Merge with summary
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
# Export the comprehensive summary
summary_df.to_csv(summary_file, index=False)
@ -313,18 +347,13 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
help="The commit message associated with the commit, truncated to 70 characters.",
)
parser.add_argument(
"--csv",
action="store_true",
default=False,
help="Enable CSV output files generation."
)
parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
parser.add_argument(
"--csv-output-dir",
type=str,
default="benchmark_results",
help="Directory for CSV output files (default: benchmark_results)."
help="Directory for CSV output files (default: benchmark_results).",
)
args = parser.parse_args()
@ -356,6 +385,7 @@ def create_database_connection():
try:
import psycopg2
conn = psycopg2.connect("dbname=metrics")
logger.info("Successfully connected to database")
return conn
@ -364,8 +394,9 @@ def create_database_connection():
return None
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
generate_csv: bool = False) -> MetricsRecorder:
def create_global_metrics_recorder(
repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
) -> MetricsRecorder:
"""
Create a global metrics recorder that will be used across all benchmarks.
"""
@ -415,7 +446,7 @@ if __name__ == "__main__":
try:
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
module = import_from_path(entry.name.split(".")[0], entry.path)
if hasattr(module, 'run_benchmark'):
if hasattr(module, "run_benchmark"):
benchmark_modules.append(entry.name)
logger.debug(f"discovered benchmark: {entry.name}")
else:
@ -443,7 +474,9 @@ if __name__ == "__main__":
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
except TypeError:
# Fall back to the old signature for backward compatibility
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
logger.warning(
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
)
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
successful_benchmarks += 1
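
The reflowed __init__ and create_global_metrics_recorder signatures above define the recorder's public interface. A minimal usage sketch based only on what this diff shows, running without a database connection so that only CSV collection is active (the repository, branch and commit values below are placeholders):

import logging

from benchmarks_entrypoint import MetricsRecorder

logger = logging.getLogger("benchmarks")
recorder = MetricsRecorder(
    connection=None,                        # no PostgreSQL connection; use_database stays False
    logger=logger,
    repository="huggingface/transformers",  # placeholder values for illustration
    branch="main",
    commit_id="a5ecd94a3f",
    commit_msg="Enable ruff on benchmark and scripts",
    collect_csv_data=True,                  # keep the pandas DataFrames for CSV export
)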

View File

@ -3,7 +3,11 @@ import subprocess
def main(config_dir, config_name, args):
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)
subprocess.run(
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+ args
)
if __name__ == "__main__":
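
The reformatted subprocess call above only changes layout; behaviour is identical. For reference, a hedged example of calling the wrapper, where the config directory, config name and override key are hypothetical and depend on the local optimum-benchmark configs:

from optimum_benchmark_wrapper import main

# Hypothetical config location/name; the trailing list is forwarded as extra Hydra overrides.
main("benchmark/config", "generation", ["backend.model=openai-community/gpt2"])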

View File

@ -12,18 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
from typing import Dict, Any, List
from benchmark_framework import ModelBenchmark
import os
from typing import Any
import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")
class LLaMABenchmark(ModelBenchmark):
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
@ -31,9 +32,7 @@ class LLaMABenchmark(ModelBenchmark):
super().__init__(logger)
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
def get_scenario_configs(self) -> List[Dict[str, Any]]:
def get_scenario_configs(self) -> list[dict[str, Any]]:
"""
Get LLaMA-specific scenario configurations.
@ -43,24 +42,33 @@ class LLaMABenchmark(ModelBenchmark):
return [
# Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
{
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
{
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
]
def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available for LLaMA."""
try:
from kernels import Mode, kernelize
from kernels import Mode, kernelize # noqa: F401
return True
except ImportError:
self.logger.debug("Kernelization not available: kernels module not found")
return False
def get_default_generation_config(self) -> Dict[str, Any]:
def get_default_generation_config(self) -> dict[str, Any]:
"""Get LLaMA-specific generation configuration."""
return {
"do_sample": False,
@ -70,9 +78,8 @@ class LLaMABenchmark(ModelBenchmark):
"max_new_tokens": None, # Will be set per scenario
}
def get_model_init_kwargs(self, config) -> Dict[str, Any]:
def get_model_init_kwargs(self, config) -> dict[str, Any]:
"""Get LLaMA-specific model initialization kwargs."""
from benchmark_framework import BenchmarkConfig
return {
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation,
@ -103,18 +110,20 @@ def run_llama(logger, output_dir, **kwargs):
from benchmark_framework import BenchmarkRunner
# Extract parameters with defaults
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
warmup_iterations = kwargs.get('warmup_iterations', 3)
measurement_iterations = kwargs.get('measurement_iterations', 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
device = kwargs.get('device', 'cuda')
torch_dtype = kwargs.get('torch_dtype', 'float16')
batch_size = kwargs.get('batch_size', 1)
commit_id = kwargs.get('commit_id', None)
model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get("device", "cuda")
torch_dtype = kwargs.get("torch_dtype", "float16")
batch_size = kwargs.get("batch_size", 1)
commit_id = kwargs.get("commit_id")
logger.info(f"Starting LLaMA benchmark for model: {model_id}")
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")
logger.info(
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
)
try:
# Create benchmark instance
@ -129,7 +138,7 @@ def run_llama(logger, output_dir, **kwargs):
include_sdpa_variants=include_sdpa_variants,
device=device,
torch_dtype=torch_dtype,
batch_size=batch_size
batch_size=batch_size,
)
logger.info(f"Created {len(scenarios)} benchmark scenarios")
@ -143,7 +152,7 @@ def run_llama(logger, output_dir, **kwargs):
return None
# Save results
model_name = model_id.split('/')[-1] # Extract model name from ID
model_name = model_id.split("/")[-1] # Extract model name from ID
output_file = runner.save_results(model_name, results)
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
@ -152,5 +161,6 @@ def run_llama(logger, output_dir, **kwargs):
except Exception as e:
logger.error(f"LLaMA benchmark failed: {e}")
import traceback
logger.debug(traceback.format_exc())
raise
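
The kwargs.get defaults rewritten above double as documentation for run_llama's options. A minimal invocation sketch using those same parameter names (the import path and output directory are assumptions based on this benchmark_v2 layout):

import logging

from benches.llama import run_llama  # import path assumed from the benchmark_v2 layout

logger = logging.getLogger("llama_benchmark")
run_llama(
    logger,
    output_dir="benchmark_results",       # placeholder output directory
    model_id="meta-llama/Llama-2-7b-hf",  # same default the function falls back to
    warmup_iterations=3,
    measurement_iterations=5,
    num_tokens_to_generate=100,
)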

View File

@ -14,28 +14,26 @@
import gc
import json
import os
import subprocess
import sys
import time
import statistics
import threading
from abc import ABC, abstractmethod
from contextlib import nullcontext
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
import logging
import os
import statistics
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any, Optional, TypedDict, Union
import gpustat
import numpy as np
import psutil
import gpustat
import torch
class GPUMetrics(TypedDict):
"""GPU monitoring result with GPU metrics."""
gpu_utilization_mean: float
gpu_utilization_max: float
gpu_utilization_min: float
@ -48,6 +46,7 @@ class GPUMetrics(TypedDict):
class NoGPU(TypedDict):
"""GPU monitoring result without GPU metrics."""
gpu_monitoring_status: str
gpu_monitoring_reason: str
@ -134,6 +133,7 @@ class ArchAwareTimer:
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark scenario."""
name: str
model_id: str
variant: str = "eager" # "eager", "compiled", "kernelized"
@ -143,13 +143,13 @@ class BenchmarkConfig:
device: str = "cuda"
torch_dtype: str = "float16"
compile_mode: Optional[str] = None # None, "default", "reduce-overhead", "max-autotune"
compile_options: Dict[str, Any] = field(default_factory=dict)
compile_options: dict[str, Any] = field(default_factory=dict)
use_cache: bool = True
batch_size: int = 1
sequence_length: Optional[int] = None
attn_implementation: str = "sdpa" # "eager", "sdpa", "flash_attention_2"
sdpa_backend: Optional[str] = None # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
custom_params: Dict[str, Any] = field(default_factory=dict)
custom_params: dict[str, Any] = field(default_factory=dict)
class BenchmarkScenario:
@ -195,24 +195,24 @@ class BenchmarkScenario:
return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"
@dataclass
class TimingResult:
"""Result from a timing measurement."""
time_to_first_token_seconds: Optional[float] = None
latency_seconds: float = 0.0
tokens_per_second: Optional[float] = None
time_per_output_token_seconds: Optional[float] = None
total_tokens_generated: int = 0
metadata: Dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class BenchmarkStatistics:
"""Statistical analysis of benchmark measurements."""
name: str
measurements: List[float]
measurements: list[float]
mean: float
median: float
std: float
@ -226,7 +226,7 @@ class BenchmarkStatistics:
unit: str = "seconds"
@classmethod
def from_measurements(cls, name: str, measurements: List[float], unit: str = "seconds") -> 'BenchmarkStatistics':
def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
"""Create statistics from a list of measurements."""
if not measurements:
raise ValueError("Cannot create statistics from empty measurements")
@ -246,13 +246,14 @@ class BenchmarkStatistics:
p90=float(np.percentile(measurements_array, 90)),
p95=float(np.percentile(measurements_array, 95)),
p99=float(np.percentile(measurements_array, 99)),
unit=unit
unit=unit,
)
@dataclass
class HardwareInfo:
"""Hardware information collected during benchmarking."""
gpu_name: str
gpu_memory_total_mb: int
cpu_count: int
@ -265,6 +266,7 @@ class HardwareInfo:
@dataclass
class BenchmarkMetadata:
"""Metadata collected for each benchmark run."""
timestamp: str
commit_id: str
hardware_info: HardwareInfo
@ -274,7 +276,7 @@ class BenchmarkMetadata:
class GPUMonitor:
"""Monitor GPU utilization during benchmark execution."""
def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None):
def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
self.sample_interval = sample_interval
self.logger = logger or logging.getLogger(__name__)
self.stop_event = threading.Event()
@ -321,10 +323,7 @@ class GPUMonitor:
def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
"""Stop monitoring and return collected metrics."""
if not self.gpu_available:
return NoGPU(
gpu_monitoring_status="disabled",
gpu_monitoring_reason="no_gpus_available"
)
return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")
# Signal the monitoring thread to stop
self.stop_event.set()
@ -340,15 +339,12 @@ class GPUMonitor:
gpu_memory_used_max=max(self.gpu_memory_used),
gpu_memory_used_min=min(self.gpu_memory_used),
sample_count=len(self.gpu_utilization),
gpu_monitoring_status="success"
gpu_monitoring_status="success",
)
self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
return metrics
else:
return NoGPU(
gpu_monitoring_status="failed",
gpu_monitoring_reason="no_samples_collected"
)
return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")
def _monitor_loop(self):
"""Background monitoring loop using threading.Event for communication."""
@ -400,7 +396,7 @@ def get_hardware_info() -> HardwareInfo:
torch_version = torch.__version__
cuda_version = None
if hasattr(torch, 'cuda') and torch.cuda.is_available():
if hasattr(torch, "cuda") and torch.cuda.is_available():
cuda_version = torch.version.cuda
return HardwareInfo(
@ -410,14 +406,14 @@ def get_hardware_info() -> HardwareInfo:
memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
python_version=f"{sys.version.split()[0]}",
torch_version=torch_version,
cuda_version=cuda_version
cuda_version=cuda_version,
)
def flush_memory():
"""Flush GPU memory and run garbage collection."""
gc.collect()
if hasattr(torch, 'cuda') and torch.cuda.is_available():
if hasattr(torch, "cuda") and torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
@ -442,13 +438,10 @@ def get_sdpa_backend(backend_name: Optional[str]):
return None
class SDPAContext:
"""Context manager for SDPA kernel selection."""
def __init__(self, backend_name: Optional[str], logger: logging.Logger = None):
def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
self.backend_name = backend_name
self.logger = logger or logging.getLogger(__name__)
self.backend = get_sdpa_backend(backend_name) if backend_name else None
@ -466,7 +459,9 @@ class SDPAContext:
self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
self.context = None
elif self.backend_name and self.logger:
self.logger.debug(f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})")
self.logger.debug(
f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
@ -490,7 +485,7 @@ class AbstractModelBenchmark(ABC):
self.scenarios = {} # Map of scenario_name -> BenchmarkScenario
@abstractmethod
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create and return a dictionary of benchmark scenarios."""
pass
@ -518,7 +513,7 @@ class AbstractModelBenchmark(ABC):
"""Prepare inputs for the model. Override if needed."""
return None
def get_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Get benchmark scenarios. Creates them if they don't exist."""
if not self.scenarios:
self.scenarios = self.create_scenarios(**kwargs)
@ -547,9 +542,7 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Default prompt for text generation. Override in subclasses if needed."""
return self._default_prompt
def get_attention_configs(self, include_sdpa_variants: bool = True) -> List[Dict[str, Any]]:
def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
"""
Get attention implementation configurations.
@ -565,15 +558,17 @@ class ModelBenchmark(AbstractModelBenchmark):
# Add SDPA variants if requested
if include_sdpa_variants:
attention_configs.append({
"attn_implementation": "sdpa",
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
"desc_suffix": ""
})
attention_configs.append(
{
"attn_implementation": "sdpa",
"sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
"desc_suffix": "",
}
)
return attention_configs
def get_scenario_configs(self) -> List[Dict[str, Any]]:
def get_scenario_configs(self) -> list[dict[str, Any]]:
"""
Get base scenario configurations. Override in subclasses to customize.
@ -583,36 +578,38 @@ class ModelBenchmark(AbstractModelBenchmark):
return [
# Eager variants
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
# Compiled variants
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
{
"variant": "compiled",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Compiled with max autotune",
},
# Kernelized variant (if available)
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
{
"variant": "kernelized",
"compile_mode": "max-autotune",
"use_cache": True,
"description": "Kernelized execution",
},
]
def _is_kernelization_available(self) -> bool:
"""Check if kernelization is available. Override in subclasses."""
try:
from kernels import Mode, kernelize
from kernels import Mode, kernelize # noqa: F401
return True
except ImportError:
return False
def get_default_generation_config(self) -> Dict[str, Any]:
def get_default_generation_config(self) -> dict[str, Any]:
"""Get default generation configuration. Override in subclasses for model-specific defaults."""
return {
"do_sample": False,
"top_p": 1.0,
"temperature": 1.0
}
return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}
def get_model_init_kwargs(self, config: BenchmarkConfig) -> Dict[str, Any]:
def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
"""Get model initialization kwargs. Override in subclasses for model-specific parameters."""
return {
"torch_dtype": getattr(torch, config.torch_dtype),
"attn_implementation": config.attn_implementation
}
return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}
def get_default_torch_dtype(self) -> str:
"""Get default torch dtype. Override in subclasses."""
@ -622,19 +619,19 @@ class ModelBenchmark(AbstractModelBenchmark):
"""Get default device. Override in subclasses."""
return "cuda"
def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
"""Create benchmark scenarios for HuggingFace models."""
scenarios = {}
# Extract parameters with model-specific defaults
model_id = kwargs.get('model_id', 'microsoft/DialoGPT-medium')
warmup_iterations = kwargs.get('warmup_iterations', 3)
measurement_iterations = kwargs.get('measurement_iterations', 5)
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
device = kwargs.get('device', self.get_default_device())
torch_dtype = kwargs.get('torch_dtype', self.get_default_torch_dtype())
batch_size = kwargs.get('batch_size', 1)
model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
warmup_iterations = kwargs.get("warmup_iterations", 3)
measurement_iterations = kwargs.get("measurement_iterations", 5)
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
device = kwargs.get("device", self.get_default_device())
torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
batch_size = kwargs.get("batch_size", 1)
# Get configurations
attention_configs = self.get_attention_configs(include_sdpa_variants)
@ -654,7 +651,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Create unique config for this scenario
config = BenchmarkConfig(
name=scenario_config['variant'],
name=scenario_config["variant"],
model_id=model_id,
variant=scenario_config["variant"],
compile_mode=scenario_config["compile_mode"],
@ -666,7 +663,7 @@ class ModelBenchmark(AbstractModelBenchmark):
torch_dtype=torch_dtype,
batch_size=batch_size,
attn_implementation=attn_implementation,
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None
sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
)
# Create scenario name
@ -695,11 +692,7 @@ class ModelBenchmark(AbstractModelBenchmark):
description += desc_suffix
# Create scenario
scenario = BenchmarkScenario(
name=scenario_name,
config=config,
description=description
)
scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)
# Add setup callbacks based on variant
if scenario_config["variant"] == "compiled":
@ -718,16 +711,12 @@ class ModelBenchmark(AbstractModelBenchmark):
# Perform torch.compile
if config.compile_mode is not None:
self.compiled_model = torch.compile(
model,
mode=config.compile_mode,
**config.compile_options
)
self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
else:
self.compiled_model = torch.compile(model, **config.compile_options)
# Setup static cache for compiled mode if needed
if config.use_cache and hasattr(self, 'inputs') and self.inputs is not None:
if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
self._setup_static_cache(config)
def _setup_kernelization_callback(self, model, tokenizer, config, logger):
@ -737,10 +726,8 @@ class ModelBenchmark(AbstractModelBenchmark):
try:
from kernels import Mode, kernelize
self.compiled_model = kernelize(
model,
mode=Mode.INFERENCE
)
self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
except Exception as e:
if logger:
logger.warning(f"Failed to setup kernelized mode: {e}")
@ -749,13 +736,14 @@ class ModelBenchmark(AbstractModelBenchmark):
def _setup_static_cache(self, config: BenchmarkConfig):
"""Setup static cache for compiled models. Override if needed."""
if hasattr(self, 'inputs') and self.inputs is not None:
if hasattr(self, "inputs") and self.inputs is not None:
try:
from transformers import StaticCache
seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
cache_device = self.model.device
else:
cache_device = self.device
@ -765,7 +753,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size,
max_cache_len=seq_length + config.num_tokens_to_generate,
device=cache_device,
dtype=getattr(torch, config.torch_dtype)
dtype=getattr(torch, config.torch_dtype),
)
self.logger.debug(f"StaticCache created on device: {cache_device}")
except (ImportError, TypeError) as e:
@ -794,7 +782,6 @@ class ModelBenchmark(AbstractModelBenchmark):
def _load_model_and_tokenizer(self, config: BenchmarkConfig):
"""Load the model and tokenizer. Override in subclasses for custom loading."""
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Load tokenizer
@ -812,14 +799,9 @@ class ModelBenchmark(AbstractModelBenchmark):
target_device = config.device
# Get model initialization kwargs
model_init_kwargs = self.get_model_init_kwargs(config)
model_init_kwargs.update({
"generation_config": gen_config
})
model_init_kwargs.update({"generation_config": gen_config})
self.model = AutoModelForCausalLM.from_pretrained(
config.model_id,
**model_init_kwargs
).eval()
self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()
# Move model to target device
self.logger.info(f"Moving model to device: {target_device}")
@ -832,7 +814,7 @@ class ModelBenchmark(AbstractModelBenchmark):
self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")
# Move inputs to the same device as the model
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
# Model is on a single device
model_device = self.model.device
else:
@ -849,16 +831,16 @@ class ModelBenchmark(AbstractModelBenchmark):
def cleanup_model(self) -> None:
"""Cleanup model resources."""
if hasattr(self, 'model') and self.model is not None:
if hasattr(self, "model") and self.model is not None:
del self.model
self.model = None
if hasattr(self, 'compiled_model') and self.compiled_model is not None:
if hasattr(self, "compiled_model") and self.compiled_model is not None:
del self.compiled_model
self.compiled_model = None
if hasattr(self, 'tokenizer') and self.tokenizer is not None:
if hasattr(self, "tokenizer") and self.tokenizer is not None:
del self.tokenizer
self.tokenizer = None
if hasattr(self, 'past_key_values') and self.past_key_values is not None:
if hasattr(self, "past_key_values") and self.past_key_values is not None:
del self.past_key_values
self.past_key_values = None
@ -877,7 +859,7 @@ class ModelBenchmark(AbstractModelBenchmark):
# Use SDPA context if specified
with SDPAContext(config.sdpa_backend, self.logger):
with torch.no_grad():
outputs = model_to_use.generate(**generation_kwargs)
_ = model_to_use.generate(**generation_kwargs)
return timer.elapsed_time()
@ -915,11 +897,11 @@ class ModelBenchmark(AbstractModelBenchmark):
"variant": config.variant,
"compile_mode": config.compile_mode,
"attn_implementation": config.attn_implementation,
"sdpa_backend": config.sdpa_backend
}
"sdpa_backend": config.sdpa_backend,
},
)
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> Dict[str, Any]:
def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
"""Get generation kwargs. Override in subclasses for custom generation."""
generation_config_dict = self.get_default_generation_config()
generation_kwargs = {
@ -935,11 +917,12 @@ class ModelBenchmark(AbstractModelBenchmark):
if self.past_key_values is not None and config.variant == "compiled":
try:
from transformers import StaticCache
# Reset cache for each measurement
seq_length = self.inputs["input_ids"].shape[1]
# Get the actual device the model is on
if hasattr(self.model, 'device'):
if hasattr(self.model, "device"):
cache_device = self.model.device
else:
cache_device = self.device
@ -949,7 +932,7 @@ class ModelBenchmark(AbstractModelBenchmark):
max_batch_size=config.batch_size,
max_cache_len=seq_length + max_new_tokens,
device=cache_device,
dtype=getattr(torch, config.torch_dtype)
dtype=getattr(torch, config.torch_dtype),
)
generation_kwargs["past_key_values"] = fresh_cache
except (ImportError, TypeError) as e:
@ -967,14 +950,13 @@ class BenchmarkRunner:
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
def run_benchmark(
self,
benchmark: ModelBenchmark,
scenarios: Dict[str, BenchmarkScenario],
scenarios: dict[str, BenchmarkScenario],
collect_gpu_metrics: bool = True,
commit_id: Optional[str] = None
) -> Dict[str, Dict[str, Any]]:
commit_id: Optional[str] = None,
) -> dict[str, dict[str, Any]]:
"""
Run benchmarks using scenarios.
@ -1021,7 +1003,7 @@ class BenchmarkRunner:
timestamp=datetime.utcnow().isoformat(),
commit_id=commit_id,
hardware_info=get_hardware_info(),
config=config
config=config,
)
# Initialize GPU monitor
@ -1037,11 +1019,13 @@ class BenchmarkRunner:
_ = benchmark.measure_latency(config)
except Exception as e:
warmup_failures += 1
self.logger.warning(f"Warmup iteration {i+1} failed: {e}")
self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")
# If more than half the warmup iterations failed, skip this scenario
if warmup_failures > config.warmup_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})")
self.logger.warning(
f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
)
try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model()
@ -1077,12 +1061,18 @@ class BenchmarkRunner:
if timing_result.time_per_output_token_seconds is not None:
itl_measurements.append(timing_result.time_per_output_token_seconds)
itl_str = f", itl={timing_result.time_per_output_token_seconds:.4f}s/token" if timing_result.time_per_output_token_seconds else ""
self.logger.debug(f"Iteration {i+1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}")
itl_str = (
f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
if timing_result.time_per_output_token_seconds
else ""
)
self.logger.debug(
f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
)
except Exception as e:
measurement_failures += 1
self.logger.warning(f"Measurement iteration {i+1} failed: {e}")
self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")
# Stop GPU monitoring
gpu_metrics = {}
@ -1091,7 +1081,9 @@ class BenchmarkRunner:
# If we don't have enough successful measurements, skip this scenario
if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
self.logger.warning(f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})")
self.logger.warning(
f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
)
try:
scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
benchmark.cleanup_model()
@ -1104,7 +1096,7 @@ class BenchmarkRunner:
"metadata": asdict(metadata),
"measurements": {},
"gpu_metrics": gpu_metrics,
"scenario_description": scenario.description
"scenario_description": scenario.description,
}
if latency_measurements:
@ -1112,15 +1104,21 @@ class BenchmarkRunner:
scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)
if ttft_measurements:
ttft_stats = BenchmarkStatistics.from_measurements("time_to_first_token_seconds", ttft_measurements)
ttft_stats = BenchmarkStatistics.from_measurements(
"time_to_first_token_seconds", ttft_measurements
)
scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)
if tokens_per_sec_measurements:
tps_stats = BenchmarkStatistics.from_measurements("tokens_per_second", tokens_per_sec_measurements, "tokens/sec")
tps_stats = BenchmarkStatistics.from_measurements(
"tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
)
scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)
if itl_measurements:
itl_stats = BenchmarkStatistics.from_measurements("time_per_output_token_seconds", itl_measurements, "seconds/token")
itl_stats = BenchmarkStatistics.from_measurements(
"time_per_output_token_seconds", itl_measurements, "seconds/token"
)
scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)
# Log summary
@ -1149,6 +1147,7 @@ class BenchmarkRunner:
except Exception as e:
self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
import traceback
self.logger.debug(traceback.format_exc())
# Try to clean up if possible
@ -1169,7 +1168,7 @@ class BenchmarkRunner:
return all_results
def save_results(self, model_name: str, results: Dict[str, Dict[str, Any]]) -> str:
def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
"""Save benchmark results to JSON file."""
# Create model-specific subdirectory
model_dir = os.path.join(self.output_dir, model_name)
@ -1181,24 +1180,20 @@ class BenchmarkRunner:
filepath = os.path.join(model_dir, filename)
# Prepare output structure
output_data = {
"model_name": model_name,
"benchmark_scenarios": []
}
output_data = {"model_name": model_name, "benchmark_scenarios": []}
for config_name, config_results in results.items():
scenario = {
"scenario_name": config_name,
"metadata": config_results["metadata"],
"measurements": config_results["measurements"],
"gpu_metrics": config_results.get("gpu_metrics", {})
"gpu_metrics": config_results.get("gpu_metrics", {}),
}
output_data["benchmark_scenarios"].append(scenario)
# Save to JSON file
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(output_data, f, indent=2, default=str)
self.logger.info(f"Results saved to {filepath}")
return filepath
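
Most of the changes in this file are mechanical (typing.Dict/List replaced by builtin generics, quote style, call reflowing). As a quick orientation for the statistics helper touched above, a small sketch with made-up latency numbers, using only fields visible in this diff:

from benchmark_framework import BenchmarkStatistics

# Made-up measurements purely for illustration.
latencies = [0.41, 0.39, 0.40, 0.43, 0.38]
stats = BenchmarkStatistics.from_measurements("latency_seconds", latencies)
print(f"{stats.name}: mean={stats.mean:.3f} p95={stats.p95:.3f} {stats.unit}")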

View File

@ -20,38 +20,34 @@ in the ./benches directory, organizing outputs into model-specific subfolders.
import argparse
import importlib.util
import json
import logging
import os
import sys
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any, Optional
def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
"""Setup logging configuration."""
numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(numeric_level, int):
raise ValueError(f'Invalid log level: {log_level}')
raise ValueError(f"Invalid log level: {log_level}")
handlers = [logging.StreamHandler(sys.stdout)]
if enable_file_logging:
handlers.append(
logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
)
handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
logging.basicConfig(
level=numeric_level,
format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
handlers=handlers
level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
)
return logging.getLogger(__name__)
def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
"""
Discover all benchmark modules in the benches directory.
@ -77,20 +73,24 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
spec.loader.exec_module(module)
# Check if it has a benchmark runner function
if hasattr(module, f'run_{module_name}'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, f'run_{module_name}')
})
elif hasattr(module, 'run_benchmark'):
benchmarks.append({
'name': module_name,
'path': str(py_file),
'module': module,
'runner_function': getattr(module, 'run_benchmark')
})
if hasattr(module, f"run_{module_name}"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, f"run_{module_name}"),
}
)
elif hasattr(module, "run_benchmark"):
benchmarks.append(
{
"name": module_name,
"path": str(py_file),
"module": module,
"runner_function": getattr(module, "run_benchmark"),
}
)
else:
logging.warning(f"No runner function found in {py_file}")
@ -101,10 +101,7 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
def run_single_benchmark(
benchmark_info: Dict[str, Any],
output_dir: str,
logger: logging.Logger,
**kwargs
benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
) -> Optional[str]:
"""
Run a single benchmark and return the output file path.
@ -118,21 +115,19 @@ def run_single_benchmark(
Returns:
Path to the output file if successful, None otherwise
"""
benchmark_name = benchmark_info['name']
runner_func = benchmark_info['runner_function']
benchmark_name = benchmark_info["name"]
runner_func = benchmark_info["runner_function"]
logger.info(f"Running benchmark: {benchmark_name}")
try:
# Check function signature to determine what arguments to pass
import inspect
sig = inspect.signature(runner_func)
# Prepare arguments based on function signature
func_kwargs = {
'logger': logger,
'output_dir': output_dir
}
func_kwargs = {"logger": logger, "output_dir": output_dir}
# Add other kwargs if the function accepts them
for param_name in sig.parameters:
@ -145,8 +140,7 @@ def run_single_benchmark(
if has_var_kwargs:
valid_kwargs = {**func_kwargs, **kwargs}
else:
valid_kwargs = {k: v for k, v in func_kwargs.items()
if k in sig.parameters}
valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
# Run the benchmark
result = runner_func(**valid_kwargs)
@ -161,15 +155,12 @@ def run_single_benchmark(
except Exception as e:
logger.error(f"Benchmark {benchmark_name} failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return None
def generate_summary_report(
output_dir: str,
benchmark_results: Dict[str, Any],
logger: logging.Logger
) -> str:
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
"""Generate a summary report of all benchmark runs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@ -179,13 +170,13 @@ def generate_summary_report(
"timestamp": datetime.utcnow().isoformat(),
"total_benchmarks": len(benchmark_results),
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
},
"benchmark_results": benchmark_results,
"output_directory": output_dir
"output_directory": output_dir,
}
with open(summary_file, 'w') as f:
with open(summary_file, "w") as f:
json.dump(summary_data, f, indent=2, default=str)
logger.info(f"Summary report saved to: {summary_file}")
@ -194,22 +185,20 @@ def generate_summary_report(
def main():
"""Main entry point for the benchmarking script."""
parser = argparse.ArgumentParser(
description="Run all benchmarks in the ./benches directory"
)
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
parser.add_argument(
"--output-dir",
type=str,
default="benchmark_results",
help="Base output directory for benchmark results (default: benchmark_results)"
help="Base output directory for benchmark results (default: benchmark_results)",
)
parser.add_argument(
"--benches-dir",
type=str,
default="./benches",
help="Directory containing benchmark implementations (default: ./benches)"
help="Directory containing benchmark implementations (default: ./benches)",
)
parser.add_argument(
@ -217,66 +206,34 @@ def main():
type=str,
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
default="INFO",
help="Logging level (default: INFO)"
help="Logging level (default: INFO)",
)
parser.add_argument(
"--model-id",
type=str,
help="Specific model ID to benchmark (if supported by benchmarks)"
)
parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
parser.add_argument(
"--warmup-iterations",
type=int,
default=3,
help="Number of warmup iterations (default: 3)"
)
parser.add_argument(
"--measurement-iterations",
type=int,
default=5,
help="Number of measurement iterations (default: 5)"
"--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
)
parser.add_argument(
"--num-tokens-to-generate",
type=int,
default=100,
help="Number of tokens to generate in benchmarks (default: 100)"
help="Number of tokens to generate in benchmarks (default: 100)",
)
parser.add_argument(
"--include",
type=str,
nargs="*",
help="Only run benchmarks matching these names"
)
parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
parser.add_argument(
"--exclude",
type=str,
nargs="*",
help="Exclude benchmarks matching these names"
)
parser.add_argument(
"--enable-mock",
action="store_true",
help="Enable mock benchmark (skipped by default)"
)
parser.add_argument(
"--enable-file-logging",
action="store_true",
help="Enable file logging (disabled by default)"
)
parser.add_argument(
"--commit-id",
type=str,
help="Git commit ID for metadata (if not provided, will auto-detect from git)"
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
args = parser.parse_args()
@ -304,13 +261,15 @@ def main():
filtered_benchmarks = benchmarks
if args.include:
filtered_benchmarks = [b for b in filtered_benchmarks
if any(pattern in b['name'] for pattern in args.include)]
filtered_benchmarks = [
b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
]
logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
if args.exclude:
filtered_benchmarks = [b for b in filtered_benchmarks
if not any(pattern in b['name'] for pattern in args.exclude)]
filtered_benchmarks = [
b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
]
logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
if not filtered_benchmarks:
@ -319,34 +278,29 @@ def main():
# Prepare common kwargs for benchmarks
benchmark_kwargs = {
'warmup_iterations': args.warmup_iterations,
'measurement_iterations': args.measurement_iterations,
'num_tokens_to_generate': args.num_tokens_to_generate
"warmup_iterations": args.warmup_iterations,
"measurement_iterations": args.measurement_iterations,
"num_tokens_to_generate": args.num_tokens_to_generate,
}
if args.model_id:
benchmark_kwargs['model_id'] = args.model_id
benchmark_kwargs["model_id"] = args.model_id
# Add enable_mock flag for mock benchmark
benchmark_kwargs['enable_mock'] = args.enable_mock
benchmark_kwargs["enable_mock"] = args.enable_mock
# Add commit_id if provided
if args.commit_id:
benchmark_kwargs['commit_id'] = args.commit_id
benchmark_kwargs["commit_id"] = args.commit_id
# Run benchmarks
benchmark_results = {}
successful_count = 0
for benchmark_info in filtered_benchmarks:
result = run_single_benchmark(
benchmark_info,
args.output_dir,
logger,
**benchmark_kwargs
)
result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
benchmark_results[benchmark_info['name']] = result
benchmark_results[benchmark_info["name"]] = result
if result is not None:
successful_count += 1
@ -377,6 +331,7 @@ def main():
except Exception as e:
logger.error(f"Benchmark run failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return 1
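
discover_benchmarks above accepts a module if it exposes either run_<module_name> or run_benchmark, and run_single_benchmark calls that function with logger and output_dir plus any accepted kwargs. A minimal sketch of a module that would be picked up by that convention (file name and contents are hypothetical):

# benches/mymodel.py -- hypothetical example; discovered because it defines run_mymodel.
import logging


def run_mymodel(logger: logging.Logger, output_dir: str, **kwargs):
    """Run the benchmark and return the path of the results file, or None on failure."""
    logger.info("running mymodel benchmark into %s", output_dir)
    # ... perform measurements and write results under output_dir ...
    return None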

View File

@ -4,8 +4,8 @@ import datasets
import transformers
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
from transformers.utils import logging
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import logging
logging.set_verbosity_info()
@ -22,7 +22,9 @@ imperfect = 0
wrong = 0
def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_diff(
spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
if spm_diff == list(reversed(tok_diff)):
# AAA -> AA+A vs A+AA case.
return True
@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
return False
def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
def check_details(
line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
) -> bool:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
]
for j in possible_matches:
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
if check_diff(
spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
) and check_details(
line,
spm_ids[first + i : last],
tok_ids[first + j : last],
@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
if skip_assert:
return
assert (
slow_ids == fast_ids
), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
assert slow_ids == fast_ids, (
f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
)
def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:

View File

@ -15,6 +15,7 @@
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
import os
from datetime import datetime as dt
@ -39,10 +40,11 @@ def main():
for i, issue in enumerate(open_issues):
print(i, issue)
comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
last_comment = comments[0] if len(comments) > 0 else None
if (
last_comment is not None and last_comment.user.login == "github-actions[bot]"
last_comment is not None
and last_comment.user.login == "github-actions[bot]"
and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())