#!/usr/bin/env python3
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Top-level benchmarking script that automatically discovers and runs all benchmarks
in the ./benches directory, organizing outputs into model-specific subfolders.
"""
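
# Example invocations (illustrative; the script filename is assumed here, and the
# available flags are the ones defined in main() below):
#
#   python run_benchmarks.py --benches-dir ./benches --output-dir benchmark_results
#   python run_benchmarks.py --model-id <model-id> --warmup-iterations 3 --measurement-iterations 5
#   python run_benchmarks.py --include <substring> --exclude <substring> --log-level DEBUG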

import argparse
import importlib.util
import json
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Optional


def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
    """Setup logging configuration."""
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {log_level}")

    handlers = [logging.StreamHandler(sys.stdout)]

    if enable_file_logging:
        handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))

    logging.basicConfig(
        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
    )

    return logging.getLogger(__name__)


def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
    """
    Discover all benchmark modules in the benches directory.

    Returns:
        List of dictionaries containing benchmark module info
    """
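    # A benchmark module is picked up when it defines either `run_<module_name>`
    # or a generic `run_benchmark` callable. Illustrative sketch of such a file
    # (hypothetical name, not part of this script):
    #
    #   # benches/my_bench.py
    #   def run_my_bench(logger, output_dir, **kwargs):
    #       ...  # run the benchmark; optionally return the output file path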
    benchmarks = []
    benches_path = Path(benches_dir)

    if not benches_path.exists():
        raise FileNotFoundError(f"Benches directory not found: {benches_dir}")

    for py_file in benches_path.glob("*.py"):
        if py_file.name.startswith("__"):
            continue

        module_name = py_file.stem

        try:
            # Import the module
            spec = importlib.util.spec_from_file_location(module_name, py_file)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            # Check if it has a benchmark runner function
            if hasattr(module, f"run_{module_name}"):
                benchmarks.append(
                    {
                        "name": module_name,
                        "path": str(py_file),
                        "module": module,
                        "runner_function": getattr(module, f"run_{module_name}"),
                    }
                )
            elif hasattr(module, "run_benchmark"):
                benchmarks.append(
                    {
                        "name": module_name,
                        "path": str(py_file),
                        "module": module,
                        "runner_function": getattr(module, "run_benchmark"),
                    }
                )
            else:
                logging.warning(f"No runner function found in {py_file}")

        except Exception as e:
            logging.error(f"Failed to import {py_file}: {e}")

    return benchmarks


def run_single_benchmark(
    benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
) -> Optional[str]:
    """
    Run a single benchmark and return the output file path.

    Args:
        benchmark_info: Dictionary containing benchmark module info
        output_dir: Base output directory
        logger: Logger instance
        **kwargs: Additional arguments to pass to the benchmark

    Returns:
        Path to the output file if successful, None otherwise
    """
    benchmark_name = benchmark_info["name"]
    runner_func = benchmark_info["runner_function"]

    logger.info(f"Running benchmark: {benchmark_name}")

    try:
        # Check function signature to determine what arguments to pass
        import inspect

        sig = inspect.signature(runner_func)

        # Prepare arguments based on function signature
        func_kwargs = {"logger": logger, "output_dir": output_dir}

        # Add other kwargs if the function accepts them
        for param_name in sig.parameters:
            if param_name in kwargs:
                func_kwargs[param_name] = kwargs[param_name]

        # Filter kwargs to only include parameters the function accepts
        # If function has **kwargs, include all provided kwargs
        has_var_kwargs = any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values())
        if has_var_kwargs:
            valid_kwargs = {**func_kwargs, **kwargs}
        else:
            valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
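
        # For example (illustrative): a runner defined as
        #   def run_my_bench(logger, output_dir, warmup_iterations=3): ...
        # receives only those three arguments, while a runner that accepts **kwargs
        # receives every option forwarded from main().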

        # Run the benchmark
        result = runner_func(**valid_kwargs)

        if isinstance(result, str):
            # Function returned a file path
            return result
        else:
            logger.info(f"Benchmark {benchmark_name} completed successfully")
            return "completed"

    except Exception as e:
        logger.error(f"Benchmark {benchmark_name} failed: {e}")
        import traceback

        logger.debug(traceback.format_exc())
        return None


def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
    """Generate a summary report of all benchmark runs."""
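    # The report is a JSON file shaped roughly like (abridged):
    #   {
    #     "run_metadata": {... timestamp and success/failure counts ...},
    #     "benchmark_results": {<benchmark name>: <output path, "completed", or null>},
    #     "output_directory": <output_dir>,
    #   }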
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")

    summary_data = {
        "run_metadata": {
            "timestamp": datetime.utcnow().isoformat(),
            "total_benchmarks": len(benchmark_results),
            "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
        },
        "benchmark_results": benchmark_results,
        "output_directory": output_dir,
    }

    with open(summary_file, "w") as f:
        json.dump(summary_data, f, indent=2, default=str)

    logger.info(f"Summary report saved to: {summary_file}")
    return summary_file


def main():
    """Main entry point for the benchmarking script."""
    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")

    parser.add_argument(
        "--output-dir",
        type=str,
        default="benchmark_results",
        help="Base output directory for benchmark results (default: benchmark_results)",
    )

    parser.add_argument(
        "--benches-dir",
        type=str,
        default="./benches",
        help="Directory containing benchmark implementations (default: ./benches)",
    )

    parser.add_argument(
        "--log-level",
        type=str,
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="Logging level (default: INFO)",
    )

    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

    parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")

    parser.add_argument(
        "--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
    )

    parser.add_argument(
        "--num-tokens-to-generate",
        type=int,
        default=100,
        help="Number of tokens to generate in benchmarks (default: 100)",
    )

    parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")

    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")

    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")

    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")

    parser.add_argument(
        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
    )

    args = parser.parse_args()

    # Setup logging
    logger = setup_logging(args.log_level, args.enable_file_logging)

    logger.info("Starting benchmark discovery and execution")
    logger.info(f"Output directory: {args.output_dir}")
    logger.info(f"Benches directory: {args.benches_dir}")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    try:
        # Discover benchmarks
        benchmarks = discover_benchmarks(args.benches_dir)
        logger.info(f"Discovered {len(benchmarks)} benchmark(s): {[b['name'] for b in benchmarks]}")

        if not benchmarks:
            logger.warning("No benchmarks found!")
            return 1

        # Filter benchmarks based on include/exclude
        filtered_benchmarks = benchmarks

        if args.include:
            filtered_benchmarks = [
                b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
            ]
            logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")

        if args.exclude:
            filtered_benchmarks = [
                b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
            ]
            logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")

        if not filtered_benchmarks:
            logger.warning("No benchmarks remaining after filtering!")
            return 1

        # Prepare common kwargs for benchmarks
        benchmark_kwargs = {
            "warmup_iterations": args.warmup_iterations,
            "measurement_iterations": args.measurement_iterations,
            "num_tokens_to_generate": args.num_tokens_to_generate,
        }
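
        # benchmark_kwargs is forwarded to every runner via run_single_benchmark,
        # which passes along only the keyword arguments each runner's signature
        # declares (or all of them if the runner accepts **kwargs).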
        if args.model_id:
            benchmark_kwargs["model_id"] = args.model_id

        # Add enable_mock flag for mock benchmark
        benchmark_kwargs["enable_mock"] = args.enable_mock

        # Add commit_id if provided
        if args.commit_id:
            benchmark_kwargs["commit_id"] = args.commit_id

        # Run benchmarks
        benchmark_results = {}
        successful_count = 0

        for benchmark_info in filtered_benchmarks:
            result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)

            benchmark_results[benchmark_info["name"]] = result

            if result is not None:
                successful_count += 1

        # Generate summary report
        summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)

        # Final summary
        total_benchmarks = len(filtered_benchmarks)
        failed_count = total_benchmarks - successful_count

        logger.info("=" * 60)
        logger.info("BENCHMARK RUN SUMMARY")
        logger.info("=" * 60)
        logger.info(f"Total benchmarks: {total_benchmarks}")
        logger.info(f"Successful: {successful_count}")
        logger.info(f"Failed: {failed_count}")
        logger.info(f"Output directory: {args.output_dir}")
        logger.info(f"Summary report: {summary_file}")

        if failed_count > 0:
            logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
            return 1
        else:
            logger.info("All benchmarks completed successfully!")
            return 0

    except Exception as e:
        logger.error(f"Benchmark run failed: {e}")
        import traceback

        logger.debug(traceback.format_exc())
        return 1


if __name__ == "__main__":
    sys.exit(main())