Enable ruff on benchmark and scripts (#40634)
* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
This commit is contained in:
2
Makefile
2
Makefile
@ -3,7 +3,7 @@
|
||||
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
|
||||
export PYTHONPATH = src
|
||||
|
||||
check_dirs := examples tests src utils
|
||||
check_dirs := examples tests src utils scripts benchmark benchmark_v2
|
||||
|
||||
exclude_folders := ""
|
||||
|
||||
|
@ -11,25 +11,28 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from logging import Logger
|
||||
import os
|
||||
import sys
|
||||
from logging import Logger
|
||||
from threading import Event, Thread
|
||||
from time import perf_counter, sleep
|
||||
from typing import Optional
|
||||
import sys
|
||||
|
||||
|
||||
# Add the parent directory to Python path to import benchmarks_entrypoint
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from benchmarks_entrypoint import MetricsRecorder
|
||||
|
||||
import gpustat
|
||||
import psutil
|
||||
import psycopg2
|
||||
from benchmarks_entrypoint import MetricsRecorder
|
||||
|
||||
|
||||
# Optional heavy ML dependencies - only required when actually running the benchmark
|
||||
try:
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
|
||||
|
||||
TRANSFORMERS_AVAILABLE = True
|
||||
except ImportError:
|
||||
TRANSFORMERS_AVAILABLE = False
|
||||
@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
|
||||
|
||||
|
||||
def run_benchmark(
|
||||
logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
|
||||
logger: Logger,
|
||||
repository: str,
|
||||
branch: str,
|
||||
commit_id: str,
|
||||
commit_msg: str,
|
||||
metrics_recorder=None,
|
||||
num_tokens_to_generate=100,
|
||||
):
|
||||
# Check if required ML dependencies are available
|
||||
if not TRANSFORMERS_AVAILABLE:
|
||||
@ -71,11 +80,11 @@ def run_benchmark(
|
||||
logger.error("pip install torch transformers")
|
||||
logger.error("Skipping LLaMA benchmark due to missing dependencies.")
|
||||
return
|
||||
|
||||
|
||||
continue_metric_collection = Event()
|
||||
metrics_thread = None
|
||||
model_id = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
|
||||
# If no metrics_recorder is provided, create one for backward compatibility
|
||||
if metrics_recorder is None:
|
||||
try:
|
||||
@ -154,7 +163,7 @@ def run_benchmark(
|
||||
# First eager forward pass
|
||||
logger.info("running first eager forward pass")
|
||||
start = perf_counter()
|
||||
outputs = model(**inputs)
|
||||
_ = model(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
first_eager_fwd_pass_time = end - start
|
||||
@ -163,7 +172,7 @@ def run_benchmark(
|
||||
# Second eager forward pass (should be faster)
|
||||
logger.info("running second eager forward pass")
|
||||
start = perf_counter()
|
||||
outputs = model(**inputs)
|
||||
_ = model(**inputs)
|
||||
torch.cuda.synchronize()
|
||||
end = perf_counter()
|
||||
second_eager_fwd_pass_time = end - start
|
||||
@ -339,7 +348,7 @@ def run_benchmark(
|
||||
continue_metric_collection.set()
|
||||
if metrics_thread is not None:
|
||||
metrics_thread.join()
|
||||
|
||||
|
||||
# Only close the recorder if we created it locally
|
||||
if should_close_recorder:
|
||||
metrics_recorder.close()
|
||||
metrics_recorder.close()
|
||||
|
@ -31,9 +31,7 @@ from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from git import Repo
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
from optimum_benchmark import Benchmark
|
||||
from optimum_benchmark_wrapper import main
|
||||
|
||||
|
@ -13,19 +13,20 @@
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import importlib.util
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Dict, Tuple, Optional, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
try:
|
||||
from psycopg2.extensions import register_adapter
|
||||
from psycopg2.extras import Json
|
||||
|
||||
register_adapter(dict, Json)
|
||||
PSYCOPG2_AVAILABLE = True
|
||||
except ImportError:
|
||||
@ -38,8 +39,14 @@ class ImportModuleException(Exception):
|
||||
|
||||
class MetricsRecorder:
|
||||
def __init__(
|
||||
self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
|
||||
collect_csv_data: bool = True
|
||||
self,
|
||||
connection,
|
||||
logger: logging.Logger,
|
||||
repository: str,
|
||||
branch: str,
|
||||
commit_id: str,
|
||||
commit_msg: str,
|
||||
collect_csv_data: bool = True,
|
||||
):
|
||||
self.conn = connection
|
||||
self.use_database = connection is not None
|
||||
@ -51,27 +58,43 @@ class MetricsRecorder:
|
||||
self.commit_id = commit_id
|
||||
self.commit_msg = commit_msg
|
||||
self.collect_csv_data = collect_csv_data
|
||||
|
||||
|
||||
# For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
|
||||
if self.collect_csv_data:
|
||||
# Initialize empty DataFrames with proper schemas
|
||||
self.benchmarks_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
|
||||
'metadata', 'created_at'
|
||||
])
|
||||
self.device_measurements_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
|
||||
'gpu_mem_megabytes', 'time'
|
||||
])
|
||||
self.model_measurements_df = pd.DataFrame(columns=[
|
||||
'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
|
||||
'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
|
||||
'second_eager_generate_time_secs', 'time_to_first_token_secs',
|
||||
'time_to_second_token_secs', 'time_to_third_token_secs',
|
||||
'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
|
||||
'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
|
||||
'fourth_compile_generate_time_secs'
|
||||
])
|
||||
self.benchmarks_df = pd.DataFrame(
|
||||
columns=[
|
||||
"benchmark_id",
|
||||
"repository",
|
||||
"branch",
|
||||
"commit_id",
|
||||
"commit_message",
|
||||
"metadata",
|
||||
"created_at",
|
||||
]
|
||||
)
|
||||
self.device_measurements_df = pd.DataFrame(
|
||||
columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
|
||||
)
|
||||
self.model_measurements_df = pd.DataFrame(
|
||||
columns=[
|
||||
"benchmark_id",
|
||||
"time",
|
||||
"model_load_time",
|
||||
"first_eager_forward_pass_time_secs",
|
||||
"second_eager_forward_pass_time_secs",
|
||||
"first_eager_generate_time_secs",
|
||||
"second_eager_generate_time_secs",
|
||||
"time_to_first_token_secs",
|
||||
"time_to_second_token_secs",
|
||||
"time_to_third_token_secs",
|
||||
"time_to_next_token_mean_secs",
|
||||
"first_compile_generate_time_secs",
|
||||
"second_compile_generate_time_secs",
|
||||
"third_compile_generate_time_secs",
|
||||
"fourth_compile_generate_time_secs",
|
||||
]
|
||||
)
|
||||
else:
|
||||
self.benchmarks_df = None
|
||||
self.device_measurements_df = None
|
||||
@ -83,7 +106,7 @@ class MetricsRecorder:
|
||||
"""
|
||||
# Generate a unique UUID for this benchmark
|
||||
benchmark_id = str(uuid.uuid4())
|
||||
|
||||
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute(
|
||||
@ -91,28 +114,32 @@ class MetricsRecorder:
|
||||
(benchmark_id, self.repository, self.branch, self.commit_id, self.commit_msg, metadata),
|
||||
)
|
||||
self.logger.debug(f"initialised benchmark #{benchmark_id}")
|
||||
|
||||
|
||||
# Store benchmark data for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame
|
||||
new_row = pd.DataFrame([{
|
||||
'benchmark_id': benchmark_id,
|
||||
'repository': self.repository,
|
||||
'branch': self.branch,
|
||||
'commit_id': self.commit_id,
|
||||
'commit_message': self.commit_msg,
|
||||
'metadata': json.dumps(metadata),
|
||||
'created_at': datetime.utcnow().isoformat()
|
||||
}])
|
||||
new_row = pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"benchmark_id": benchmark_id,
|
||||
"repository": self.repository,
|
||||
"branch": self.branch,
|
||||
"commit_id": self.commit_id,
|
||||
"commit_message": self.commit_msg,
|
||||
"metadata": json.dumps(metadata),
|
||||
"created_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
]
|
||||
)
|
||||
self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
|
||||
|
||||
|
||||
mode_info = []
|
||||
if self.use_database:
|
||||
mode_info.append("database")
|
||||
if self.collect_csv_data:
|
||||
mode_info.append("CSV")
|
||||
mode_str = " + ".join(mode_info) if mode_info else "no storage"
|
||||
|
||||
|
||||
self.logger.debug(f"initialised benchmark #{benchmark_id} ({mode_str} mode)")
|
||||
return benchmark_id
|
||||
|
||||
@ -123,16 +150,20 @@ class MetricsRecorder:
|
||||
# Store device measurements for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame
|
||||
new_row = pd.DataFrame([{
|
||||
'benchmark_id': benchmark_id,
|
||||
'cpu_util': cpu_util,
|
||||
'mem_megabytes': mem_megabytes,
|
||||
'gpu_util': gpu_util,
|
||||
'gpu_mem_megabytes': gpu_mem_megabytes,
|
||||
'time': datetime.utcnow().isoformat()
|
||||
}])
|
||||
new_row = pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"benchmark_id": benchmark_id,
|
||||
"cpu_util": cpu_util,
|
||||
"mem_megabytes": mem_megabytes,
|
||||
"gpu_util": gpu_util,
|
||||
"gpu_mem_megabytes": gpu_mem_megabytes,
|
||||
"time": datetime.utcnow().isoformat(),
|
||||
}
|
||||
]
|
||||
)
|
||||
self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
|
||||
|
||||
|
||||
# Store in database if available
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
@ -140,7 +171,7 @@ class MetricsRecorder:
|
||||
"INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
|
||||
(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
|
||||
)
|
||||
|
||||
|
||||
self.logger.debug(
|
||||
f"collected device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
|
||||
)
|
||||
@ -149,16 +180,13 @@ class MetricsRecorder:
|
||||
# Store model measurements for CSV export (if enabled)
|
||||
if self.collect_csv_data:
|
||||
# Add row to pandas DataFrame with flattened measurements
|
||||
row_data = {
|
||||
'benchmark_id': benchmark_id,
|
||||
'time': datetime.utcnow().isoformat()
|
||||
}
|
||||
row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
|
||||
# Flatten the measurements dict into the row
|
||||
row_data.update(measurements)
|
||||
|
||||
|
||||
new_row = pd.DataFrame([row_data])
|
||||
self.model_measurements_df = pd.concat([self.model_measurements_df, new_row], ignore_index=True)
|
||||
|
||||
|
||||
# Store in database if available
|
||||
if self.use_database:
|
||||
with self.conn.cursor() as cur:
|
||||
@ -174,7 +202,7 @@ class MetricsRecorder:
|
||||
measurements,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
self.logger.debug(f"collected model measurements for benchmark #{benchmark_id}: {measurements}")
|
||||
|
||||
def export_to_csv(self, output_dir: str = "benchmark_results"):
|
||||
@ -184,19 +212,19 @@ class MetricsRecorder:
|
||||
if not self.collect_csv_data:
|
||||
self.logger.warning("CSV data collection is disabled - no CSV files will be generated")
|
||||
return
|
||||
|
||||
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
self.logger.info(f"Created output directory: {output_dir}")
|
||||
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
files_created = []
|
||||
|
||||
|
||||
# Export using pandas DataFrames
|
||||
self._export_pandas_data(output_dir, timestamp, files_created)
|
||||
|
||||
|
||||
self.logger.info(f"CSV export complete! Created {len(files_created)} files in {output_dir}")
|
||||
|
||||
|
||||
def _export_pandas_data(self, output_dir: str, timestamp: str, files_created: list):
|
||||
"""
|
||||
Export CSV files using pandas DataFrames
|
||||
@ -206,24 +234,24 @@ class MetricsRecorder:
|
||||
self.benchmarks_df.to_csv(benchmarks_file, index=False)
|
||||
files_created.append(benchmarks_file)
|
||||
self.logger.info(f"Exported {len(self.benchmarks_df)} benchmark records to {benchmarks_file}")
|
||||
|
||||
# Export device measurements
|
||||
|
||||
# Export device measurements
|
||||
device_file = os.path.join(output_dir, f"device_measurements_{timestamp}.csv")
|
||||
self.device_measurements_df.to_csv(device_file, index=False)
|
||||
files_created.append(device_file)
|
||||
self.logger.info(f"Exported {len(self.device_measurements_df)} device measurement records to {device_file}")
|
||||
|
||||
|
||||
# Export model measurements (already flattened)
|
||||
model_file = os.path.join(output_dir, f"model_measurements_{timestamp}.csv")
|
||||
self.model_measurements_df.to_csv(model_file, index=False)
|
||||
files_created.append(model_file)
|
||||
self.logger.info(f"Exported {len(self.model_measurements_df)} model measurement records to {model_file}")
|
||||
|
||||
|
||||
# Create comprehensive summary using pandas operations
|
||||
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.csv")
|
||||
self._create_summary(summary_file)
|
||||
files_created.append(summary_file)
|
||||
|
||||
|
||||
def _create_summary(self, summary_file: str):
|
||||
"""
|
||||
Create a comprehensive summary CSV using pandas operations
|
||||
@ -234,36 +262,42 @@ class MetricsRecorder:
|
||||
summary_df.to_csv(summary_file, index=False)
|
||||
self.logger.info(f"Created empty benchmark summary at {summary_file}")
|
||||
return
|
||||
|
||||
|
||||
# Start with benchmarks as the base
|
||||
summary_df = self.benchmarks_df.copy()
|
||||
|
||||
|
||||
# Add model measurements (join on benchmark_id)
|
||||
if len(self.model_measurements_df) > 0:
|
||||
# Drop 'time' column from model measurements to avoid conflicts
|
||||
model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
|
||||
summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
|
||||
|
||||
model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
|
||||
summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
|
||||
|
||||
# Calculate device measurement aggregates using pandas groupby
|
||||
if len(self.device_measurements_df) > 0:
|
||||
device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
|
||||
'cpu_util': ['mean', 'max', 'std', 'count'],
|
||||
'mem_megabytes': ['mean', 'max', 'std'],
|
||||
'gpu_util': ['mean', 'max', 'std'],
|
||||
'gpu_mem_megabytes': ['mean', 'max', 'std']
|
||||
}).round(3)
|
||||
|
||||
device_agg = (
|
||||
self.device_measurements_df.groupby("benchmark_id")
|
||||
.agg(
|
||||
{
|
||||
"cpu_util": ["mean", "max", "std", "count"],
|
||||
"mem_megabytes": ["mean", "max", "std"],
|
||||
"gpu_util": ["mean", "max", "std"],
|
||||
"gpu_mem_megabytes": ["mean", "max", "std"],
|
||||
}
|
||||
)
|
||||
.round(3)
|
||||
)
|
||||
|
||||
# Flatten column names
|
||||
device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
|
||||
device_agg = device_agg.reset_index()
|
||||
|
||||
|
||||
# Rename count column to be more descriptive
|
||||
if 'cpu_util_count' in device_agg.columns:
|
||||
device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
|
||||
|
||||
if "cpu_util_count" in device_agg.columns:
|
||||
device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
|
||||
|
||||
# Merge with summary
|
||||
summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
|
||||
|
||||
summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
|
||||
|
||||
# Export the comprehensive summary
|
||||
summary_df.to_csv(summary_file, index=False)
|
||||
self.logger.info(f"Created comprehensive benchmark summary with {len(summary_df)} records at {summary_file}")
|
||||
@ -312,23 +346,18 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
|
||||
type=str,
|
||||
help="The commit message associated with the commit, truncated to 70 characters.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--csv",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Enable CSV output files generation."
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
|
||||
|
||||
parser.add_argument(
|
||||
"--csv-output-dir",
|
||||
type=str,
|
||||
default="benchmark_results",
|
||||
help="Directory for CSV output files (default: benchmark_results)."
|
||||
help="Directory for CSV output files (default: benchmark_results).",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
# CSV is disabled by default, only enabled when --csv is used
|
||||
generate_csv = args.csv
|
||||
|
||||
@ -353,9 +382,10 @@ def create_database_connection():
|
||||
if not PSYCOPG2_AVAILABLE:
|
||||
logger.warning("psycopg2 not available - running in CSV-only mode")
|
||||
return None
|
||||
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
|
||||
conn = psycopg2.connect("dbname=metrics")
|
||||
logger.info("Successfully connected to database")
|
||||
return conn
|
||||
@ -364,27 +394,28 @@ def create_database_connection():
|
||||
return None
|
||||
|
||||
|
||||
def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
|
||||
generate_csv: bool = False) -> MetricsRecorder:
|
||||
def create_global_metrics_recorder(
|
||||
repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
|
||||
) -> MetricsRecorder:
|
||||
"""
|
||||
Create a global metrics recorder that will be used across all benchmarks.
|
||||
"""
|
||||
connection = create_database_connection()
|
||||
recorder = MetricsRecorder(connection, logger, repository, branch, commit_id, commit_msg, generate_csv)
|
||||
|
||||
|
||||
# Log the storage mode
|
||||
storage_modes = []
|
||||
if connection is not None:
|
||||
storage_modes.append("database")
|
||||
if generate_csv:
|
||||
storage_modes.append("CSV")
|
||||
|
||||
|
||||
if not storage_modes:
|
||||
logger.warning("Running benchmarks with NO data storage (no database connection, CSV disabled)")
|
||||
logger.warning("Use --csv flag to enable CSV output when database is unavailable")
|
||||
else:
|
||||
logger.info(f"Running benchmarks with: {' + '.join(storage_modes)} storage")
|
||||
|
||||
|
||||
return recorder
|
||||
|
||||
|
||||
@ -393,16 +424,16 @@ if __name__ == "__main__":
|
||||
benches_folder_path = os.path.join(benchmarks_folder_path, "benches")
|
||||
|
||||
repository, branch, commit_id, commit_msg, generate_csv, csv_output_dir = parse_arguments()
|
||||
|
||||
|
||||
# Create a global metrics recorder
|
||||
global_metrics_recorder = create_global_metrics_recorder(repository, branch, commit_id, commit_msg, generate_csv)
|
||||
|
||||
|
||||
successful_benchmarks = 0
|
||||
failed_benchmarks = 0
|
||||
|
||||
|
||||
# Automatically discover all benchmark modules in benches/ folder
|
||||
benchmark_modules = []
|
||||
|
||||
|
||||
if os.path.exists(benches_folder_path):
|
||||
logger.debug(f"Scanning for benchmarks in: {benches_folder_path}")
|
||||
for entry in os.scandir(benches_folder_path):
|
||||
@ -410,12 +441,12 @@ if __name__ == "__main__":
|
||||
continue
|
||||
if entry.name.startswith("__"): # Skip __init__.py, __pycache__, etc.
|
||||
continue
|
||||
|
||||
|
||||
# Check if the file has a run_benchmark function
|
||||
try:
|
||||
logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
|
||||
module = import_from_path(entry.name.split(".")[0], entry.path)
|
||||
if hasattr(module, 'run_benchmark'):
|
||||
if hasattr(module, "run_benchmark"):
|
||||
benchmark_modules.append(entry.name)
|
||||
logger.debug(f"discovered benchmark: {entry.name}")
|
||||
else:
|
||||
@ -436,16 +467,18 @@ if __name__ == "__main__":
|
||||
logger.debug(f"loading: {module_name}")
|
||||
module = import_from_path(module_name.split(".")[0], module_path)
|
||||
logger.info(f"running benchmarks in: {module_name}")
|
||||
|
||||
|
||||
# Check if the module has an updated run_benchmark function that accepts metrics_recorder
|
||||
try:
|
||||
# Try the new signature first
|
||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
|
||||
except TypeError:
|
||||
# Fall back to the old signature for backward compatibility
|
||||
logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
|
||||
logger.warning(
|
||||
f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
|
||||
)
|
||||
module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
|
||||
|
||||
|
||||
successful_benchmarks += 1
|
||||
except ImportModuleException as e:
|
||||
logger.error(e)
|
||||
@ -461,7 +494,7 @@ if __name__ == "__main__":
|
||||
logger.info(f"CSV reports have been generated and saved to the {csv_output_dir} directory")
|
||||
else:
|
||||
logger.info("CSV generation disabled - no CSV files created (use --csv to enable)")
|
||||
|
||||
|
||||
logger.info(f"Benchmark run completed. Successful: {successful_benchmarks}, Failed: {failed_benchmarks}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to export CSV results: {e}")
|
||||
|
@ -3,7 +3,11 @@ import subprocess
|
||||
|
||||
|
||||
def main(config_dir, config_name, args):
|
||||
subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)
|
||||
subprocess.run(
|
||||
["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
|
||||
+ ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
|
||||
+ args
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1 +1 @@
|
||||
# Benchmark implementations directory
|
||||
# Benchmark implementations directory
|
||||
|
@ -12,55 +12,63 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, Any, List
|
||||
|
||||
from benchmark_framework import ModelBenchmark
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
from benchmark_framework import ModelBenchmark
|
||||
|
||||
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "1"
|
||||
torch.set_float32_matmul_precision("high")
|
||||
|
||||
|
||||
class LLaMABenchmark(ModelBenchmark):
|
||||
"""Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
|
||||
|
||||
|
||||
def __init__(self, logger: logging.Logger):
|
||||
super().__init__(logger)
|
||||
self._default_prompt = "Why dogs are so cute?" # Custom prompt for LLaMA
|
||||
|
||||
|
||||
|
||||
def get_scenario_configs(self) -> List[Dict[str, Any]]:
|
||||
def get_scenario_configs(self) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Get LLaMA-specific scenario configurations.
|
||||
|
||||
|
||||
Returns:
|
||||
List of scenario configuration dictionaries
|
||||
"""
|
||||
return [
|
||||
# Eager variants
|
||||
{"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
|
||||
|
||||
# Compiled variants
|
||||
{"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
|
||||
|
||||
{
|
||||
"variant": "compiled",
|
||||
"compile_mode": "max-autotune",
|
||||
"use_cache": True,
|
||||
"description": "Compiled with max autotune",
|
||||
},
|
||||
# Kernelized variant (if available)
|
||||
{"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
|
||||
{
|
||||
"variant": "kernelized",
|
||||
"compile_mode": "max-autotune",
|
||||
"use_cache": True,
|
||||
"description": "Kernelized execution",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _is_kernelization_available(self) -> bool:
|
||||
"""Check if kernelization is available for LLaMA."""
|
||||
try:
|
||||
from kernels import Mode, kernelize
|
||||
from kernels import Mode, kernelize # noqa: F401
|
||||
|
||||
return True
|
||||
except ImportError:
|
||||
self.logger.debug("Kernelization not available: kernels module not found")
|
||||
return False
|
||||
|
||||
def get_default_generation_config(self) -> Dict[str, Any]:
|
||||
|
||||
def get_default_generation_config(self) -> dict[str, Any]:
|
||||
"""Get LLaMA-specific generation configuration."""
|
||||
return {
|
||||
"do_sample": False,
|
||||
@ -69,20 +77,19 @@ class LLaMABenchmark(ModelBenchmark):
|
||||
"repetition_penalty": 1.0,
|
||||
"max_new_tokens": None, # Will be set per scenario
|
||||
}
|
||||
|
||||
def get_model_init_kwargs(self, config) -> Dict[str, Any]:
|
||||
|
||||
def get_model_init_kwargs(self, config) -> dict[str, Any]:
|
||||
"""Get LLaMA-specific model initialization kwargs."""
|
||||
from benchmark_framework import BenchmarkConfig
|
||||
return {
|
||||
"torch_dtype": getattr(torch, config.torch_dtype),
|
||||
"attn_implementation": config.attn_implementation,
|
||||
"use_cache": True,
|
||||
}
|
||||
|
||||
|
||||
def get_default_torch_dtype(self) -> str:
|
||||
"""Get default torch dtype for LLaMA."""
|
||||
return "float16" # LLaMA works well with float16
|
||||
|
||||
|
||||
def get_default_device(self) -> str:
|
||||
"""Get default device for LLaMA."""
|
||||
return "cuda" # LLaMA prefers CUDA
|
||||
@ -91,35 +98,37 @@ class LLaMABenchmark(ModelBenchmark):
|
||||
def run_llama(logger, output_dir, **kwargs):
|
||||
"""
|
||||
Run LLaMA benchmark with the given configuration.
|
||||
|
||||
|
||||
Args:
|
||||
logger: Logger instance
|
||||
output_dir: Output directory for results
|
||||
**kwargs: Additional configuration options
|
||||
|
||||
|
||||
Returns:
|
||||
Path to output file if successful
|
||||
"""
|
||||
from benchmark_framework import BenchmarkRunner
|
||||
|
||||
|
||||
# Extract parameters with defaults
|
||||
model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
|
||||
warmup_iterations = kwargs.get('warmup_iterations', 3)
|
||||
measurement_iterations = kwargs.get('measurement_iterations', 5)
|
||||
num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
|
||||
include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
|
||||
device = kwargs.get('device', 'cuda')
|
||||
torch_dtype = kwargs.get('torch_dtype', 'float16')
|
||||
batch_size = kwargs.get('batch_size', 1)
|
||||
commit_id = kwargs.get('commit_id', None)
|
||||
|
||||
model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
|
||||
warmup_iterations = kwargs.get("warmup_iterations", 3)
|
||||
measurement_iterations = kwargs.get("measurement_iterations", 5)
|
||||
num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
|
||||
include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
|
||||
device = kwargs.get("device", "cuda")
|
||||
torch_dtype = kwargs.get("torch_dtype", "float16")
|
||||
batch_size = kwargs.get("batch_size", 1)
|
||||
commit_id = kwargs.get("commit_id")
|
||||
|
||||
logger.info(f"Starting LLaMA benchmark for model: {model_id}")
|
||||
logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")
|
||||
|
||||
logger.info(
|
||||
f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
|
||||
)
|
||||
|
||||
try:
|
||||
# Create benchmark instance
|
||||
benchmark = LLaMABenchmark(logger)
|
||||
|
||||
|
||||
# Create scenarios
|
||||
scenarios = benchmark.create_scenarios(
|
||||
model_id=model_id,
|
||||
@ -129,28 +138,29 @@ def run_llama(logger, output_dir, **kwargs):
|
||||
include_sdpa_variants=include_sdpa_variants,
|
||||
device=device,
|
||||
torch_dtype=torch_dtype,
|
||||
batch_size=batch_size
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
|
||||
logger.info(f"Created {len(scenarios)} benchmark scenarios")
|
||||
|
||||
|
||||
# Create runner and execute benchmarks
|
||||
runner = BenchmarkRunner(logger, output_dir)
|
||||
results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)
|
||||
|
||||
|
||||
if not results:
|
||||
logger.warning("No successful benchmark results")
|
||||
return None
|
||||
|
||||
|
||||
# Save results
|
||||
model_name = model_id.split('/')[-1] # Extract model name from ID
|
||||
model_name = model_id.split("/")[-1] # Extract model name from ID
|
||||
output_file = runner.save_results(model_name, results)
|
||||
|
||||
|
||||
logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
|
||||
return output_file
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LLaMA benchmark failed: {e}")
|
||||
import traceback
|
||||
|
||||
logger.debug(traceback.format_exc())
|
||||
raise
|
||||
raise
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -14,350 +14,304 @@
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Top-level benchmarking script that automatically discovers and runs all benchmarks
|
||||
Top-level benchmarking script that automatically discovers and runs all benchmarks
|
||||
in the ./benches directory, organizing outputs into model-specific subfolders.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import importlib.util
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
|
||||
def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
|
||||
"""Setup logging configuration."""
|
||||
numeric_level = getattr(logging, log_level.upper(), None)
|
||||
if not isinstance(numeric_level, int):
|
||||
raise ValueError(f'Invalid log level: {log_level}')
|
||||
|
||||
raise ValueError(f"Invalid log level: {log_level}")
|
||||
|
||||
handlers = [logging.StreamHandler(sys.stdout)]
|
||||
|
||||
|
||||
if enable_file_logging:
|
||||
handlers.append(
|
||||
logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
|
||||
)
|
||||
|
||||
handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))
|
||||
|
||||
logging.basicConfig(
|
||||
level=numeric_level,
|
||||
format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
|
||||
handlers=handlers
|
||||
level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
|
||||
)
|
||||
|
||||
|
||||
return logging.getLogger(__name__)
|
||||
|
||||
|
||||
def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
|
||||
def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Discover all benchmark modules in the benches directory.
|
||||
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing benchmark module info
|
||||
"""
|
||||
benchmarks = []
|
||||
benches_path = Path(benches_dir)
|
||||
|
||||
|
||||
if not benches_path.exists():
|
||||
raise FileNotFoundError(f"Benches directory not found: {benches_dir}")
|
||||
|
||||
|
||||
for py_file in benches_path.glob("*.py"):
|
||||
if py_file.name.startswith("__"):
|
||||
continue
|
||||
|
||||
|
||||
module_name = py_file.stem
|
||||
|
||||
|
||||
try:
|
||||
# Import the module
|
||||
spec = importlib.util.spec_from_file_location(module_name, py_file)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
|
||||
|
||||
# Check if it has a benchmark runner function
|
||||
if hasattr(module, f'run_{module_name}'):
|
||||
benchmarks.append({
|
||||
'name': module_name,
|
||||
'path': str(py_file),
|
||||
'module': module,
|
||||
'runner_function': getattr(module, f'run_{module_name}')
|
||||
})
|
||||
elif hasattr(module, 'run_benchmark'):
|
||||
benchmarks.append({
|
||||
'name': module_name,
|
||||
'path': str(py_file),
|
||||
'module': module,
|
||||
'runner_function': getattr(module, 'run_benchmark')
|
||||
})
|
||||
if hasattr(module, f"run_{module_name}"):
|
||||
benchmarks.append(
|
||||
{
|
||||
"name": module_name,
|
||||
"path": str(py_file),
|
||||
"module": module,
|
||||
"runner_function": getattr(module, f"run_{module_name}"),
|
||||
}
|
||||
)
|
||||
elif hasattr(module, "run_benchmark"):
|
||||
benchmarks.append(
|
||||
{
|
||||
"name": module_name,
|
||||
"path": str(py_file),
|
||||
"module": module,
|
||||
"runner_function": getattr(module, "run_benchmark"),
|
||||
}
|
||||
)
|
||||
else:
|
||||
logging.warning(f"No runner function found in {py_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to import {py_file}: {e}")
|
||||
|
||||
|
||||
return benchmarks
|
||||
|
||||
|
||||
def run_single_benchmark(
|
||||
benchmark_info: Dict[str, Any],
|
||||
output_dir: str,
|
||||
logger: logging.Logger,
|
||||
**kwargs
|
||||
benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Run a single benchmark and return the output file path.
|
||||
|
||||
|
||||
Args:
|
||||
benchmark_info: Dictionary containing benchmark module info
|
||||
output_dir: Base output directory
|
||||
logger: Logger instance
|
||||
**kwargs: Additional arguments to pass to the benchmark
|
||||
|
||||
|
||||
Returns:
|
||||
Path to the output file if successful, None otherwise
|
||||
"""
|
||||
benchmark_name = benchmark_info['name']
|
||||
runner_func = benchmark_info['runner_function']
|
||||
|
||||
benchmark_name = benchmark_info["name"]
|
||||
runner_func = benchmark_info["runner_function"]
|
||||
|
||||
logger.info(f"Running benchmark: {benchmark_name}")
|
||||
|
||||
|
||||
try:
|
||||
# Check function signature to determine what arguments to pass
|
||||
import inspect
|
||||
|
||||
sig = inspect.signature(runner_func)
|
||||
|
||||
|
||||
# Prepare arguments based on function signature
|
||||
func_kwargs = {
|
||||
'logger': logger,
|
||||
'output_dir': output_dir
|
||||
}
|
||||
|
||||
func_kwargs = {"logger": logger, "output_dir": output_dir}
|
||||
|
||||
# Add other kwargs if the function accepts them
|
||||
for param_name in sig.parameters:
|
||||
if param_name in kwargs:
|
||||
func_kwargs[param_name] = kwargs[param_name]
|
||||
|
||||
|
||||
# Filter kwargs to only include parameters the function accepts
|
||||
# If function has **kwargs, include all provided kwargs
|
||||
has_var_kwargs = any(param.kind == param.VAR_KEYWORD for param in sig.parameters.values())
|
||||
if has_var_kwargs:
|
||||
valid_kwargs = {**func_kwargs, **kwargs}
|
||||
else:
|
||||
valid_kwargs = {k: v for k, v in func_kwargs.items()
|
||||
if k in sig.parameters}
|
||||
|
||||
valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}
|
||||
|
||||
# Run the benchmark
|
||||
result = runner_func(**valid_kwargs)
|
||||
|
||||
|
||||
if isinstance(result, str):
|
||||
# Function returned a file path
|
||||
return result
|
||||
else:
|
||||
logger.info(f"Benchmark {benchmark_name} completed successfully")
|
||||
return "completed"
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Benchmark {benchmark_name} failed: {e}")
|
||||
import traceback
|
||||
|
||||
logger.debug(traceback.format_exc())
|
||||
return None
|
||||
|
||||
|
||||
def generate_summary_report(
|
||||
output_dir: str,
|
||||
benchmark_results: Dict[str, Any],
|
||||
logger: logging.Logger
|
||||
) -> str:
|
||||
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
|
||||
"""Generate a summary report of all benchmark runs."""
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
|
||||
|
||||
|
||||
summary_data = {
|
||||
"run_metadata": {
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"total_benchmarks": len(benchmark_results),
|
||||
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
|
||||
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
|
||||
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
|
||||
},
|
||||
"benchmark_results": benchmark_results,
|
||||
"output_directory": output_dir
|
||||
"output_directory": output_dir,
|
||||
}
|
||||
|
||||
with open(summary_file, 'w') as f:
|
||||
|
||||
with open(summary_file, "w") as f:
|
||||
json.dump(summary_data, f, indent=2, default=str)
|
||||
|
||||
|
||||
logger.info(f"Summary report saved to: {summary_file}")
|
||||
return summary_file
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for the benchmarking script."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run all benchmarks in the ./benches directory"
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
|
||||
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="benchmark_results",
|
||||
help="Base output directory for benchmark results (default: benchmark_results)"
|
||||
help="Base output directory for benchmark results (default: benchmark_results)",
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument(
|
||||
"--benches-dir",
|
||||
type=str,
|
||||
default="./benches",
|
||||
help="Directory containing benchmark implementations (default: ./benches)"
|
||||
help="Directory containing benchmark implementations (default: ./benches)",
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument(
|
||||
"--log-level",
|
||||
type=str,
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
default="INFO",
|
||||
help="Logging level (default: INFO)"
|
||||
help="Logging level (default: INFO)",
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
|
||||
|
||||
parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")
|
||||
|
||||
parser.add_argument(
|
||||
"--model-id",
|
||||
type=str,
|
||||
help="Specific model ID to benchmark (if supported by benchmarks)"
|
||||
"--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup-iterations",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of warmup iterations (default: 3)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--measurement-iterations",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of measurement iterations (default: 5)"
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument(
|
||||
"--num-tokens-to-generate",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of tokens to generate in benchmarks (default: 100)"
|
||||
help="Number of tokens to generate in benchmarks (default: 100)",
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")
|
||||
|
||||
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
|
||||
|
||||
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
|
||||
|
||||
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
|
||||
|
||||
parser.add_argument(
|
||||
"--include",
|
||||
type=str,
|
||||
nargs="*",
|
||||
help="Only run benchmarks matching these names"
|
||||
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--exclude",
|
||||
type=str,
|
||||
nargs="*",
|
||||
help="Exclude benchmarks matching these names"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable-mock",
|
||||
action="store_true",
|
||||
help="Enable mock benchmark (skipped by default)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable-file-logging",
|
||||
action="store_true",
|
||||
help="Enable file logging (disabled by default)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--commit-id",
|
||||
type=str,
|
||||
help="Git commit ID for metadata (if not provided, will auto-detect from git)"
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
# Setup logging
|
||||
logger = setup_logging(args.log_level, args.enable_file_logging)
|
||||
|
||||
|
||||
logger.info("Starting benchmark discovery and execution")
|
||||
logger.info(f"Output directory: {args.output_dir}")
|
||||
logger.info(f"Benches directory: {args.benches_dir}")
|
||||
|
||||
|
||||
# Create output directory
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
|
||||
try:
|
||||
# Discover benchmarks
|
||||
benchmarks = discover_benchmarks(args.benches_dir)
|
||||
logger.info(f"Discovered {len(benchmarks)} benchmark(s): {[b['name'] for b in benchmarks]}")
|
||||
|
||||
|
||||
if not benchmarks:
|
||||
logger.warning("No benchmarks found!")
|
||||
return 1
|
||||
|
||||
|
||||
# Filter benchmarks based on include/exclude
|
||||
filtered_benchmarks = benchmarks
|
||||
|
||||
|
||||
if args.include:
|
||||
filtered_benchmarks = [b for b in filtered_benchmarks
|
||||
if any(pattern in b['name'] for pattern in args.include)]
|
||||
filtered_benchmarks = [
|
||||
b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
|
||||
]
|
||||
logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")
|
||||
|
||||
|
||||
if args.exclude:
|
||||
filtered_benchmarks = [b for b in filtered_benchmarks
|
||||
if not any(pattern in b['name'] for pattern in args.exclude)]
|
||||
filtered_benchmarks = [
|
||||
b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
|
||||
]
|
||||
logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")
|
||||
|
||||
|
||||
if not filtered_benchmarks:
|
||||
logger.warning("No benchmarks remaining after filtering!")
|
||||
return 1
|
||||
|
||||
|
||||
# Prepare common kwargs for benchmarks
|
||||
benchmark_kwargs = {
|
||||
'warmup_iterations': args.warmup_iterations,
|
||||
'measurement_iterations': args.measurement_iterations,
|
||||
'num_tokens_to_generate': args.num_tokens_to_generate
|
||||
"warmup_iterations": args.warmup_iterations,
|
||||
"measurement_iterations": args.measurement_iterations,
|
||||
"num_tokens_to_generate": args.num_tokens_to_generate,
|
||||
}
|
||||
|
||||
|
||||
if args.model_id:
|
||||
benchmark_kwargs['model_id'] = args.model_id
|
||||
|
||||
benchmark_kwargs["model_id"] = args.model_id
|
||||
|
||||
# Add enable_mock flag for mock benchmark
|
||||
benchmark_kwargs['enable_mock'] = args.enable_mock
|
||||
|
||||
benchmark_kwargs["enable_mock"] = args.enable_mock
|
||||
|
||||
# Add commit_id if provided
|
||||
if args.commit_id:
|
||||
benchmark_kwargs['commit_id'] = args.commit_id
|
||||
|
||||
benchmark_kwargs["commit_id"] = args.commit_id
|
||||
|
||||
# Run benchmarks
|
||||
benchmark_results = {}
|
||||
successful_count = 0
|
||||
|
||||
|
||||
for benchmark_info in filtered_benchmarks:
|
||||
result = run_single_benchmark(
|
||||
benchmark_info,
|
||||
args.output_dir,
|
||||
logger,
|
||||
**benchmark_kwargs
|
||||
)
|
||||
|
||||
benchmark_results[benchmark_info['name']] = result
|
||||
|
||||
result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)
|
||||
|
||||
benchmark_results[benchmark_info["name"]] = result
|
||||
|
||||
if result is not None:
|
||||
successful_count += 1
|
||||
|
||||
|
||||
# Generate summary report
|
||||
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)
|
||||
|
||||
|
||||
# Final summary
|
||||
total_benchmarks = len(filtered_benchmarks)
|
||||
failed_count = total_benchmarks - successful_count
|
||||
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("BENCHMARK RUN SUMMARY")
|
||||
logger.info("=" * 60)
|
||||
@ -366,20 +320,21 @@ def main():
|
||||
logger.info(f"Failed: {failed_count}")
|
||||
logger.info(f"Output directory: {args.output_dir}")
|
||||
logger.info(f"Summary report: {summary_file}")
|
||||
|
||||
|
||||
if failed_count > 0:
|
||||
logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
|
||||
return 1
|
||||
else:
|
||||
logger.info("All benchmarks completed successfully!")
|
||||
return 0
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Benchmark run failed: {e}")
|
||||
import traceback
|
||||
|
||||
logger.debug(traceback.format_exc())
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
sys.exit(main())
|
||||
|
@ -4,8 +4,8 @@ import datasets
|
||||
|
||||
import transformers
|
||||
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
|
||||
from transformers.utils import logging
|
||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
logging.set_verbosity_info()
|
||||
@ -22,7 +22,9 @@ imperfect = 0
|
||||
wrong = 0
|
||||
|
||||
|
||||
def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
|
||||
def check_diff(
|
||||
spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
|
||||
) -> bool:
|
||||
if spm_diff == list(reversed(tok_diff)):
|
||||
# AAA -> AA+A vs A+AA case.
|
||||
return True
|
||||
@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
|
||||
def check_details(
|
||||
line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
|
||||
) -> bool:
|
||||
# Encoding can be the same with same result AAA -> A + AA vs AA + A
|
||||
# We can check that we use at least exactly the same number of tokens.
|
||||
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
|
||||
@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
|
||||
if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
|
||||
]
|
||||
for j in possible_matches:
|
||||
if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
|
||||
if check_diff(
|
||||
spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
|
||||
) and check_details(
|
||||
line,
|
||||
spm_ids[first + i : last],
|
||||
tok_ids[first + j : last],
|
||||
@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
|
||||
if skip_assert:
|
||||
return
|
||||
|
||||
assert (
|
||||
slow_ids == fast_ids
|
||||
), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
|
||||
assert slow_ids == fast_ids, (
|
||||
f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
|
||||
)
|
||||
|
||||
|
||||
def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:
|
||||
|
@ -15,6 +15,7 @@
|
||||
Script to close stale issue. Taken in part from the AllenNLP repository.
|
||||
https://github.com/allenai/allennlp.
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime as dt
|
||||
|
||||
@ -39,10 +40,11 @@ def main():
|
||||
|
||||
for i, issue in enumerate(open_issues):
|
||||
print(i, issue)
|
||||
comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
|
||||
comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
|
||||
last_comment = comments[0] if len(comments) > 0 else None
|
||||
if (
|
||||
last_comment is not None and last_comment.user.login == "github-actions[bot]"
|
||||
last_comment is not None
|
||||
and last_comment.user.login == "github-actions[bot]"
|
||||
and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
|
||||
and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
|
||||
and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
|
||||
|
Reference in New Issue
Block a user