Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-21 01:23:56 +08:00)
Enable ruff on benchmark and scripts (#40634)
* Enable ruff on benchmark and scripts

Signed-off-by: cyy <cyyever@outlook.com>

* Cover benchmark_v2

Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>

* correct

* style

* style

---------

Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
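The functional change is one Makefile line: check_dirs now also lists scripts, benchmark, and benchmark_v2, and the remaining hunks are the ruff reformatting those directories now have to satisfy. As a rough sketch of how widening check_dirs takes effect, the repository's lint targets pass that variable to ruff along the lines of the following (the target names and recipes shown here are an illustrative assumption, not part of this commit):

# Illustrative sketch only: assumed target names and recipes; the real Makefile targets include additional checks.
quality:
	ruff check $(check_dirs)
	ruff format --check $(check_dirs)

style:
	ruff check $(check_dirs) --fix
	ruff format $(check_dirs)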
Makefile (2 changed lines):

@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src
 
-check_dirs := examples tests src utils
+check_dirs := examples tests src utils scripts benchmark benchmark_v2
 
 exclude_folders := ""
 
@@ -11,25 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from logging import Logger
 import os
+import sys
+from logging import Logger
 from threading import Event, Thread
 from time import perf_counter, sleep
 from typing import Optional
-import sys
 
 # Add the parent directory to Python path to import benchmarks_entrypoint
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from benchmarks_entrypoint import MetricsRecorder
 
 import gpustat
 import psutil
 import psycopg2
+from benchmarks_entrypoint import MetricsRecorder
 
 
 # Optional heavy ML dependencies - only required when actually running the benchmark
 try:
     import torch
 
     from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
 
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False

@@ -63,7 +66,13 @@ def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
 
 
 def run_benchmark(
-    logger: Logger, repository: str, branch: str, commit_id: str, commit_msg: str, metrics_recorder=None, num_tokens_to_generate=100
+    logger: Logger,
+    repository: str,
+    branch: str,
+    commit_id: str,
+    commit_msg: str,
+    metrics_recorder=None,
+    num_tokens_to_generate=100,
 ):
     # Check if required ML dependencies are available
     if not TRANSFORMERS_AVAILABLE:

@@ -154,7 +163,7 @@ def run_benchmark(
     # First eager forward pass
     logger.info("running first eager forward pass")
     start = perf_counter()
-    outputs = model(**inputs)
+    _ = model(**inputs)
     torch.cuda.synchronize()
     end = perf_counter()
     first_eager_fwd_pass_time = end - start

@@ -163,7 +172,7 @@ def run_benchmark(
     # Second eager forward pass (should be faster)
     logger.info("running second eager forward pass")
     start = perf_counter()
-    outputs = model(**inputs)
+    _ = model(**inputs)
     torch.cuda.synchronize()
     end = perf_counter()
     second_eager_fwd_pass_time = end - start
@@ -31,9 +31,7 @@ from contextlib import contextmanager
 from pathlib import Path
 
 from git import Repo
-
 from huggingface_hub import HfApi
-
 from optimum_benchmark import Benchmark
 from optimum_benchmark_wrapper import main
 
@@ -13,19 +13,20 @@
 # limitations under the License.
 import argparse
 import importlib.util
+import json
 import logging
 import os
 import sys
-import json
 import uuid
 from datetime import datetime
-from typing import Dict, Tuple, Optional, List
 
 import pandas as pd
 
 
 try:
     from psycopg2.extensions import register_adapter
     from psycopg2.extras import Json
 
     register_adapter(dict, Json)
     PSYCOPG2_AVAILABLE = True
 except ImportError:

@@ -38,8 +39,14 @@ class ImportModuleException(Exception):
 
 class MetricsRecorder:
     def __init__(
-        self, connection, logger: logging.Logger, repository: str, branch: str, commit_id: str, commit_msg: str,
-        collect_csv_data: bool = True
+        self,
+        connection,
+        logger: logging.Logger,
+        repository: str,
+        branch: str,
+        commit_id: str,
+        commit_msg: str,
+        collect_csv_data: bool = True,
     ):
         self.conn = connection
         self.use_database = connection is not None

@@ -55,23 +62,39 @@ class MetricsRecorder:
         # For CSV export - store all data in pandas DataFrames (only if CSV collection is enabled)
         if self.collect_csv_data:
             # Initialize empty DataFrames with proper schemas
-            self.benchmarks_df = pd.DataFrame(columns=[
-                'benchmark_id', 'repository', 'branch', 'commit_id', 'commit_message',
-                'metadata', 'created_at'
-            ])
-            self.device_measurements_df = pd.DataFrame(columns=[
-                'benchmark_id', 'cpu_util', 'mem_megabytes', 'gpu_util',
-                'gpu_mem_megabytes', 'time'
-            ])
-            self.model_measurements_df = pd.DataFrame(columns=[
-                'benchmark_id', 'time', 'model_load_time', 'first_eager_forward_pass_time_secs',
-                'second_eager_forward_pass_time_secs', 'first_eager_generate_time_secs',
-                'second_eager_generate_time_secs', 'time_to_first_token_secs',
-                'time_to_second_token_secs', 'time_to_third_token_secs',
-                'time_to_next_token_mean_secs', 'first_compile_generate_time_secs',
-                'second_compile_generate_time_secs', 'third_compile_generate_time_secs',
-                'fourth_compile_generate_time_secs'
-            ])
+            self.benchmarks_df = pd.DataFrame(
+                columns=[
+                    "benchmark_id",
+                    "repository",
+                    "branch",
+                    "commit_id",
+                    "commit_message",
+                    "metadata",
+                    "created_at",
+                ]
+            )
+            self.device_measurements_df = pd.DataFrame(
+                columns=["benchmark_id", "cpu_util", "mem_megabytes", "gpu_util", "gpu_mem_megabytes", "time"]
+            )
+            self.model_measurements_df = pd.DataFrame(
+                columns=[
+                    "benchmark_id",
+                    "time",
+                    "model_load_time",
+                    "first_eager_forward_pass_time_secs",
+                    "second_eager_forward_pass_time_secs",
+                    "first_eager_generate_time_secs",
+                    "second_eager_generate_time_secs",
+                    "time_to_first_token_secs",
+                    "time_to_second_token_secs",
+                    "time_to_third_token_secs",
+                    "time_to_next_token_mean_secs",
+                    "first_compile_generate_time_secs",
+                    "second_compile_generate_time_secs",
+                    "third_compile_generate_time_secs",
+                    "fourth_compile_generate_time_secs",
+                ]
+            )
         else:
             self.benchmarks_df = None
             self.device_measurements_df = None
@@ -95,15 +118,19 @@ class MetricsRecorder:
         # Store benchmark data for CSV export (if enabled)
         if self.collect_csv_data:
             # Add row to pandas DataFrame
-            new_row = pd.DataFrame([{
-                'benchmark_id': benchmark_id,
-                'repository': self.repository,
-                'branch': self.branch,
-                'commit_id': self.commit_id,
-                'commit_message': self.commit_msg,
-                'metadata': json.dumps(metadata),
-                'created_at': datetime.utcnow().isoformat()
-            }])
+            new_row = pd.DataFrame(
+                [
+                    {
+                        "benchmark_id": benchmark_id,
+                        "repository": self.repository,
+                        "branch": self.branch,
+                        "commit_id": self.commit_id,
+                        "commit_message": self.commit_msg,
+                        "metadata": json.dumps(metadata),
+                        "created_at": datetime.utcnow().isoformat(),
+                    }
+                ]
+            )
             self.benchmarks_df = pd.concat([self.benchmarks_df, new_row], ignore_index=True)
 
         mode_info = []

@@ -123,14 +150,18 @@ class MetricsRecorder:
         # Store device measurements for CSV export (if enabled)
         if self.collect_csv_data:
             # Add row to pandas DataFrame
-            new_row = pd.DataFrame([{
-                'benchmark_id': benchmark_id,
-                'cpu_util': cpu_util,
-                'mem_megabytes': mem_megabytes,
-                'gpu_util': gpu_util,
-                'gpu_mem_megabytes': gpu_mem_megabytes,
-                'time': datetime.utcnow().isoformat()
-            }])
+            new_row = pd.DataFrame(
+                [
+                    {
+                        "benchmark_id": benchmark_id,
+                        "cpu_util": cpu_util,
+                        "mem_megabytes": mem_megabytes,
+                        "gpu_util": gpu_util,
+                        "gpu_mem_megabytes": gpu_mem_megabytes,
+                        "time": datetime.utcnow().isoformat(),
+                    }
+                ]
+            )
             self.device_measurements_df = pd.concat([self.device_measurements_df, new_row], ignore_index=True)
 
         # Store in database if available

@@ -149,10 +180,7 @@ class MetricsRecorder:
         # Store model measurements for CSV export (if enabled)
         if self.collect_csv_data:
             # Add row to pandas DataFrame with flattened measurements
-            row_data = {
-                'benchmark_id': benchmark_id,
-                'time': datetime.utcnow().isoformat()
-            }
+            row_data = {"benchmark_id": benchmark_id, "time": datetime.utcnow().isoformat()}
             # Flatten the measurements dict into the row
             row_data.update(measurements)
 

@@ -241,28 +269,34 @@ class MetricsRecorder:
         # Add model measurements (join on benchmark_id)
         if len(self.model_measurements_df) > 0:
             # Drop 'time' column from model measurements to avoid conflicts
-            model_df = self.model_measurements_df.drop(columns=['time'], errors='ignore')
-            summary_df = summary_df.merge(model_df, on='benchmark_id', how='left')
+            model_df = self.model_measurements_df.drop(columns=["time"], errors="ignore")
+            summary_df = summary_df.merge(model_df, on="benchmark_id", how="left")
 
         # Calculate device measurement aggregates using pandas groupby
         if len(self.device_measurements_df) > 0:
-            device_agg = self.device_measurements_df.groupby('benchmark_id').agg({
-                'cpu_util': ['mean', 'max', 'std', 'count'],
-                'mem_megabytes': ['mean', 'max', 'std'],
-                'gpu_util': ['mean', 'max', 'std'],
-                'gpu_mem_megabytes': ['mean', 'max', 'std']
-            }).round(3)
+            device_agg = (
+                self.device_measurements_df.groupby("benchmark_id")
+                .agg(
+                    {
+                        "cpu_util": ["mean", "max", "std", "count"],
+                        "mem_megabytes": ["mean", "max", "std"],
+                        "gpu_util": ["mean", "max", "std"],
+                        "gpu_mem_megabytes": ["mean", "max", "std"],
+                    }
+                )
+                .round(3)
+            )
 
             # Flatten column names
             device_agg.columns = [f"{col[0]}_{col[1]}" for col in device_agg.columns]
             device_agg = device_agg.reset_index()
 
             # Rename count column to be more descriptive
-            if 'cpu_util_count' in device_agg.columns:
-                device_agg = device_agg.rename(columns={'cpu_util_count': 'device_measurement_count'})
+            if "cpu_util_count" in device_agg.columns:
+                device_agg = device_agg.rename(columns={"cpu_util_count": "device_measurement_count"})
 
             # Merge with summary
-            summary_df = summary_df.merge(device_agg, on='benchmark_id', how='left')
+            summary_df = summary_df.merge(device_agg, on="benchmark_id", how="left")
 
         # Export the comprehensive summary
         summary_df.to_csv(summary_file, index=False)
@@ -313,18 +347,13 @@ def parse_arguments() -> tuple[str, str, str, str, bool, str]:
         help="The commit message associated with the commit, truncated to 70 characters.",
     )
 
-    parser.add_argument(
-        "--csv",
-        action="store_true",
-        default=False,
-        help="Enable CSV output files generation."
-    )
+    parser.add_argument("--csv", action="store_true", default=False, help="Enable CSV output files generation.")
 
     parser.add_argument(
         "--csv-output-dir",
         type=str,
         default="benchmark_results",
-        help="Directory for CSV output files (default: benchmark_results)."
+        help="Directory for CSV output files (default: benchmark_results).",
    )
 
     args = parser.parse_args()

@@ -356,6 +385,7 @@ def create_database_connection():
 
     try:
         import psycopg2
+
         conn = psycopg2.connect("dbname=metrics")
         logger.info("Successfully connected to database")
         return conn

@@ -364,8 +394,9 @@ def create_database_connection():
         return None
 
 
-def create_global_metrics_recorder(repository: str, branch: str, commit_id: str, commit_msg: str,
-                                   generate_csv: bool = False) -> MetricsRecorder:
+def create_global_metrics_recorder(
+    repository: str, branch: str, commit_id: str, commit_msg: str, generate_csv: bool = False
+) -> MetricsRecorder:
     """
     Create a global metrics recorder that will be used across all benchmarks.
     """

@@ -415,7 +446,7 @@ if __name__ == "__main__":
        try:
            logger.debug(f"checking if benches/{entry.name} has run_benchmark function")
            module = import_from_path(entry.name.split(".")[0], entry.path)
-            if hasattr(module, 'run_benchmark'):
+            if hasattr(module, "run_benchmark"):
                benchmark_modules.append(entry.name)
                logger.debug(f"discovered benchmark: {entry.name}")
            else:

@@ -443,7 +474,9 @@ if __name__ == "__main__":
            module.run_benchmark(logger, repository, branch, commit_id, commit_msg, global_metrics_recorder)
        except TypeError:
            # Fall back to the old signature for backward compatibility
-            logger.warning(f"Module {module_name} using old run_benchmark signature - database connection will be created per module")
+            logger.warning(
+                f"Module {module_name} using old run_benchmark signature - database connection will be created per module"
+            )
            module.run_benchmark(logger, repository, branch, commit_id, commit_msg)
 
        successful_benchmarks += 1
@@ -3,7 +3,11 @@ import subprocess
 
 
 def main(config_dir, config_name, args):
-    subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args)
+    subprocess.run(
+        ["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"]
+        + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"]
+        + args
+    )
 
 
 if __name__ == "__main__":
@@ -12,18 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import logging
-from typing import Dict, Any, List
+import os
+from typing import Any
 
-from benchmark_framework import ModelBenchmark
 import torch
+
+from benchmark_framework import ModelBenchmark
 
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "1"
 torch.set_float32_matmul_precision("high")
 
 
 class LLaMABenchmark(ModelBenchmark):
     """Simplified LLaMA model benchmark implementation using the ModelBenchmark base class."""
 

@@ -31,9 +32,7 @@ class LLaMABenchmark(ModelBenchmark):
         super().__init__(logger)
         self._default_prompt = "Why dogs are so cute?"  # Custom prompt for LLaMA
 
-
-    def get_scenario_configs(self) -> List[Dict[str, Any]]:
+    def get_scenario_configs(self) -> list[dict[str, Any]]:
         """
         Get LLaMA-specific scenario configurations.
 

@@ -43,24 +42,33 @@ class LLaMABenchmark(ModelBenchmark):
         return [
             # Eager variants
             {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
 
             # Compiled variants
-            {"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
+            {
+                "variant": "compiled",
+                "compile_mode": "max-autotune",
+                "use_cache": True,
+                "description": "Compiled with max autotune",
+            },
             # Kernelized variant (if available)
-            {"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
+            {
+                "variant": "kernelized",
+                "compile_mode": "max-autotune",
+                "use_cache": True,
+                "description": "Kernelized execution",
+            },
         ]
 
     def _is_kernelization_available(self) -> bool:
         """Check if kernelization is available for LLaMA."""
         try:
-            from kernels import Mode, kernelize
+            from kernels import Mode, kernelize  # noqa: F401
 
             return True
         except ImportError:
             self.logger.debug("Kernelization not available: kernels module not found")
             return False
 
-    def get_default_generation_config(self) -> Dict[str, Any]:
+    def get_default_generation_config(self) -> dict[str, Any]:
         """Get LLaMA-specific generation configuration."""
         return {
             "do_sample": False,

@@ -70,9 +78,8 @@ class LLaMABenchmark(ModelBenchmark):
             "max_new_tokens": None,  # Will be set per scenario
         }
 
-    def get_model_init_kwargs(self, config) -> Dict[str, Any]:
+    def get_model_init_kwargs(self, config) -> dict[str, Any]:
         """Get LLaMA-specific model initialization kwargs."""
-        from benchmark_framework import BenchmarkConfig
         return {
             "torch_dtype": getattr(torch, config.torch_dtype),
             "attn_implementation": config.attn_implementation,

@@ -103,18 +110,20 @@ def run_llama(logger, output_dir, **kwargs):
     from benchmark_framework import BenchmarkRunner
 
     # Extract parameters with defaults
-    model_id = kwargs.get('model_id', 'meta-llama/Llama-2-7b-hf')
-    warmup_iterations = kwargs.get('warmup_iterations', 3)
-    measurement_iterations = kwargs.get('measurement_iterations', 5)
-    num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
-    include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
-    device = kwargs.get('device', 'cuda')
-    torch_dtype = kwargs.get('torch_dtype', 'float16')
-    batch_size = kwargs.get('batch_size', 1)
-    commit_id = kwargs.get('commit_id', None)
+    model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
+    warmup_iterations = kwargs.get("warmup_iterations", 3)
+    measurement_iterations = kwargs.get("measurement_iterations", 5)
+    num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
+    include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
+    device = kwargs.get("device", "cuda")
+    torch_dtype = kwargs.get("torch_dtype", "float16")
+    batch_size = kwargs.get("batch_size", 1)
+    commit_id = kwargs.get("commit_id")
 
     logger.info(f"Starting LLaMA benchmark for model: {model_id}")
-    logger.info(f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}")
+    logger.info(
+        f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
+    )
 
     try:
         # Create benchmark instance

@@ -129,7 +138,7 @@ def run_llama(logger, output_dir, **kwargs):
             include_sdpa_variants=include_sdpa_variants,
             device=device,
             torch_dtype=torch_dtype,
-            batch_size=batch_size
+            batch_size=batch_size,
         )
 
         logger.info(f"Created {len(scenarios)} benchmark scenarios")

@@ -143,7 +152,7 @@ def run_llama(logger, output_dir, **kwargs):
             return None
 
         # Save results
-        model_name = model_id.split('/')[-1]  # Extract model name from ID
+        model_name = model_id.split("/")[-1]  # Extract model name from ID
         output_file = runner.save_results(model_name, results)
 
         logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")

@@ -152,5 +161,6 @@ def run_llama(logger, output_dir, **kwargs):
     except Exception as e:
         logger.error(f"LLaMA benchmark failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         raise
@@ -14,28 +14,26 @@
 
 import gc
 import json
-import os
-import subprocess
-import sys
-import time
-import statistics
-import threading
-from abc import ABC, abstractmethod
-from contextlib import nullcontext
-from dataclasses import dataclass, field, asdict
-from datetime import datetime
-from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
 import logging
+import os
+import statistics
+import sys
+import threading
+import time
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from typing import Any, Optional, TypedDict, Union
 
+import gpustat
 import numpy as np
 import psutil
-import gpustat
 
 import torch
 
 
 class GPUMetrics(TypedDict):
     """GPU monitoring result with GPU metrics."""
 
     gpu_utilization_mean: float
     gpu_utilization_max: float
     gpu_utilization_min: float

@@ -48,6 +46,7 @@ class GPUMetrics(TypedDict):
 
 class NoGPU(TypedDict):
     """GPU monitoring result without GPU metrics."""
+
     gpu_monitoring_status: str
     gpu_monitoring_reason: str
 

@@ -134,6 +133,7 @@ class ArchAwareTimer:
 @dataclass
 class BenchmarkConfig:
     """Configuration for a single benchmark scenario."""
+
     name: str
     model_id: str
     variant: str = "eager"  # "eager", "compiled", "kernelized"

@@ -143,13 +143,13 @@ class BenchmarkConfig:
     device: str = "cuda"
     torch_dtype: str = "float16"
     compile_mode: Optional[str] = None  # None, "default", "reduce-overhead", "max-autotune"
-    compile_options: Dict[str, Any] = field(default_factory=dict)
+    compile_options: dict[str, Any] = field(default_factory=dict)
     use_cache: bool = True
     batch_size: int = 1
     sequence_length: Optional[int] = None
     attn_implementation: str = "sdpa"  # "eager", "sdpa", "flash_attention_2"
     sdpa_backend: Optional[str] = None  # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
-    custom_params: Dict[str, Any] = field(default_factory=dict)
+    custom_params: dict[str, Any] = field(default_factory=dict)
 
 
 class BenchmarkScenario:

@@ -195,24 +195,24 @@ class BenchmarkScenario:
         return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"
 
 
 
 
 @dataclass
 class TimingResult:
     """Result from a timing measurement."""
 
     time_to_first_token_seconds: Optional[float] = None
     latency_seconds: float = 0.0
     tokens_per_second: Optional[float] = None
     time_per_output_token_seconds: Optional[float] = None
     total_tokens_generated: int = 0
-    metadata: Dict[str, Any] = field(default_factory=dict)
+    metadata: dict[str, Any] = field(default_factory=dict)
 
 
 @dataclass
 class BenchmarkStatistics:
     """Statistical analysis of benchmark measurements."""
 
     name: str
-    measurements: List[float]
+    measurements: list[float]
     mean: float
     median: float
     std: float

@@ -226,7 +226,7 @@ class BenchmarkStatistics:
     unit: str = "seconds"
 
     @classmethod
-    def from_measurements(cls, name: str, measurements: List[float], unit: str = "seconds") -> 'BenchmarkStatistics':
+    def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
         """Create statistics from a list of measurements."""
         if not measurements:
             raise ValueError("Cannot create statistics from empty measurements")

@@ -246,13 +246,14 @@ class BenchmarkStatistics:
             p90=float(np.percentile(measurements_array, 90)),
             p95=float(np.percentile(measurements_array, 95)),
             p99=float(np.percentile(measurements_array, 99)),
-            unit=unit
+            unit=unit,
         )
 
 
 @dataclass
 class HardwareInfo:
     """Hardware information collected during benchmarking."""
+
     gpu_name: str
     gpu_memory_total_mb: int
     cpu_count: int

@@ -265,6 +266,7 @@ class HardwareInfo:
 @dataclass
 class BenchmarkMetadata:
     """Metadata collected for each benchmark run."""
+
     timestamp: str
     commit_id: str
     hardware_info: HardwareInfo
@@ -274,7 +276,7 @@ class BenchmarkMetadata:
 class GPUMonitor:
     """Monitor GPU utilization during benchmark execution."""
 
-    def __init__(self, sample_interval: float = 0.1, logger: logging.Logger = None):
+    def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
         self.sample_interval = sample_interval
         self.logger = logger or logging.getLogger(__name__)
         self.stop_event = threading.Event()

@@ -321,10 +323,7 @@ class GPUMonitor:
     def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
         """Stop monitoring and return collected metrics."""
         if not self.gpu_available:
-            return NoGPU(
-                gpu_monitoring_status="disabled",
-                gpu_monitoring_reason="no_gpus_available"
-            )
+            return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")
 
         # Signal the monitoring thread to stop
         self.stop_event.set()

@@ -340,15 +339,12 @@ class GPUMonitor:
                 gpu_memory_used_max=max(self.gpu_memory_used),
                 gpu_memory_used_min=min(self.gpu_memory_used),
                 sample_count=len(self.gpu_utilization),
-                gpu_monitoring_status="success"
+                gpu_monitoring_status="success",
             )
             self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
             return metrics
         else:
-            return NoGPU(
-                gpu_monitoring_status="failed",
-                gpu_monitoring_reason="no_samples_collected"
-            )
+            return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")
 
     def _monitor_loop(self):
         """Background monitoring loop using threading.Event for communication."""

@@ -400,7 +396,7 @@ def get_hardware_info() -> HardwareInfo:
 
     torch_version = torch.__version__
     cuda_version = None
-    if hasattr(torch, 'cuda') and torch.cuda.is_available():
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
         cuda_version = torch.version.cuda
 
     return HardwareInfo(

@@ -410,14 +406,14 @@ def get_hardware_info() -> HardwareInfo:
         memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
         python_version=f"{sys.version.split()[0]}",
         torch_version=torch_version,
-        cuda_version=cuda_version
+        cuda_version=cuda_version,
     )
 
 
 def flush_memory():
     """Flush GPU memory and run garbage collection."""
     gc.collect()
-    if hasattr(torch, 'cuda') and torch.cuda.is_available():
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
         torch.cuda.reset_peak_memory_stats()

@@ -442,13 +438,10 @@ def get_sdpa_backend(backend_name: Optional[str]):
         return None
 
 
-
-
-
 class SDPAContext:
     """Context manager for SDPA kernel selection."""
 
-    def __init__(self, backend_name: Optional[str], logger: logging.Logger = None):
+    def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
         self.backend_name = backend_name
         self.logger = logger or logging.getLogger(__name__)
         self.backend = get_sdpa_backend(backend_name) if backend_name else None

@@ -466,7 +459,9 @@ class SDPAContext:
                 self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
                 self.context = None
         elif self.backend_name and self.logger:
-            self.logger.debug(f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})")
+            self.logger.debug(
+                f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
+            )
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):

@@ -490,7 +485,7 @@ class AbstractModelBenchmark(ABC):
         self.scenarios = {}  # Map of scenario_name -> BenchmarkScenario
 
     @abstractmethod
-    def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
+    def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
         """Create and return a dictionary of benchmark scenarios."""
         pass
 

@@ -518,7 +513,7 @@ class AbstractModelBenchmark(ABC):
         """Prepare inputs for the model. Override if needed."""
         return None
 
-    def get_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
+    def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
         """Get benchmark scenarios. Creates them if they don't exist."""
         if not self.scenarios:
             self.scenarios = self.create_scenarios(**kwargs)

@@ -547,9 +542,7 @@ class ModelBenchmark(AbstractModelBenchmark):
         """Default prompt for text generation. Override in subclasses if needed."""
         return self._default_prompt
 
-
-    def get_attention_configs(self, include_sdpa_variants: bool = True) -> List[Dict[str, Any]]:
+    def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
         """
         Get attention implementation configurations.
 

@@ -565,15 +558,17 @@ class ModelBenchmark(AbstractModelBenchmark):
 
         # Add SDPA variants if requested
         if include_sdpa_variants:
-            attention_configs.append({
-                "attn_implementation": "sdpa",
-                "sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
-                "desc_suffix": ""
-            })
+            attention_configs.append(
+                {
+                    "attn_implementation": "sdpa",
+                    "sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
+                    "desc_suffix": "",
+                }
+            )
 
         return attention_configs
 
-    def get_scenario_configs(self) -> List[Dict[str, Any]]:
+    def get_scenario_configs(self) -> list[dict[str, Any]]:
         """
         Get base scenario configurations. Override in subclasses to customize.
 

@@ -583,36 +578,38 @@ class ModelBenchmark(AbstractModelBenchmark):
         return [
             # Eager variants
             {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
 
             # Compiled variants
-            {"variant": "compiled", "compile_mode": "max-autotune", "use_cache": True, "description": "Compiled with max autotune"},
+            {
+                "variant": "compiled",
+                "compile_mode": "max-autotune",
+                "use_cache": True,
+                "description": "Compiled with max autotune",
+            },
             # Kernelized variant (if available)
-            {"variant": "kernelized", "compile_mode": "max-autotune", "use_cache": True, "description": "Kernelized execution"},
+            {
+                "variant": "kernelized",
+                "compile_mode": "max-autotune",
+                "use_cache": True,
+                "description": "Kernelized execution",
+            },
         ]
 
     def _is_kernelization_available(self) -> bool:
         """Check if kernelization is available. Override in subclasses."""
         try:
-            from kernels import Mode, kernelize
+            from kernels import Mode, kernelize  # noqa: F401
 
             return True
         except ImportError:
             return False
 
-    def get_default_generation_config(self) -> Dict[str, Any]:
+    def get_default_generation_config(self) -> dict[str, Any]:
         """Get default generation configuration. Override in subclasses for model-specific defaults."""
-        return {
-            "do_sample": False,
-            "top_p": 1.0,
-            "temperature": 1.0
-        }
+        return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}
 
-    def get_model_init_kwargs(self, config: BenchmarkConfig) -> Dict[str, Any]:
+    def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
         """Get model initialization kwargs. Override in subclasses for model-specific parameters."""
-        return {
-            "torch_dtype": getattr(torch, config.torch_dtype),
-            "attn_implementation": config.attn_implementation
-        }
+        return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}
 
     def get_default_torch_dtype(self) -> str:
         """Get default torch dtype. Override in subclasses."""
@@ -622,19 +619,19 @@ class ModelBenchmark(AbstractModelBenchmark):
         """Get default device. Override in subclasses."""
         return "cuda"
 
-    def create_scenarios(self, **kwargs) -> Dict[str, 'BenchmarkScenario']:
+    def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
         """Create benchmark scenarios for HuggingFace models."""
         scenarios = {}
 
         # Extract parameters with model-specific defaults
-        model_id = kwargs.get('model_id', 'microsoft/DialoGPT-medium')
-        warmup_iterations = kwargs.get('warmup_iterations', 3)
-        measurement_iterations = kwargs.get('measurement_iterations', 5)
-        num_tokens_to_generate = kwargs.get('num_tokens_to_generate', 100)
-        include_sdpa_variants = kwargs.get('include_sdpa_variants', True)
-        device = kwargs.get('device', self.get_default_device())
-        torch_dtype = kwargs.get('torch_dtype', self.get_default_torch_dtype())
-        batch_size = kwargs.get('batch_size', 1)
+        model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
+        warmup_iterations = kwargs.get("warmup_iterations", 3)
+        measurement_iterations = kwargs.get("measurement_iterations", 5)
+        num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
+        include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
+        device = kwargs.get("device", self.get_default_device())
+        torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
+        batch_size = kwargs.get("batch_size", 1)
 
         # Get configurations
         attention_configs = self.get_attention_configs(include_sdpa_variants)

@@ -654,7 +651,7 @@ class ModelBenchmark(AbstractModelBenchmark):
 
                     # Create unique config for this scenario
                     config = BenchmarkConfig(
-                        name=scenario_config['variant'],
+                        name=scenario_config["variant"],
                         model_id=model_id,
                         variant=scenario_config["variant"],
                         compile_mode=scenario_config["compile_mode"],

@@ -666,7 +663,7 @@ class ModelBenchmark(AbstractModelBenchmark):
                         torch_dtype=torch_dtype,
                         batch_size=batch_size,
                         attn_implementation=attn_implementation,
-                        sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None
+                        sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
                     )
 
                     # Create scenario name

@@ -695,11 +692,7 @@ class ModelBenchmark(AbstractModelBenchmark):
                         description += desc_suffix
 
                     # Create scenario
-                    scenario = BenchmarkScenario(
-                        name=scenario_name,
-                        config=config,
-                        description=description
-                    )
+                    scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)
 
                     # Add setup callbacks based on variant
                     if scenario_config["variant"] == "compiled":

@@ -718,16 +711,12 @@ class ModelBenchmark(AbstractModelBenchmark):
 
         # Perform torch.compile
         if config.compile_mode is not None:
-            self.compiled_model = torch.compile(
-                model,
-                mode=config.compile_mode,
-                **config.compile_options
-            )
+            self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
         else:
             self.compiled_model = torch.compile(model, **config.compile_options)
 
         # Setup static cache for compiled mode if needed
-        if config.use_cache and hasattr(self, 'inputs') and self.inputs is not None:
+        if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
             self._setup_static_cache(config)
 
     def _setup_kernelization_callback(self, model, tokenizer, config, logger):

@@ -737,10 +726,8 @@ class ModelBenchmark(AbstractModelBenchmark):
 
         try:
             from kernels import Mode, kernelize
-            self.compiled_model = kernelize(
-                model,
-                mode=Mode.INFERENCE
-            )
+
+            self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
         except Exception as e:
             if logger:
                 logger.warning(f"Failed to setup kernelized mode: {e}")

@@ -749,13 +736,14 @@ class ModelBenchmark(AbstractModelBenchmark):
 
     def _setup_static_cache(self, config: BenchmarkConfig):
         """Setup static cache for compiled models. Override if needed."""
-        if hasattr(self, 'inputs') and self.inputs is not None:
+        if hasattr(self, "inputs") and self.inputs is not None:
             try:
                 from transformers import StaticCache
+
                 seq_length = self.inputs["input_ids"].shape[1]
 
                 # Get the actual device the model is on
-                if hasattr(self.model, 'device'):
+                if hasattr(self.model, "device"):
                     cache_device = self.model.device
                 else:
                     cache_device = self.device

@@ -765,7 +753,7 @@ class ModelBenchmark(AbstractModelBenchmark):
                     max_batch_size=config.batch_size,
                     max_cache_len=seq_length + config.num_tokens_to_generate,
                     device=cache_device,
-                    dtype=getattr(torch, config.torch_dtype)
+                    dtype=getattr(torch, config.torch_dtype),
                 )
                 self.logger.debug(f"StaticCache created on device: {cache_device}")
             except (ImportError, TypeError) as e:
@ -794,7 +782,6 @@ class ModelBenchmark(AbstractModelBenchmark):
|
|||||||
def _load_model_and_tokenizer(self, config: BenchmarkConfig):
|
def _load_model_and_tokenizer(self, config: BenchmarkConfig):
|
||||||
"""Load the model and tokenizer. Override in subclasses for custom loading."""
|
"""Load the model and tokenizer. Override in subclasses for custom loading."""
|
||||||
|
|
||||||
|
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
||||||
|
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
@ -812,14 +799,9 @@ class ModelBenchmark(AbstractModelBenchmark):
|
|||||||
target_device = config.device
|
target_device = config.device
|
||||||
# Get model initialization kwargs
|
# Get model initialization kwargs
|
||||||
model_init_kwargs = self.get_model_init_kwargs(config)
|
model_init_kwargs = self.get_model_init_kwargs(config)
|
||||||
model_init_kwargs.update({
|
model_init_kwargs.update({"generation_config": gen_config})
|
||||||
"generation_config": gen_config
|
|
||||||
})
|
|
||||||
|
|
||||||
self.model = AutoModelForCausalLM.from_pretrained(
|
self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()
|
||||||
config.model_id,
|
|
||||||
**model_init_kwargs
|
|
||||||
).eval()
|
|
||||||
|
|
||||||
# Move model to target device
|
# Move model to target device
|
||||||
self.logger.info(f"Moving model to device: {target_device}")
|
self.logger.info(f"Moving model to device: {target_device}")
|
||||||
@@ -832,7 +814,7 @@ class ModelBenchmark(AbstractModelBenchmark):
         self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")

         # Move inputs to the same device as the model
-        if hasattr(self.model, 'device'):
+        if hasattr(self.model, "device"):
             # Model is on a single device
             model_device = self.model.device
         else:
@@ -849,16 +831,16 @@ class ModelBenchmark(AbstractModelBenchmark):

     def cleanup_model(self) -> None:
         """Cleanup model resources."""
-        if hasattr(self, 'model') and self.model is not None:
+        if hasattr(self, "model") and self.model is not None:
             del self.model
             self.model = None
-        if hasattr(self, 'compiled_model') and self.compiled_model is not None:
+        if hasattr(self, "compiled_model") and self.compiled_model is not None:
             del self.compiled_model
             self.compiled_model = None
-        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+        if hasattr(self, "tokenizer") and self.tokenizer is not None:
             del self.tokenizer
             self.tokenizer = None
-        if hasattr(self, 'past_key_values') and self.past_key_values is not None:
+        if hasattr(self, "past_key_values") and self.past_key_values is not None:
             del self.past_key_values
             self.past_key_values = None

@@ -877,7 +859,7 @@ class ModelBenchmark(AbstractModelBenchmark):
         # Use SDPA context if specified
         with SDPAContext(config.sdpa_backend, self.logger):
             with torch.no_grad():
-                outputs = model_to_use.generate(**generation_kwargs)
+                _ = model_to_use.generate(**generation_kwargs)

         return timer.elapsed_time()

@@ -915,11 +897,11 @@ class ModelBenchmark(AbstractModelBenchmark):
                 "variant": config.variant,
                 "compile_mode": config.compile_mode,
                 "attn_implementation": config.attn_implementation,
-                "sdpa_backend": config.sdpa_backend
-            }
+                "sdpa_backend": config.sdpa_backend,
+            },
         )

-    def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> Dict[str, Any]:
+    def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
         """Get generation kwargs. Override in subclasses for custom generation."""
         generation_config_dict = self.get_default_generation_config()
         generation_kwargs = {
@@ -935,11 +917,12 @@ class ModelBenchmark(AbstractModelBenchmark):
         if self.past_key_values is not None and config.variant == "compiled":
             try:
                 from transformers import StaticCache
+
                 # Reset cache for each measurement
                 seq_length = self.inputs["input_ids"].shape[1]

                 # Get the actual device the model is on
-                if hasattr(self.model, 'device'):
+                if hasattr(self.model, "device"):
                     cache_device = self.model.device
                 else:
                     cache_device = self.device
@@ -949,7 +932,7 @@ class ModelBenchmark(AbstractModelBenchmark):
                     max_batch_size=config.batch_size,
                     max_cache_len=seq_length + max_new_tokens,
                     device=cache_device,
-                    dtype=getattr(torch, config.torch_dtype)
+                    dtype=getattr(torch, config.torch_dtype),
                 )
                 generation_kwargs["past_key_values"] = fresh_cache
             except (ImportError, TypeError) as e:
@@ -967,14 +950,13 @@ class BenchmarkRunner:
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)

-
     def run_benchmark(
         self,
         benchmark: ModelBenchmark,
-        scenarios: Dict[str, BenchmarkScenario],
+        scenarios: dict[str, BenchmarkScenario],
         collect_gpu_metrics: bool = True,
-        commit_id: Optional[str] = None
-    ) -> Dict[str, Dict[str, Any]]:
+        commit_id: Optional[str] = None,
+    ) -> dict[str, dict[str, Any]]:
         """
         Run benchmarks using scenarios.

@@ -1021,7 +1003,7 @@ class BenchmarkRunner:
                 timestamp=datetime.utcnow().isoformat(),
                 commit_id=commit_id,
                 hardware_info=get_hardware_info(),
-                config=config
+                config=config,
             )

             # Initialize GPU monitor
@@ -1037,11 +1019,13 @@ class BenchmarkRunner:
                     _ = benchmark.measure_latency(config)
                 except Exception as e:
                     warmup_failures += 1
-                    self.logger.warning(f"Warmup iteration {i+1} failed: {e}")
+                    self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")

             # If more than half the warmup iterations failed, skip this scenario
             if warmup_failures > config.warmup_iterations // 2:
-                self.logger.warning(f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})")
+                self.logger.warning(
+                    f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
+                )
                 try:
                     scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                     benchmark.cleanup_model()
@@ -1077,12 +1061,18 @@ class BenchmarkRunner:
                     if timing_result.time_per_output_token_seconds is not None:
                         itl_measurements.append(timing_result.time_per_output_token_seconds)

-                    itl_str = f", itl={timing_result.time_per_output_token_seconds:.4f}s/token" if timing_result.time_per_output_token_seconds else ""
-                    self.logger.debug(f"Iteration {i+1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}")
+                    itl_str = (
+                        f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
+                        if timing_result.time_per_output_token_seconds
+                        else ""
+                    )
+                    self.logger.debug(
+                        f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
+                    )

                 except Exception as e:
                     measurement_failures += 1
-                    self.logger.warning(f"Measurement iteration {i+1} failed: {e}")
+                    self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")

             # Stop GPU monitoring
             gpu_metrics = {}
@@ -1091,7 +1081,9 @@ class BenchmarkRunner:

             # If we don't have enough successful measurements, skip this scenario
             if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
-                self.logger.warning(f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})")
+                self.logger.warning(
+                    f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
+                )
                 try:
                     scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                     benchmark.cleanup_model()
@@ -1104,7 +1096,7 @@ class BenchmarkRunner:
                 "metadata": asdict(metadata),
                 "measurements": {},
                 "gpu_metrics": gpu_metrics,
-                "scenario_description": scenario.description
+                "scenario_description": scenario.description,
             }

             if latency_measurements:
@@ -1112,15 +1104,21 @@ class BenchmarkRunner:
                 scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)

             if ttft_measurements:
-                ttft_stats = BenchmarkStatistics.from_measurements("time_to_first_token_seconds", ttft_measurements)
+                ttft_stats = BenchmarkStatistics.from_measurements(
+                    "time_to_first_token_seconds", ttft_measurements
+                )
                 scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)

             if tokens_per_sec_measurements:
-                tps_stats = BenchmarkStatistics.from_measurements("tokens_per_second", tokens_per_sec_measurements, "tokens/sec")
+                tps_stats = BenchmarkStatistics.from_measurements(
+                    "tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
+                )
                 scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)

             if itl_measurements:
-                itl_stats = BenchmarkStatistics.from_measurements("time_per_output_token_seconds", itl_measurements, "seconds/token")
+                itl_stats = BenchmarkStatistics.from_measurements(
+                    "time_per_output_token_seconds", itl_measurements, "seconds/token"
+                )
                 scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)

             # Log summary
@@ -1149,6 +1147,7 @@ class BenchmarkRunner:
             except Exception as e:
                 self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
                 import traceback
+
                 self.logger.debug(traceback.format_exc())

                 # Try to clean up if possible
@@ -1169,7 +1168,7 @@ class BenchmarkRunner:

         return all_results

-    def save_results(self, model_name: str, results: Dict[str, Dict[str, Any]]) -> str:
+    def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
         """Save benchmark results to JSON file."""
         # Create model-specific subdirectory
         model_dir = os.path.join(self.output_dir, model_name)
@@ -1181,24 +1180,20 @@ class BenchmarkRunner:
         filepath = os.path.join(model_dir, filename)

         # Prepare output structure
-        output_data = {
-            "model_name": model_name,
-            "benchmark_scenarios": []
-        }
+        output_data = {"model_name": model_name, "benchmark_scenarios": []}

         for config_name, config_results in results.items():
             scenario = {
                 "scenario_name": config_name,
                 "metadata": config_results["metadata"],
                 "measurements": config_results["measurements"],
-                "gpu_metrics": config_results.get("gpu_metrics", {})
+                "gpu_metrics": config_results.get("gpu_metrics", {}),
             }
             output_data["benchmark_scenarios"].append(scenario)

         # Save to JSON file
-        with open(filepath, 'w') as f:
+        with open(filepath, "w") as f:
             json.dump(output_data, f, indent=2, default=str)

         self.logger.info(f"Results saved to {filepath}")
         return filepath
-
@@ -20,38 +20,34 @@ in the ./benches directory, organizing outputs into model-specific subfolders.

 import argparse
 import importlib.util
+import json
 import logging
 import os
 import sys
-import json
 from datetime import datetime
 from pathlib import Path
-from typing import Dict, List, Any, Optional
+from typing import Any, Optional


 def setup_logging(log_level: str = "INFO", enable_file_logging: bool = False) -> logging.Logger:
     """Setup logging configuration."""
     numeric_level = getattr(logging, log_level.upper(), None)
     if not isinstance(numeric_level, int):
-        raise ValueError(f'Invalid log level: {log_level}')
+        raise ValueError(f"Invalid log level: {log_level}")

     handlers = [logging.StreamHandler(sys.stdout)]

     if enable_file_logging:
-        handlers.append(
-            logging.FileHandler(f'benchmark_run_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
-        )
+        handlers.append(logging.FileHandler(f"benchmark_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"))

     logging.basicConfig(
-        level=numeric_level,
-        format='[%(levelname)s - %(asctime)s] %(name)s: %(message)s',
-        handlers=handlers
+        level=numeric_level, format="[%(levelname)s - %(asctime)s] %(name)s: %(message)s", handlers=handlers
     )

     return logging.getLogger(__name__)


-def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
+def discover_benchmarks(benches_dir: str) -> list[dict[str, Any]]:
     """
     Discover all benchmark modules in the benches directory.

@@ -77,20 +73,24 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:
                 spec.loader.exec_module(module)

                 # Check if it has a benchmark runner function
-                if hasattr(module, f'run_{module_name}'):
-                    benchmarks.append({
-                        'name': module_name,
-                        'path': str(py_file),
-                        'module': module,
-                        'runner_function': getattr(module, f'run_{module_name}')
-                    })
-                elif hasattr(module, 'run_benchmark'):
-                    benchmarks.append({
-                        'name': module_name,
-                        'path': str(py_file),
-                        'module': module,
-                        'runner_function': getattr(module, 'run_benchmark')
-                    })
+                if hasattr(module, f"run_{module_name}"):
+                    benchmarks.append(
+                        {
+                            "name": module_name,
+                            "path": str(py_file),
+                            "module": module,
+                            "runner_function": getattr(module, f"run_{module_name}"),
+                        }
+                    )
+                elif hasattr(module, "run_benchmark"):
+                    benchmarks.append(
+                        {
+                            "name": module_name,
+                            "path": str(py_file),
+                            "module": module,
+                            "runner_function": getattr(module, "run_benchmark"),
+                        }
+                    )
                 else:
                     logging.warning(f"No runner function found in {py_file}")

@@ -101,10 +101,7 @@ def discover_benchmarks(benches_dir: str) -> List[Dict[str, Any]]:


 def run_single_benchmark(
-    benchmark_info: Dict[str, Any],
-    output_dir: str,
-    logger: logging.Logger,
-    **kwargs
+    benchmark_info: dict[str, Any], output_dir: str, logger: logging.Logger, **kwargs
 ) -> Optional[str]:
     """
     Run a single benchmark and return the output file path.
@@ -118,21 +115,19 @@ def run_single_benchmark(
     Returns:
         Path to the output file if successful, None otherwise
     """
-    benchmark_name = benchmark_info['name']
-    runner_func = benchmark_info['runner_function']
+    benchmark_name = benchmark_info["name"]
+    runner_func = benchmark_info["runner_function"]

     logger.info(f"Running benchmark: {benchmark_name}")

     try:
         # Check function signature to determine what arguments to pass
         import inspect
+
         sig = inspect.signature(runner_func)

         # Prepare arguments based on function signature
-        func_kwargs = {
-            'logger': logger,
-            'output_dir': output_dir
-        }
+        func_kwargs = {"logger": logger, "output_dir": output_dir}

         # Add other kwargs if the function accepts them
         for param_name in sig.parameters:
@@ -145,8 +140,7 @@ def run_single_benchmark(
         if has_var_kwargs:
             valid_kwargs = {**func_kwargs, **kwargs}
         else:
-            valid_kwargs = {k: v for k, v in func_kwargs.items()
-                            if k in sig.parameters}
+            valid_kwargs = {k: v for k, v in func_kwargs.items() if k in sig.parameters}

         # Run the benchmark
         result = runner_func(**valid_kwargs)
@@ -161,15 +155,12 @@ def run_single_benchmark(
     except Exception as e:
         logger.error(f"Benchmark {benchmark_name} failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return None


-def generate_summary_report(
-    output_dir: str,
-    benchmark_results: Dict[str, Any],
-    logger: logging.Logger
-) -> str:
+def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
     """Generate a summary report of all benchmark runs."""
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -179,13 +170,13 @@ def generate_summary_report(
             "timestamp": datetime.utcnow().isoformat(),
             "total_benchmarks": len(benchmark_results),
             "successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
-            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None])
+            "failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
         },
         "benchmark_results": benchmark_results,
-        "output_directory": output_dir
+        "output_directory": output_dir,
     }

-    with open(summary_file, 'w') as f:
+    with open(summary_file, "w") as f:
         json.dump(summary_data, f, indent=2, default=str)

     logger.info(f"Summary report saved to: {summary_file}")
@@ -194,22 +185,20 @@ def generate_summary_report(

 def main():
     """Main entry point for the benchmarking script."""
-    parser = argparse.ArgumentParser(
-        description="Run all benchmarks in the ./benches directory"
-    )
+    parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")

     parser.add_argument(
         "--output-dir",
         type=str,
         default="benchmark_results",
-        help="Base output directory for benchmark results (default: benchmark_results)"
+        help="Base output directory for benchmark results (default: benchmark_results)",
     )

     parser.add_argument(
         "--benches-dir",
         type=str,
         default="./benches",
-        help="Directory containing benchmark implementations (default: ./benches)"
+        help="Directory containing benchmark implementations (default: ./benches)",
     )

     parser.add_argument(
@@ -217,66 +206,34 @@ def main():
         type=str,
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         default="INFO",
-        help="Logging level (default: INFO)"
+        help="Logging level (default: INFO)",
     )

-    parser.add_argument(
-        "--model-id",
-        type=str,
-        help="Specific model ID to benchmark (if supported by benchmarks)"
-    )
+    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

-    parser.add_argument(
-        "--warmup-iterations",
-        type=int,
-        default=3,
-        help="Number of warmup iterations (default: 3)"
-    )
+    parser.add_argument("--warmup-iterations", type=int, default=3, help="Number of warmup iterations (default: 3)")

     parser.add_argument(
-        "--measurement-iterations",
-        type=int,
-        default=5,
-        help="Number of measurement iterations (default: 5)"
+        "--measurement-iterations", type=int, default=5, help="Number of measurement iterations (default: 5)"
     )

     parser.add_argument(
         "--num-tokens-to-generate",
         type=int,
         default=100,
-        help="Number of tokens to generate in benchmarks (default: 100)"
+        help="Number of tokens to generate in benchmarks (default: 100)",
     )

-    parser.add_argument(
-        "--include",
-        type=str,
-        nargs="*",
-        help="Only run benchmarks matching these names"
-    )
+    parser.add_argument("--include", type=str, nargs="*", help="Only run benchmarks matching these names")

-    parser.add_argument(
-        "--exclude",
-        type=str,
-        nargs="*",
-        help="Exclude benchmarks matching these names"
-    )
+    parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")

-    parser.add_argument(
-        "--enable-mock",
-        action="store_true",
-        help="Enable mock benchmark (skipped by default)"
-    )
+    parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")

-    parser.add_argument(
-        "--enable-file-logging",
-        action="store_true",
-        help="Enable file logging (disabled by default)"
-    )
+    parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")

     parser.add_argument(
-        "--commit-id",
-        type=str,
-        help="Git commit ID for metadata (if not provided, will auto-detect from git)"
+        "--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
     )

     args = parser.parse_args()
@@ -304,13 +261,15 @@ def main():
         filtered_benchmarks = benchmarks

         if args.include:
-            filtered_benchmarks = [b for b in filtered_benchmarks
-                                   if any(pattern in b['name'] for pattern in args.include)]
+            filtered_benchmarks = [
+                b for b in filtered_benchmarks if any(pattern in b["name"] for pattern in args.include)
+            ]
             logger.info(f"Filtered to include: {[b['name'] for b in filtered_benchmarks]}")

         if args.exclude:
-            filtered_benchmarks = [b for b in filtered_benchmarks
-                                   if not any(pattern in b['name'] for pattern in args.exclude)]
+            filtered_benchmarks = [
+                b for b in filtered_benchmarks if not any(pattern in b["name"] for pattern in args.exclude)
+            ]
             logger.info(f"After exclusion: {[b['name'] for b in filtered_benchmarks]}")

         if not filtered_benchmarks:
@@ -319,34 +278,29 @@ def main():

         # Prepare common kwargs for benchmarks
         benchmark_kwargs = {
-            'warmup_iterations': args.warmup_iterations,
-            'measurement_iterations': args.measurement_iterations,
-            'num_tokens_to_generate': args.num_tokens_to_generate
+            "warmup_iterations": args.warmup_iterations,
+            "measurement_iterations": args.measurement_iterations,
+            "num_tokens_to_generate": args.num_tokens_to_generate,
         }

         if args.model_id:
-            benchmark_kwargs['model_id'] = args.model_id
+            benchmark_kwargs["model_id"] = args.model_id

         # Add enable_mock flag for mock benchmark
-        benchmark_kwargs['enable_mock'] = args.enable_mock
+        benchmark_kwargs["enable_mock"] = args.enable_mock

         # Add commit_id if provided
         if args.commit_id:
-            benchmark_kwargs['commit_id'] = args.commit_id
+            benchmark_kwargs["commit_id"] = args.commit_id

         # Run benchmarks
         benchmark_results = {}
         successful_count = 0

         for benchmark_info in filtered_benchmarks:
-            result = run_single_benchmark(
-                benchmark_info,
-                args.output_dir,
-                logger,
-                **benchmark_kwargs
-            )
+            result = run_single_benchmark(benchmark_info, args.output_dir, logger, **benchmark_kwargs)

-            benchmark_results[benchmark_info['name']] = result
+            benchmark_results[benchmark_info["name"]] = result

             if result is not None:
                 successful_count += 1
@@ -377,6 +331,7 @@ def main():
     except Exception as e:
         logger.error(f"Benchmark run failed: {e}")
         import traceback
+
         logger.debug(traceback.format_exc())
         return 1

@@ -4,8 +4,8 @@ import datasets

 import transformers
 from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
-from transformers.utils import logging
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.utils import logging


 logging.set_verbosity_info()
@@ -22,7 +22,9 @@ imperfect = 0
 wrong = 0


-def check_diff(spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_diff(
+    spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     if spm_diff == list(reversed(tok_diff)):
         # AAA -> AA+A vs A+AA case.
         return True
@@ -54,7 +56,9 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
     return False


-def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> bool:
+def check_details(
+    line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
+) -> bool:
     # Encoding can be the same with same result AAA -> A + AA vs AA + A
     # We can check that we use at least exactly the same number of tokens.
     for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)):
@@ -90,7 +94,9 @@ def check_details(line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTr
                 if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width]
             ]
             for j in possible_matches:
-                if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], slow, fast) and check_details(
+                if check_diff(
+                    spm_ids[first : first + i], tok_ids[first : first + j], slow, fast
+                ) and check_details(
                     line,
                     spm_ids[first + i : last],
                     tok_ids[first + j : last],
@@ -140,9 +146,9 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te
     if skip_assert:
         return

-    assert (
-        slow_ids == fast_ids
-    ), f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    assert slow_ids == fast_ids, (
+        f"line {text} : \n\n{slow_ids}\n{fast_ids}\n\n{slow.tokenize(text)}\n{fast.tokenize(text)}"
+    )


 def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None:

@@ -15,6 +15,7 @@
 Script to close stale issue. Taken in part from the AllenNLP repository.
 https://github.com/allenai/allennlp.
 """
+
 import os
 from datetime import datetime as dt

@@ -39,10 +40,11 @@ def main():

     for i, issue in enumerate(open_issues):
         print(i, issue)
-        comments = sorted(list(issue.get_comments()), key=lambda i: i.created_at, reverse=True)
+        comments = sorted(issue.get_comments(), key=lambda i: i.created_at, reverse=True)
         last_comment = comments[0] if len(comments) > 0 else None
         if (
-            last_comment is not None and last_comment.user.login == "github-actions[bot]"
+            last_comment is not None
+            and last_comment.user.login == "github-actions[bot]"
             and (dt.utcnow() - issue.updated_at.replace(tzinfo=None)).days > 7
             and (dt.utcnow() - issue.created_at.replace(tzinfo=None)).days >= 30
             and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())