# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import json
import logging
import os
import statistics
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Any, Optional, TypedDict, Union

import gpustat
import numpy as np
import psutil
import torch


class GPUMetrics(TypedDict):
    """GPU monitoring result with GPU metrics."""

    gpu_utilization_mean: float
    gpu_utilization_max: float
    gpu_utilization_min: float
    gpu_memory_used_mean: float
    gpu_memory_used_max: float
    gpu_memory_used_min: float
    sample_count: int
    gpu_monitoring_status: str


class NoGPU(TypedDict):
    """GPU monitoring result without GPU metrics."""

    gpu_monitoring_status: str
    gpu_monitoring_reason: str


class ArchAwareTimer:
    """Architecture-aware timer that uses CUDA events when available for better precision."""

    def __init__(self, device: Optional[str] = None):
        """
        Initialize architecture-aware timer.

        Args:
            device: Device to use. If None, uses current device.
        """
        self.device = device
        self.use_cuda = torch.cuda.is_available()

        if self.use_cuda:
            if device and device != "cpu":
                self.device_obj = torch.device(device)
            else:
                # Fall back to CPU timing if device is CPU or CUDA not available
                self.use_cuda = False

        if self.use_cuda:
            try:
                # Create CUDA events for timing
                self.start_event = torch.cuda.Event(enable_timing=True)
                self.end_event = torch.cuda.Event(enable_timing=True)
            except RuntimeError:
                # Fall back to CPU timing if CUDA events fail
                self.use_cuda = False

        if not self.use_cuda:
            self.start_time = None
            self.end_time = None

    def start(self):
        """Start timing."""
        if self.use_cuda:
            torch.cuda.synchronize(self.device_obj)
            self.start_event.record(stream=torch.cuda.current_stream(self.device_obj))
        else:
            self.start_time = time.perf_counter()

    def stop(self):
        """Stop timing."""
        if self.use_cuda:
            self.end_event.record(stream=torch.cuda.current_stream(self.device_obj))
            torch.cuda.synchronize(self.device_obj)
        else:
            self.end_time = time.perf_counter()

    def elapsed_time(self) -> float:
        """
        Get elapsed time in seconds.

        Returns:
            Elapsed time in seconds
        """
        if self.use_cuda:
            # CUDA events return time in milliseconds, convert to seconds
            return self.start_event.elapsed_time(self.end_event) / 1000.0
        else:
            if self.start_time is None or self.end_time is None:
                raise RuntimeError("Timer not properly started/stopped")
            return self.end_time - self.start_time

    @property
    def timing_method(self) -> str:
        """Get the timing method being used."""
        return "CUDA Events" if self.use_cuda else "CPU perf_counter"

    def __enter__(self):
        """Context manager entry."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.stop()


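# A minimal usage sketch for ArchAwareTimer (illustrative only; `run_generation` and the
# "cuda:0" device string are assumptions, not part of this module):
#
#     with ArchAwareTimer(device="cuda:0") as timer:
#         run_generation()  # hypothetical workload to time
#     print(f"{timer.timing_method}: {timer.elapsed_time():.4f}s")

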
@dataclass
class BenchmarkConfig:
    """Configuration for a single benchmark scenario."""

    name: str
    model_id: str
    variant: str = "eager"  # "eager", "compiled", "kernelized"
    warmup_iterations: int = 3
    measurement_iterations: int = 10
    num_tokens_to_generate: int = 100
    device: str = "cuda"
    torch_dtype: str = "float16"
    compile_mode: Optional[str] = None  # None, "default", "reduce-overhead", "max-autotune"
    compile_options: dict[str, Any] = field(default_factory=dict)
    use_cache: bool = True
    batch_size: int = 1
    sequence_length: Optional[int] = None
    attn_implementation: str = "sdpa"  # "eager", "sdpa", "flash_attention_2"
    sdpa_backend: Optional[str] = None  # None, "math", "flash_attention", "efficient_attention", "cudnn_attention"
    custom_params: dict[str, Any] = field(default_factory=dict)


class BenchmarkScenario:
    """
    A benchmark scenario that encapsulates both configuration and setup logic.
    This makes it easier to define and adapt benchmarks for different models.
    """

    def __init__(self, name: str, config: BenchmarkConfig, description: str = ""):
        self.name = name
        self.config = config
        self.description = description
        self._setup_callbacks = []
        self._teardown_callbacks = []

    def add_setup_callback(self, callback: callable):
        """Add a callback to be executed during scenario setup."""
        self._setup_callbacks.append(callback)

    def add_teardown_callback(self, callback: callable):
        """Add a callback to be executed during scenario teardown."""
        self._teardown_callbacks.append(callback)

    def setup(self, model, tokenizer, logger=None):
        """Execute setup callbacks for this scenario."""
        for callback in self._setup_callbacks:
            try:
                callback(model, tokenizer, self.config, logger)
            except Exception as e:
                if logger:
                    logger.warning(f"Setup callback failed for scenario {self.name}: {e}")

    def teardown(self, model, tokenizer, logger=None):
        """Execute teardown callbacks for this scenario."""
        for callback in self._teardown_callbacks:
            try:
                callback(model, tokenizer, self.config, logger)
            except Exception as e:
                if logger:
                    logger.warning(f"Teardown callback failed for scenario {self.name}: {e}")

    def __repr__(self):
        return f"BenchmarkScenario(name='{self.name}', variant='{self.config.variant}')"


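# A minimal sketch of attaching a custom setup callback to a scenario (illustrative only;
# the scenario name, model id, and callback body are assumptions). Callbacks receive
# (model, tokenizer, config, logger), matching BenchmarkScenario.setup above:
#
#     config = BenchmarkConfig(name="eager", model_id="gpt2")
#     scenario = BenchmarkScenario(name="eager_demo", config=config)
#     scenario.add_setup_callback(
#         lambda model, tokenizer, cfg, logger: logger and logger.info(f"Preparing {cfg.model_id}")
#     )

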
@dataclass
class TimingResult:
    """Result from a timing measurement."""

    time_to_first_token_seconds: Optional[float] = None
    latency_seconds: float = 0.0
    tokens_per_second: Optional[float] = None
    time_per_output_token_seconds: Optional[float] = None
    total_tokens_generated: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass
class BenchmarkStatistics:
    """Statistical analysis of benchmark measurements."""

    name: str
    measurements: list[float]
    mean: float
    median: float
    std: float
    min: float
    max: float
    p25: float  # 25th percentile
    p75: float  # 75th percentile
    p90: float  # 90th percentile
    p95: float  # 95th percentile
    p99: float  # 99th percentile
    unit: str = "seconds"

    @classmethod
    def from_measurements(cls, name: str, measurements: list[float], unit: str = "seconds") -> "BenchmarkStatistics":
        """Create statistics from a list of measurements."""
        if not measurements:
            raise ValueError("Cannot create statistics from empty measurements")

        measurements_array = np.array(measurements)

        return cls(
            name=name,
            measurements=measurements,
            mean=float(np.mean(measurements_array)),
            median=float(np.median(measurements_array)),
            std=float(np.std(measurements_array)),
            min=float(np.min(measurements_array)),
            max=float(np.max(measurements_array)),
            p25=float(np.percentile(measurements_array, 25)),
            p75=float(np.percentile(measurements_array, 75)),
            p90=float(np.percentile(measurements_array, 90)),
            p95=float(np.percentile(measurements_array, 95)),
            p99=float(np.percentile(measurements_array, 99)),
            unit=unit,
        )


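# A minimal sketch of summarizing raw timings with BenchmarkStatistics (the sample
# latency values below are made up for illustration):
#
#     stats = BenchmarkStatistics.from_measurements("latency_seconds", [0.51, 0.49, 0.55, 0.50])
#     print(f"{stats.name}: mean={stats.mean:.3f} p95={stats.p95:.3f} {stats.unit}")

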
@dataclass
class HardwareInfo:
    """Hardware information collected during benchmarking."""

    gpu_name: str
    gpu_memory_total_mb: int
    cpu_count: int
    memory_total_mb: int
    python_version: str
    torch_version: Optional[str] = None
    cuda_version: Optional[str] = None


@dataclass
class BenchmarkMetadata:
    """Metadata collected for each benchmark run."""

    timestamp: str
    commit_id: str
    hardware_info: HardwareInfo
    config: BenchmarkConfig


class GPUMonitor:
    """Monitor GPU utilization during benchmark execution."""

    def __init__(self, sample_interval: float = 0.1, logger: Optional[logging.Logger] = None):
        self.sample_interval = sample_interval
        self.logger = logger or logging.getLogger(__name__)
        self.stop_event = threading.Event()
        self.thread = None
        self.gpu_utilization = []
        self.gpu_memory_used = []
        self.timestamps = []
        self.gpu_available = False
        self.warning_logged = False

        # Test GPU availability on initialization
        self._test_gpu_availability()

    def _test_gpu_availability(self):
        """Test if GPU monitoring is available."""
        try:
            gpu_stats = gpustat.GPUStatCollection.new_query()
            if gpu_stats and len(gpu_stats) > 0:
                self.gpu_available = True
                self.logger.debug(f"GPU monitoring available: {len(gpu_stats)} GPU(s) detected")
            else:
                self.gpu_available = False
                self.logger.debug("No GPUs detected by gpustat")
        except Exception as e:
            self.gpu_available = False
            self.logger.debug(f"GPU monitoring not available: {e}")

    def start(self):
        """Start monitoring GPU metrics."""
        if not self.gpu_available:
            self.logger.debug("GPU monitoring disabled: no GPUs available")
            return

        # Clear the stop event to enable monitoring
        self.stop_event.clear()
        self.gpu_utilization = []
        self.gpu_memory_used = []
        self.timestamps = []
        self.warning_logged = False  # Reset warning flag for new monitoring session
        self.thread = threading.Thread(target=self._monitor_loop)
        self.thread.start()
        self.logger.debug("GPU monitoring started")

    def stop_and_collect(self) -> Union[GPUMetrics, NoGPU]:
        """Stop monitoring and return collected metrics."""
        if not self.gpu_available:
            return NoGPU(gpu_monitoring_status="disabled", gpu_monitoring_reason="no_gpus_available")

        # Signal the monitoring thread to stop
        self.stop_event.set()
        if self.thread:
            self.thread.join()

        if self.gpu_utilization:
            metrics = GPUMetrics(
                gpu_utilization_mean=statistics.mean(self.gpu_utilization),
                gpu_utilization_max=max(self.gpu_utilization),
                gpu_utilization_min=min(self.gpu_utilization),
                gpu_memory_used_mean=statistics.mean(self.gpu_memory_used),
                gpu_memory_used_max=max(self.gpu_memory_used),
                gpu_memory_used_min=min(self.gpu_memory_used),
                sample_count=len(self.gpu_utilization),
                gpu_monitoring_status="success",
            )
            self.logger.debug(f"GPU monitoring completed: {len(self.gpu_utilization)} samples collected")
            return metrics
        else:
            return NoGPU(gpu_monitoring_status="failed", gpu_monitoring_reason="no_samples_collected")

    def _monitor_loop(self):
        """Background monitoring loop using threading.Event for communication."""
        consecutive_failures = 0
        max_consecutive_failures = 5

        # Continue monitoring until stop_event is set
        while not self.stop_event.is_set():
            try:
                gpu_stats = gpustat.GPUStatCollection.new_query()
                if gpu_stats and len(gpu_stats) > 0:
                    gpu = gpu_stats[0]
                    self.gpu_utilization.append(gpu["utilization.gpu"])
                    self.gpu_memory_used.append(gpu["memory.used"])
                    self.timestamps.append(time.time())
                    consecutive_failures = 0  # Reset failure counter on success
                else:
                    consecutive_failures += 1
                    if consecutive_failures >= max_consecutive_failures and not self.warning_logged:
                        self.logger.warning("GPU monitoring: No GPU data returned by gpustat")
                        self.warning_logged = True

            except Exception as e:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures and not self.warning_logged:
                    self.logger.warning(f"GPU monitoring failed after {max_consecutive_failures} attempts: {e}")
                    self.warning_logged = True

            # Use Event.wait() with timeout instead of time.sleep()
            # This allows for immediate response to stop signal while still maintaining sample interval
            if self.stop_event.wait(timeout=self.sample_interval):
                # Event was set, break out of loop immediately
                break


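# A minimal sketch of sampling GPU utilization around a workload (illustrative only;
# `run_workload` is an assumed placeholder, not part of this module):
#
#     monitor = GPUMonitor(sample_interval=0.1)
#     monitor.start()
#     run_workload()  # hypothetical code being profiled
#     metrics = monitor.stop_and_collect()
#     print(metrics.get("gpu_monitoring_status"))

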
def get_hardware_info() -> HardwareInfo:
    """Collect hardware information."""
    gpu_name = "unknown"
    gpu_memory_total = 0

    try:
        gpu_stats = gpustat.GPUStatCollection.new_query()
        if gpu_stats and len(gpu_stats) > 0:
            gpu = gpu_stats[0]
            gpu_name = gpu["name"]
            gpu_memory_total = gpu["memory.total"]
    except Exception:
        pass

    torch_version = torch.__version__
    cuda_version = None
    if hasattr(torch, "cuda") and torch.cuda.is_available():
        cuda_version = torch.version.cuda

    return HardwareInfo(
        gpu_name=gpu_name,
        gpu_memory_total_mb=gpu_memory_total,
        cpu_count=psutil.cpu_count(),
        memory_total_mb=int(psutil.virtual_memory().total / (1024 * 1024)),
        python_version=f"{sys.version.split()[0]}",
        torch_version=torch_version,
        cuda_version=cuda_version,
    )


def flush_memory():
    """Flush GPU memory and run garbage collection."""
    gc.collect()
    if hasattr(torch, "cuda") and torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()


def get_sdpa_backend(backend_name: Optional[str]):
    """Get the SDPA backend enum from string name."""
    if backend_name is None:
        return None

    try:
        backend_map = {
            "math": torch.nn.attention.SDPBackend.MATH,
            "flash_attention": torch.nn.attention.SDPBackend.FLASH_ATTENTION,
            "efficient_attention": torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
            "cudnn_attention": torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
        }
        return backend_map.get(backend_name.lower())
    except AttributeError:
        # torch.nn.attention.SDPBackend not available in older torch versions
        return None


class SDPAContext:
    """Context manager for SDPA kernel selection."""

    def __init__(self, backend_name: Optional[str], logger: Optional[logging.Logger] = None):
        self.backend_name = backend_name
        self.logger = logger or logging.getLogger(__name__)
        self.backend = get_sdpa_backend(backend_name) if backend_name else None
        self.context = None

    def __enter__(self):
        if self.backend is not None:
            try:
                self.context = torch.nn.attention.sdpa_kernel(self.backend)
                self.context.__enter__()
                if self.logger:
                    self.logger.debug(f"Using SDPA backend: {self.backend_name}")
            except Exception as e:
                if self.logger:
                    self.logger.warning(f"Failed to set SDPA backend {self.backend_name}: {e}")
                self.context = None
        elif self.backend_name and self.logger:
            self.logger.debug(
                f"SDPA backend '{self.backend_name}' requested but not using kernel context (backend={self.backend})"
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.context is not None:
            try:
                self.context.__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                if self.logger:
                    self.logger.warning(f"Error exiting SDPA context: {e}")
        return False


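# A minimal sketch of pinning attention to one SDPA backend for a forward pass
# (illustrative only; `model` and `inputs` are assumed to exist elsewhere):
#
#     with SDPAContext("flash_attention"):
#         with torch.no_grad():
#             outputs = model(**inputs)

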
class AbstractModelBenchmark(ABC):
    """Abstract base class for model benchmarks."""

    def __init__(self, logger: logging.Logger):
        self.logger = logger
        self.model = None
        self.tokenizer = None
        self.device = None
        self.scenarios = {}  # Map of scenario_name -> BenchmarkScenario

    @abstractmethod
    def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
        """Create and return a dictionary of benchmark scenarios."""
        pass

    @abstractmethod
    def setup_model(self, config: BenchmarkConfig) -> None:
        """Setup the model for benchmarking with the given configuration."""
        pass

    @abstractmethod
    def cleanup_model(self) -> None:
        """Cleanup model resources."""
        pass

    @abstractmethod
    def measure_time_to_first_token(self, config: BenchmarkConfig) -> float:
        """Measure time to first token generation."""
        pass

    @abstractmethod
    def measure_latency(self, config: BenchmarkConfig) -> TimingResult:
        """Measure full generation latency and compute tokens/sec."""
        pass

    def prepare_inputs(self, config: BenchmarkConfig) -> Any:
        """Prepare inputs for the model. Override if needed."""
        return None

    def get_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
        """Get benchmark scenarios. Creates them if they don't exist."""
        if not self.scenarios:
            self.scenarios = self.create_scenarios(**kwargs)
        return self.scenarios


class ModelBenchmark(AbstractModelBenchmark):
    """
    Base class for HuggingFace Transformers model benchmarks.

    This class provides common scenario creation logic and handles the standard
    patterns for eager, compiled, and kernelized execution variants with different
    attention implementations and SDPA backends.
    """

    def __init__(self, logger: logging.Logger):
        super().__init__(logger)
        self.inputs = None
        self.compiled_model = None
        self.past_key_values = None
        self.config = None
        self._default_prompt = "Why dogs are so cute?"

    @property
    def default_prompt(self) -> str:
        """Default prompt for text generation. Override in subclasses if needed."""
        return self._default_prompt

    def get_attention_configs(self, include_sdpa_variants: bool = True) -> list[dict[str, Any]]:
        """
        Get attention implementation configurations.

        Args:
            include_sdpa_variants: Whether to include SDPA backend variants

        Returns:
            List of attention configuration dictionaries
        """
        attention_configs = [
            {"attn_implementation": "eager", "sdpa_backends": [None], "desc_suffix": " with eager attention"},
        ]

        # Add SDPA variants if requested
        if include_sdpa_variants:
            attention_configs.append(
                {
                    "attn_implementation": "sdpa",
                    "sdpa_backends": [None, "math", "flash_attention", "efficient_attention"],
                    "desc_suffix": "",
                }
            )

        return attention_configs

    def get_scenario_configs(self) -> list[dict[str, Any]]:
        """
        Get base scenario configurations. Override in subclasses to customize.

        Returns:
            List of scenario configuration dictionaries
        """
        return [
            # Eager variants
            {"variant": "eager", "compile_mode": None, "use_cache": True, "description": "Eager execution with cache"},
            # Compiled variants
            {
                "variant": "compiled",
                "compile_mode": "max-autotune",
                "use_cache": True,
                "description": "Compiled with max autotune",
            },
            # Kernelized variant (if available)
            {
                "variant": "kernelized",
                "compile_mode": "max-autotune",
                "use_cache": True,
                "description": "Kernelized execution",
            },
        ]

    def _is_kernelization_available(self) -> bool:
        """Check if kernelization is available. Override in subclasses."""
        try:
            from kernels import Mode, kernelize  # noqa: F401

            return True
        except ImportError:
            return False

    def get_default_generation_config(self) -> dict[str, Any]:
        """Get default generation configuration. Override in subclasses for model-specific defaults."""
        return {"do_sample": False, "top_p": 1.0, "temperature": 1.0}

    def get_model_init_kwargs(self, config: BenchmarkConfig) -> dict[str, Any]:
        """Get model initialization kwargs. Override in subclasses for model-specific parameters."""
        return {"torch_dtype": getattr(torch, config.torch_dtype), "attn_implementation": config.attn_implementation}

    def get_default_torch_dtype(self) -> str:
        """Get default torch dtype. Override in subclasses."""
        return "float16"

    def get_default_device(self) -> str:
        """Get default device. Override in subclasses."""
        return "cuda"

    def create_scenarios(self, **kwargs) -> dict[str, "BenchmarkScenario"]:
        """Create benchmark scenarios for HuggingFace models."""
        scenarios = {}

        # Extract parameters with model-specific defaults
        model_id = kwargs.get("model_id", "microsoft/DialoGPT-medium")
        warmup_iterations = kwargs.get("warmup_iterations", 3)
        measurement_iterations = kwargs.get("measurement_iterations", 5)
        num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
        include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
        device = kwargs.get("device", self.get_default_device())
        torch_dtype = kwargs.get("torch_dtype", self.get_default_torch_dtype())
        batch_size = kwargs.get("batch_size", 1)

        # Get configurations
        attention_configs = self.get_attention_configs(include_sdpa_variants)
        scenario_configs = self.get_scenario_configs()

        # Create scenarios for each attention config and variant combination
        for attn_config in attention_configs:
            attn_implementation = attn_config["attn_implementation"]
            sdpa_backends = attn_config["sdpa_backends"]
            desc_suffix = attn_config["desc_suffix"]

            for scenario_config in scenario_configs:
                for sdpa_backend in sdpa_backends:
                    # Skip kernelized if not available
                    if scenario_config["variant"] == "kernelized" and not self._is_kernelization_available():
                        continue

                    # Create unique config for this scenario
                    config = BenchmarkConfig(
                        name=scenario_config["variant"],
                        model_id=model_id,
                        variant=scenario_config["variant"],
                        compile_mode=scenario_config["compile_mode"],
                        use_cache=scenario_config["use_cache"],
                        warmup_iterations=warmup_iterations,
                        measurement_iterations=measurement_iterations,
                        num_tokens_to_generate=num_tokens_to_generate,
                        device=device,
                        torch_dtype=torch_dtype,
                        batch_size=batch_size,
                        attn_implementation=attn_implementation,
                        sdpa_backend=sdpa_backend if attn_implementation == "sdpa" else None,
                    )

                    # Create scenario name
                    scenario_name_parts = [scenario_config["variant"]]
                    if scenario_config["compile_mode"]:
                        scenario_name_parts.append(f"compile_{scenario_config['compile_mode']}")

                    # Add attention implementation to name
                    if attn_implementation == "eager":
                        scenario_name_parts.append("eager_attn")
                    elif attn_implementation == "sdpa":
                        if sdpa_backend:
                            scenario_name_parts.append(f"sdpa_{sdpa_backend}")
                        else:
                            scenario_name_parts.append("sdpa_default")

                    scenario_name = "_".join(scenario_name_parts)

                    # Create description
                    description = scenario_config["description"]
                    if attn_implementation == "sdpa" and sdpa_backend:
                        description += f" with SDPA {sdpa_backend} backend"
                    elif attn_implementation == "sdpa":
                        description += " with SDPA default backend"
                    else:
                        description += desc_suffix

                    # Create scenario
                    scenario = BenchmarkScenario(name=scenario_name, config=config, description=description)

                    # Add setup callbacks based on variant
                    if scenario_config["variant"] == "compiled":
                        scenario.add_setup_callback(self._setup_compilation_callback)
                    elif scenario_config["variant"] == "kernelized":
                        scenario.add_setup_callback(self._setup_kernelization_callback)

                    scenarios[scenario_name] = scenario

        return scenarios

    def _setup_compilation_callback(self, model, tokenizer, config, logger):
        """Setup callback for compilation scenarios."""
        if logger:
            logger.info(f"Setting up compilation with mode: {config.compile_mode}")

        # Perform torch.compile
        if config.compile_mode is not None:
            self.compiled_model = torch.compile(model, mode=config.compile_mode, **config.compile_options)
        else:
            self.compiled_model = torch.compile(model, **config.compile_options)

        # Setup static cache for compiled mode if needed
        if config.use_cache and hasattr(self, "inputs") and self.inputs is not None:
            self._setup_static_cache(config)

    def _setup_kernelization_callback(self, model, tokenizer, config, logger):
        """Setup callback for kernelization scenarios."""
        if logger:
            logger.info("Setting up kernelization")

        try:
            from kernels import Mode, kernelize

            self.compiled_model = kernelize(model, mode=Mode.INFERENCE)
        except Exception as e:
            if logger:
                logger.warning(f"Failed to setup kernelized mode: {e}")
                logger.warning("Falling back to eager mode")
            config.variant = "eager"

    def _setup_static_cache(self, config: BenchmarkConfig):
        """Setup static cache for compiled models. Override if needed."""
        if hasattr(self, "inputs") and self.inputs is not None:
            try:
                from transformers import StaticCache

                seq_length = self.inputs["input_ids"].shape[1]

                # Get the actual device the model is on
                if hasattr(self.model, "device"):
                    cache_device = self.model.device
                else:
                    cache_device = self.device

                self.past_key_values = StaticCache(
                    config=self.model.config,
                    max_batch_size=config.batch_size,
                    max_cache_len=seq_length + config.num_tokens_to_generate,
                    device=cache_device,
                    dtype=getattr(torch, config.torch_dtype),
                )
                self.logger.debug(f"StaticCache created on device: {cache_device}")
            except (ImportError, TypeError) as e:
                # StaticCache not available or incompatible, continue without it
                self.logger.debug(f"StaticCache setup failed: {e}, continuing without cache")
                self.past_key_values = None

    def setup_model(self, config: BenchmarkConfig) -> None:
        """Setup the HuggingFace model for benchmarking with the given configuration."""

        self.logger.info(f"Setting up model: {config.model_id} with variant: {config.variant}")
        self.device = config.device
        self.config = config

        # Load model and tokenizer
        self._load_model_and_tokenizer(config)

        # Prepare inputs
        self._prepare_model_inputs(config)

        # Configure generation settings
        self._configure_generation(config)

        self.logger.info("Model setup complete")

    def _load_model_and_tokenizer(self, config: BenchmarkConfig):
        """Load the model and tokenizer. Override in subclasses for custom loading."""

        from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_id)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Prepare generation config
        generation_config_dict = self.get_default_generation_config()
        gen_config = GenerationConfig(**generation_config_dict)

        # Load model
        self.logger.info("Loading model...")

        target_device = config.device
        # Get model initialization kwargs
        model_init_kwargs = self.get_model_init_kwargs(config)
        model_init_kwargs.update({"generation_config": gen_config})

        self.model = AutoModelForCausalLM.from_pretrained(config.model_id, **model_init_kwargs).eval()

        # Move model to target device
        self.logger.info(f"Moving model to device: {target_device}")
        self.model.to(target_device)
        self.device = target_device  # Update device to match actual device used

    def _prepare_model_inputs(self, config: BenchmarkConfig):
        """Prepare model inputs. Override in subclasses for custom inputs."""
        # Prepare inputs
        self.inputs = self.tokenizer(self.default_prompt, return_tensors="pt")

        # Move inputs to the same device as the model
        if hasattr(self.model, "device"):
            # Model is on a single device
            model_device = self.model.device
        else:
            # Model might be distributed, use self.device which was set during model loading
            model_device = self.device

        self.inputs = {k: v.to(model_device) for k, v in self.inputs.items()}
        self.logger.debug(f"Moved inputs to device: {model_device}")

    def _configure_generation(self, config: BenchmarkConfig):
        """Configure generation settings."""
        seq_length = self.inputs["input_ids"].shape[1]
        self.model.generation_config.max_length = seq_length + config.num_tokens_to_generate

    def cleanup_model(self) -> None:
        """Cleanup model resources."""
        if hasattr(self, "model") and self.model is not None:
            del self.model
            self.model = None
        if hasattr(self, "compiled_model") and self.compiled_model is not None:
            del self.compiled_model
            self.compiled_model = None
        if hasattr(self, "tokenizer") and self.tokenizer is not None:
            del self.tokenizer
            self.tokenizer = None
        if hasattr(self, "past_key_values") and self.past_key_values is not None:
            del self.past_key_values
            self.past_key_values = None

        # Clear CUDA cache
        flush_memory()

    def measure_time_to_first_token(self, config: BenchmarkConfig) -> float:
        """Measure time to first token generation."""
        model_to_use = self.compiled_model if self.compiled_model is not None else self.model

        # Prepare generation kwargs
        generation_kwargs = self._get_generation_kwargs(config, max_new_tokens=1)

        # Use CUDA timer for high-precision measurement
        with ArchAwareTimer(device=config.device) as timer:
            # Use SDPA context if specified
            with SDPAContext(config.sdpa_backend, self.logger):
                with torch.no_grad():
                    _ = model_to_use.generate(**generation_kwargs)

        return timer.elapsed_time()

    def measure_latency(self, config: BenchmarkConfig) -> TimingResult:
        """Measure full generation latency and compute tokens/sec."""
        model_to_use = self.compiled_model if self.compiled_model is not None else self.model

        # Prepare generation kwargs
        generation_kwargs = self._get_generation_kwargs(config, max_new_tokens=config.num_tokens_to_generate)

        # Use CUDA timer for high-precision measurement
        with ArchAwareTimer(device=config.device) as timer:
            # Use SDPA context if specified
            with SDPAContext(config.sdpa_backend, self.logger):
                with torch.no_grad():
                    outputs = model_to_use.generate(**generation_kwargs)

        # Calculate metrics
        latency = timer.elapsed_time()
        input_length = self.inputs["input_ids"].shape[1]
        output_length = outputs.shape[1]
        tokens_generated = output_length - input_length

        tokens_per_second = tokens_generated / latency if latency > 0 else 0
        time_per_output_token = latency / tokens_generated if tokens_generated > 0 else None

        return TimingResult(
            latency_seconds=latency,
            tokens_per_second=tokens_per_second,
            time_per_output_token_seconds=time_per_output_token,
            total_tokens_generated=tokens_generated,
            metadata={
                "input_length": input_length,
                "output_length": output_length,
                "variant": config.variant,
                "compile_mode": config.compile_mode,
                "attn_implementation": config.attn_implementation,
                "sdpa_backend": config.sdpa_backend,
            },
        )

    def _get_generation_kwargs(self, config: BenchmarkConfig, max_new_tokens: int) -> dict[str, Any]:
        """Get generation kwargs. Override in subclasses for custom generation."""
        generation_config_dict = self.get_default_generation_config()
        generation_kwargs = {
            **self.inputs,
            "max_new_tokens": max_new_tokens,
            "do_sample": generation_config_dict.get("do_sample", False),
            "temperature": generation_config_dict.get("temperature", 1.0),
            "top_p": generation_config_dict.get("top_p", 1.0),
            "pad_token_id": self.tokenizer.pad_token_id,
        }

        # Handle static cache for compiled models
        if self.past_key_values is not None and config.variant == "compiled":
            try:
                from transformers import StaticCache

                # Reset cache for each measurement
                seq_length = self.inputs["input_ids"].shape[1]

                # Get the actual device the model is on
                if hasattr(self.model, "device"):
                    cache_device = self.model.device
                else:
                    cache_device = self.device

                fresh_cache = StaticCache(
                    config=self.model.config,
                    max_batch_size=config.batch_size,
                    max_cache_len=seq_length + max_new_tokens,
                    device=cache_device,
                    dtype=getattr(torch, config.torch_dtype),
                )
                generation_kwargs["past_key_values"] = fresh_cache
            except (ImportError, TypeError) as e:
                self.logger.debug(f"Fresh StaticCache creation failed: {e}")
                pass

        return generation_kwargs


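# A minimal sketch of specializing ModelBenchmark for one model family (illustrative only;
# "LlamaBenchmark", the dtype choice, and the model id are assumptions, not part of this module):
#
#     class LlamaBenchmark(ModelBenchmark):
#         def get_default_torch_dtype(self) -> str:
#             return "bfloat16"
#
#     bench = LlamaBenchmark(logging.getLogger("llama_benchmark"))
#     scenarios = bench.get_scenarios(model_id="meta-llama/Llama-3.2-1B", measurement_iterations=5)

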
class BenchmarkRunner:
    """Main benchmark runner that coordinates benchmark execution."""

    def __init__(self, logger: logging.Logger, output_dir: str = "benchmark_results"):
        self.logger = logger
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def run_benchmark(
        self,
        benchmark: ModelBenchmark,
        scenarios: dict[str, BenchmarkScenario],
        collect_gpu_metrics: bool = True,
        commit_id: Optional[str] = None,
    ) -> dict[str, dict[str, Any]]:
        """
        Run benchmarks using scenarios.

        Args:
            benchmark: The benchmark instance to run
            scenarios: Dictionary mapping scenario names to BenchmarkScenario instances
            collect_gpu_metrics: Whether to collect GPU utilization metrics
            commit_id: Git commit ID for metadata (if not provided, will auto-detect from git)

        Returns:
            Dictionary mapping scenario names to results with statistics
        """
        all_results = {}

        for scenario_name, scenario in scenarios.items():
            self.logger.info(f"Running benchmark scenario: {scenario_name}")
            config = scenario.config

            try:
                # Setup model for this configuration
                benchmark.setup_model(config)

                # Run scenario setup callbacks
                scenario.setup(benchmark.model, benchmark.tokenizer, self.logger)

                # Quick validation: try one measurement first to see if this scenario works
                try:
                    flush_memory()
                    test_result = benchmark.measure_time_to_first_token(config)
                    if test_result is None or test_result <= 0:
                        raise ValueError("Invalid measurement result")
                except Exception as validation_error:
                    self.logger.warning(f"Skipping scenario {scenario_name}: validation failed - {validation_error}")
                    # Clean up and skip this scenario
                    try:
                        scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                        benchmark.cleanup_model()
                    except Exception:
                        pass
                    continue

                # Collect metadata
                metadata = BenchmarkMetadata(
                    timestamp=datetime.utcnow().isoformat(),
                    commit_id=commit_id,
                    hardware_info=get_hardware_info(),
                    config=config,
                )

                # Initialize GPU monitor
                gpu_monitor = None
                if collect_gpu_metrics:
                    gpu_monitor = GPUMonitor(logger=self.logger)

                # Warmup runs
                self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
                warmup_failures = 0
                for i in range(config.warmup_iterations):
                    try:
                        _ = benchmark.measure_latency(config)
                    except Exception as e:
                        warmup_failures += 1
                        self.logger.warning(f"Warmup iteration {i + 1} failed: {e}")

                # If more than half the warmup iterations failed, skip this scenario
                if warmup_failures > config.warmup_iterations // 2:
                    self.logger.warning(
                        f"Skipping scenario {scenario_name}: too many warmup failures ({warmup_failures}/{config.warmup_iterations})"
                    )
                    try:
                        scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                        benchmark.cleanup_model()
                    except Exception:
                        pass
                    continue

                # Start GPU monitoring
                if gpu_monitor:
                    gpu_monitor.start()

                # Measurement runs for latency
                self.logger.info(f"Measuring latency with {config.measurement_iterations} iterations...")
                latency_measurements = []
                ttft_measurements = []
                tokens_per_sec_measurements = []
                itl_measurements = []  # Inter-Token Latency
                measurement_failures = 0

                for i in range(config.measurement_iterations):
                    try:
                        # Measure time to first token
                        ttft = benchmark.measure_time_to_first_token(config)
                        ttft_measurements.append(ttft)

                        # Measure full latency
                        timing_result = benchmark.measure_latency(config)
                        latency_measurements.append(timing_result.latency_seconds)

                        if timing_result.tokens_per_second is not None:
                            tokens_per_sec_measurements.append(timing_result.tokens_per_second)

                        if timing_result.time_per_output_token_seconds is not None:
                            itl_measurements.append(timing_result.time_per_output_token_seconds)

                        itl_str = (
                            f", itl={timing_result.time_per_output_token_seconds:.4f}s/token"
                            if timing_result.time_per_output_token_seconds
                            else ""
                        )
                        self.logger.debug(
                            f"Iteration {i + 1}: latency={timing_result.latency_seconds:.4f}s, ttft={ttft:.4f}s{itl_str}"
                        )

                    except Exception as e:
                        measurement_failures += 1
                        self.logger.warning(f"Measurement iteration {i + 1} failed: {e}")

                # Stop GPU monitoring
                gpu_metrics = {}
                if gpu_monitor:
                    gpu_metrics = gpu_monitor.stop_and_collect()

                # If we don't have enough successful measurements, skip this scenario
                if not latency_measurements or len(latency_measurements) < config.measurement_iterations // 2:
                    self.logger.warning(
                        f"Skipping scenario {scenario_name}: insufficient successful measurements ({len(latency_measurements)}/{config.measurement_iterations})"
                    )
                    try:
                        scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                        benchmark.cleanup_model()
                    except Exception:
                        pass
                    continue

                # Calculate statistics
                scenario_results = {
                    "metadata": asdict(metadata),
                    "measurements": {},
                    "gpu_metrics": gpu_metrics,
                    "scenario_description": scenario.description,
                }

                if latency_measurements:
                    latency_stats = BenchmarkStatistics.from_measurements("latency_seconds", latency_measurements)
                    scenario_results["measurements"]["latency_seconds"] = asdict(latency_stats)

                if ttft_measurements:
                    ttft_stats = BenchmarkStatistics.from_measurements(
                        "time_to_first_token_seconds", ttft_measurements
                    )
                    scenario_results["measurements"]["time_to_first_token_seconds"] = asdict(ttft_stats)

                if tokens_per_sec_measurements:
                    tps_stats = BenchmarkStatistics.from_measurements(
                        "tokens_per_second", tokens_per_sec_measurements, "tokens/sec"
                    )
                    scenario_results["measurements"]["tokens_per_second"] = asdict(tps_stats)

                if itl_measurements:
                    itl_stats = BenchmarkStatistics.from_measurements(
                        "time_per_output_token_seconds", itl_measurements, "seconds/token"
                    )
                    scenario_results["measurements"]["time_per_output_token_seconds"] = asdict(itl_stats)

                # Log summary
                if latency_measurements:
                    self.logger.info(f"Latency: {latency_stats.mean:.4f}±{latency_stats.std:.4f}s (mean±std)")
                if ttft_measurements:
                    self.logger.info(f"TTFT: {ttft_stats.mean:.4f}±{ttft_stats.std:.4f}s (mean±std)")
                if tokens_per_sec_measurements:
                    self.logger.info(f"Throughput: {tps_stats.mean:.2f}±{tps_stats.std:.2f} tokens/sec (mean±std)")
                if itl_measurements:
                    self.logger.info(f"ITL: {itl_stats.mean:.4f}±{itl_stats.std:.4f}s/token (mean±std)")

                # Add note about partial results if some measurements failed
                if measurement_failures > 0:
                    scenario_results["warnings"] = [f"Some measurements failed ({measurement_failures} failures)"]
                    self.logger.info(f"Scenario completed with {measurement_failures} measurement failures")

                # Run scenario teardown callbacks
                scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)

                # Cleanup model
                benchmark.cleanup_model()

                all_results[scenario_name] = scenario_results

            except Exception as e:
                self.logger.warning(f"Skipping scenario {scenario_name}: setup failed - {e}")
                import traceback

                self.logger.debug(traceback.format_exc())

                # Try to clean up if possible
                try:
                    scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                    benchmark.cleanup_model()
                except Exception:
                    pass
                # Skip storing failed scenarios - just continue to the next one
            finally:
                try:
                    scenario.teardown(benchmark.model, benchmark.tokenizer, self.logger)
                    benchmark.cleanup_model()
                except Exception as cleanup_error:
                    self.logger.warning(f"Cleanup failed for scenario {scenario_name}: {cleanup_error}")

                flush_memory()

        return all_results

    def save_results(self, model_name: str, results: dict[str, dict[str, Any]]) -> str:
        """Save benchmark results to JSON file."""
        # Create model-specific subdirectory
        model_dir = os.path.join(self.output_dir, model_name)
        os.makedirs(model_dir, exist_ok=True)

        # Create filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{model_name}_benchmark_{timestamp}.json"
        filepath = os.path.join(model_dir, filename)

        # Prepare output structure
        output_data = {"model_name": model_name, "benchmark_scenarios": []}

        for config_name, config_results in results.items():
            scenario = {
                "scenario_name": config_name,
                "metadata": config_results["metadata"],
                "measurements": config_results["measurements"],
                "gpu_metrics": config_results.get("gpu_metrics", {}),
            }
            output_data["benchmark_scenarios"].append(scenario)

        # Save to JSON file
        with open(filepath, "w") as f:
            json.dump(output_data, f, indent=2, default=str)

        self.logger.info(f"Results saved to {filepath}")
        return filepath
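

# A minimal end-to-end sketch of driving the runner (illustrative only; "MyModelBenchmark"
# is an assumed ModelBenchmark subclass and the model id is an example, not part of this module):
#
#     logger = logging.getLogger("benchmark_v2")
#     runner = BenchmarkRunner(logger, output_dir="benchmark_results")
#     bench = MyModelBenchmark(logger)
#     scenarios = bench.get_scenarios(model_id="gpt2", measurement_iterations=5)
#     results = runner.run_benchmark(bench, scenarios, collect_gpu_metrics=True)
#     runner.save_results("gpt2", results)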