Benchmarking v2 GH workflows (#40716)

* WIP benchmark v2 workflow

* Container was missing

* Change to sandbox branch name

* Wrong place for image name

* Variable declarations

* Remove references to file logging

* Remove unnecessary step

* Fix deps install

* Syntax

* Add workdir

* Add upload feature

* typo

* No need for hf_transfer

* Pass in runner

* Runner config

* Runner config

* Runner config

* Runner config

* Runner config

* mi325 caller

* Name workflow runs properly

* Copy-paste error

* Add final repo IDs and schedule

* Review comments

* Remove wf params

* Remove parametrization from workflow files

* Fix callers

* Change push trigger to pull_request + label

* Add back schedule event

* Push to the same dataset

* Simplify parameter description
Author: Ákos Hadnagy
Date: 2025-09-19 10:54:49 +02:00
Committed by: GitHub
Parent: 5f6e278a51
Commit: 61eff450d3
7 changed files with 311 additions and 10 deletions

.github/workflows/benchmark_v2.yml (new file)

@@ -0,0 +1,82 @@
name: Benchmark v2 Framework
on:
workflow_call:
inputs:
runner:
description: 'GH Actions runner group to use'
required: true
type: string
commit_sha:
description: 'Commit SHA to benchmark'
required: false
type: string
default: ''
upload_to_hub:
description: 'Uploading results to a HuggingFace Dataset'
required: false
type: string
default: 'false'
run_id:
description: 'Custom run ID for organizing results (auto-generated if not provided)'
required: false
type: string
default: ''
benchmark_repo_id:
description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
required: false
type: string
default: ''
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
# For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
# This token is created under the bot `hf-transformers-bot`.
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
jobs:
benchmark-v2:
name: Benchmark v2
runs-on: ${{ inputs.runner }}
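    # Run only for pull requests carrying the 'run-benchmark' label, or on the scheduled trigger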
if: |
(github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark')) ||
(github.event_name == 'schedule')
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus all --privileged --ipc host --shm-size "16gb"
steps:
- name: Get repo
uses: actions/checkout@v4
with:
ref: ${{ inputs.commit_sha || github.sha }}
- name: Install benchmark dependencies
run: |
python3 -m pip install -r benchmark_v2/requirements.txt
- name: Reinstall transformers in edit mode
run: |
python3 -m pip uninstall -y transformers
python3 -m pip install -e ".[torch]"
- name: Show installed libraries and their versions
run: |
python3 -m pip list
python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
nvidia-smi || true
- name: Run benchmark v2
working-directory: benchmark_v2
run: |
echo "Running benchmarks"
python3 run_benchmarks.py \
--commit-id '${{ inputs.commit_sha || github.sha }}' \
--upload-to-hub '${{ inputs.upload_to_hub || false}}' \
--run-id '${{ inputs.run_id }}' \
--benchmark-repo-id '${{ inputs.benchmark_repo_id}}' \
--log-level INFO
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}

@@ -0,0 +1,20 @@
name: Benchmark v2 Scheduled Runner - A10 Single-GPU
on:
schedule:
# Run daily at 16:30 UTC
- cron: "30 16 * * *"
pull_request:
types: [ opened, labeled, reopened, synchronize ]
jobs:
benchmark-v2-default:
name: Benchmark v2 - Default Models
uses: ./.github/workflows/benchmark_v2.yml
with:
runner: aws-g5-4xlarge-cache-use1-public-80
commit_sha: ${{ github.sha }}
upload_to_hub: true
run_id: ${{ github.run_id }}
benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
secrets: inherit

@@ -0,0 +1,20 @@
name: Benchmark v2 Scheduled Runner - MI325 Single-GPU
on:
schedule:
# Run daily at 16:30 UTC
- cron: "30 16 * * *"
pull_request:
types: [ opened, labeled, reopened, synchronize ]
jobs:
benchmark-v2-default:
name: Benchmark v2 - Default Models
uses: ./.github/workflows/benchmark_v2.yml
with:
runner: amd-mi325-ci-1gpu
commit_sha: ${{ github.sha }}
upload_to_hub: true
run_id: ${{ github.run_id }}
benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
secrets: inherit

@@ -21,6 +21,36 @@ python run_benchmarks.py \
--num-tokens-to-generate 200
```
### Uploading Results to HuggingFace Dataset
You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:
```bash
# Upload to a public dataset with auto-generated run ID
python run_benchmarks.py --upload-to-hub username/benchmark-results
# Upload with a custom run ID for easy identification
python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1
```
**Dataset Directory Structure:**
```
dataset_name/
├── 2025-01-15/
│ ├── runs/ # Non-scheduled runs (manual, PR, etc.)
│ │ └── 123-1245151651/ # GitHub run number and ID
│ │ └── benchmark_results/
│ │ ├── benchmark_summary_20250115_143022.json
│ │ └── model-name/
│ │ └── model-name_benchmark_20250115_143022.json
│ └── benchmark_results_abc123de/ # Scheduled runs (daily CI)
│ ├── benchmark_summary_20250115_143022.json
│ └── model-name/
│ └── model-name_benchmark_20250115_143022.json
└── 2025-01-16/
└── ...
```
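To pull a given day's results back down for local analysis, a minimal sketch along these lines should work (the dataset name and date below are placeholders; `huggingface_hub` is already listed in `requirements.txt`):

```python
from huggingface_hub import snapshot_download

# Fetch only one day's folder from the (placeholder) results dataset
local_path = snapshot_download(
    repo_id="username/benchmark-results",
    repo_type="dataset",
    allow_patterns=["2025-01-15/*"],
    local_dir="downloaded_benchmarks",
)
print(f"Results downloaded to: {local_path}")
```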
### Running Specific Benchmarks
```bash

@@ -20,7 +20,6 @@ import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")

@@ -3,4 +3,5 @@ psutil>=5.8.0
gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
datasets>=2.10.0
huggingface_hub>=0.16.0

@@ -24,6 +24,7 @@ import json
import logging
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
@@ -160,7 +161,12 @@ def run_single_benchmark(
return None
def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any], logger: logging.Logger) -> str:
def generate_summary_report(
output_dir: str,
benchmark_results: dict[str, Any],
logger: logging.Logger,
benchmark_run_uuid: Optional[str] = None,
) -> str:
"""Generate a summary report of all benchmark runs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
summary_data = {
"run_metadata": {
"timestamp": datetime.utcnow().isoformat(),
"benchmark_run_uuid": benchmark_run_uuid,
"total_benchmarks": len(benchmark_results),
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
@@ -183,9 +190,115 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
return summary_file
def upload_results_to_hf_dataset(
output_dir: str,
summary_file: str,
dataset_name: str,
run_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
) -> Optional[str]:
"""
Upload benchmark results to a HuggingFace Dataset.
Based on upload_collated_report() from utils/collated_reports.py
Args:
output_dir: Local output directory containing results
summary_file: Path to the summary file
dataset_name: Name of the HuggingFace dataset to upload to
run_id: Unique run identifier (if None, will generate one)
logger: Logger instance
Returns:
The run_id used for the upload, None if upload failed
"""
if logger is None:
logger = logging.getLogger(__name__)
import os
from huggingface_hub import HfApi
api = HfApi()
if run_id is None:
github_run_number = os.getenv("GITHUB_RUN_NUMBER")
github_run_id = os.getenv("GITHUB_RUN_ID")
if github_run_number and github_run_id:
run_id = f"{github_run_number}-{github_run_id}"
date_folder = datetime.now().strftime("%Y-%m-%d")
github_event_name = os.getenv("GITHUB_EVENT_NAME")
if github_event_name != "schedule":
# Non-scheduled runs go under a runs subfolder
repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
else:
# Scheduled runs go directly under the date
repo_path = f"{date_folder}/{run_id}/benchmark_results"
logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")
try:
# Get the authentication token (prioritize specific token, fallback to HF_TOKEN)
token = os.getenv("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN") or os.getenv("HF_TOKEN")
# Upload all files in the output directory
from pathlib import Path
output_path = Path(output_dir)
for file_path in output_path.rglob("*"):
if file_path.is_file():
# Calculate relative path from output_dir
relative_path = file_path.relative_to(output_path)
path_in_repo = f"{repo_path}/{relative_path}"
logger.debug(f"Uploading {file_path} to {path_in_repo}")
api.upload_file(
path_or_fileobj=str(file_path),
path_in_repo=path_in_repo,
repo_id=dataset_name,
repo_type="dataset",
token=token,
commit_message=f"Upload benchmark results for run {run_id}",
)
logger.info(
f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
)
return run_id
except Exception as upload_error:
logger.error(f"Failed to upload results: {upload_error}")
import traceback
logger.debug(traceback.format_exc())
return None
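# A minimal usage sketch for the helper above (illustrative values; a write-enabled
# token must be available via TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN or HF_TOKEN):
#
#   upload_results_to_hf_dataset(
#       output_dir="benchmark_results",
#       summary_file="benchmark_results/benchmark_summary_20250115_143022.json",
#       dataset_name="username/benchmark-results",
#       run_id="experiment_v1",
#   )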
def main():
"""Main entry point for the benchmarking script."""
parser = argparse.ArgumentParser(description="Run all benchmarks in the ./benches directory")
# Generate a unique UUID for this benchmark run
benchmark_run_uuid = str(uuid.uuid4())[:8]
parser = argparse.ArgumentParser(
description="Run all benchmarks in the ./benches directory",
epilog="""
Examples:
# Run all available benchmarks
python3 run_benchmarks.py
# Run with specific model and upload to HuggingFace Dataset
python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hub username/benchmark-results
# Run with custom run ID and upload to HuggingFace Dataset
python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hub org/benchmarks
# Run only specific benchmarks with file logging
python3 run_benchmarks.py --include llama --enable-file-logging
""", # noqa: W293
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--output-dir",
@@ -228,20 +341,29 @@ def main():
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
parser.add_argument(
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
parser.add_argument(
"--upload-to-hub",
type=str,
help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')",
)
parser.add_argument(
"--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)"
)
args = parser.parse_args()
# Setup logging
logger = setup_logging(args.log_level, args.enable_file_logging)
logger.info("Starting benchmark discovery and execution")
logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Benches directory: {args.benches_dir}")
@@ -286,9 +408,6 @@ def main():
if args.model_id:
benchmark_kwargs["model_id"] = args.model_id
# Add enable_mock flag for mock benchmark
benchmark_kwargs["enable_mock"] = args.enable_mock
# Add commit_id if provided
if args.commit_id:
benchmark_kwargs["commit_id"] = args.commit_id
@@ -306,7 +425,27 @@ def main():
successful_count += 1
# Generate summary report
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger)
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid)
# Upload results to HuggingFace Dataset if requested
upload_run_id = None
if args.upload_to_hub:
logger.info("=" * 60)
logger.info("UPLOADING TO HUGGINGFACE DATASET")
logger.info("=" * 60)
# Use provided run_id or fallback to benchmark run UUID
effective_run_id = args.run_id or benchmark_run_uuid
upload_run_id = upload_results_to_hf_dataset(
output_dir=args.output_dir,
summary_file=summary_file,
dataset_name=args.upload_to_hub,
run_id=effective_run_id,
logger=logger,
)
if upload_run_id:
logger.info(f"Upload completed with run ID: {upload_run_id}")
else:
logger.warning("Upload failed - continuing with local results")
# Final summary
total_benchmarks = len(filtered_benchmarks)
@@ -321,6 +460,16 @@ def main():
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Summary report: {summary_file}")
if args.upload_to_hub:
if upload_run_id:
logger.info(f"HuggingFace Dataset: {args.upload_to_hub}")
logger.info(f"Run ID: {upload_run_id}")
logger.info(
f"View results: https://huggingface.co/datasets/{args.upload_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}"
)
else:
logger.warning("Upload to HuggingFace Dataset failed")
if failed_count > 0:
logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
return 1