Benchmarking v2 GH workflows (#40716)

* WIP benchmark v2 workflow

* Container was missing

* Change to sandbox branch name

* Wrong place for image name

* Variable declarations

* Remove references to file logging

* Remove unnecessary step

* Fix deps install

* Syntax

* Add workdir

* Add upload feature

* typo

* No need for hf_transfer

* Pass in runner

* Runner config

* Runner config

* Runner config

* Runner config

* Runner config

* mi325 caller

* Name workflow runs properly

* Copy-paste error

* Add final repo IDs and schedule

* Review comments

* Remove wf params

* Remove parametrization from workflow files

* Fix callers

* Change push trigger to pull_request + label

* Add back schedule event

* Push to the same dataset

* Simplify parameter description
Authored by Ákos Hadnagy on 2025-09-19 10:54:49 +02:00, committed by GitHub
parent 5f6e278a51
commit 61eff450d3
7 changed files with 311 additions and 10 deletions

.github/workflows/benchmark_v2.yml (new file)

@@ -0,0 +1,82 @@
name: Benchmark v2 Framework
on:
workflow_call:
inputs:
runner:
description: 'GH Actions runner group to use'
required: true
type: string
commit_sha:
description: 'Commit SHA to benchmark'
required: false
type: string
default: ''
upload_to_hub:
description: 'Whether to upload results to a HuggingFace Dataset'
required: false
type: string
default: 'false'
run_id:
description: 'Custom run ID for organizing results (auto-generated if not provided)'
required: false
type: string
default: ''
benchmark_repo_id:
description: 'HuggingFace Dataset to upload results to (e.g., "org/benchmark-results")'
required: false
type: string
default: ''
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
# For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
# This token is created under the bot `hf-transformers-bot`.
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
jobs:
benchmark-v2:
name: Benchmark v2
runs-on: ${{ inputs.runner }}
if: |
(github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'run-benchmark')) ||
(github.event_name == 'schedule')
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus all --privileged --ipc host --shm-size "16gb"
steps:
- name: Get repo
uses: actions/checkout@v4
with:
ref: ${{ inputs.commit_sha || github.sha }}
- name: Install benchmark dependencies
run: |
python3 -m pip install -r benchmark_v2/requirements.txt
- name: Reinstall transformers in edit mode
run: |
python3 -m pip uninstall -y transformers
python3 -m pip install -e ".[torch]"
- name: Show installed libraries and their versions
run: |
python3 -m pip list
python3 -c "import torch; print(f'PyTorch version: {torch.__version__}')"
python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
python3 -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')" || true
nvidia-smi || true
- name: Run benchmark v2
working-directory: benchmark_v2
run: |
echo "Running benchmarks"
python3 run_benchmarks.py \
--commit-id '${{ inputs.commit_sha || github.sha }}' \
--upload-to-hub '${{ inputs.upload_to_hub || false }}' \
--run-id '${{ inputs.run_id }}' \
--benchmark-repo-id '${{ inputs.benchmark_repo_id }}' \
--log-level INFO
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}


@@ -0,0 +1,20 @@
name: Benchmark v2 Scheduled Runner - A10 Single-GPU
on:
schedule:
# Run daily at 16:30 UTC
- cron: "30 16 * * *"
pull_request:
types: [ opened, labeled, reopened, synchronize ]
jobs:
benchmark-v2-default:
name: Benchmark v2 - Default Models
uses: ./.github/workflows/benchmark_v2.yml
with:
runner: aws-g5-4xlarge-cache-use1-public-80
commit_sha: ${{ github.sha }}
upload_to_hub: true
run_id: ${{ github.run_id }}
benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
secrets: inherit


@@ -0,0 +1,20 @@
name: Benchmark v2 Scheduled Runner - MI325 Single-GPU
on:
schedule:
# Run daily at 16:30 UTC
- cron: "30 16 * * *"
pull_request:
types: [ opened, labeled, reopened, synchronize ]
jobs:
benchmark-v2-default:
name: Benchmark v2 - Default Models
uses: ./.github/workflows/benchmark_v2.yml
with:
runner: amd-mi325-ci-1gpu
commit_sha: ${{ github.sha }}
upload_to_hub: true
run_id: ${{ github.run_id }}
benchmark_repo_id: hf-internal-testing/transformers-daily-benchmarks
secrets: inherit


@@ -21,6 +21,36 @@ python run_benchmarks.py \
--num-tokens-to-generate 200
```
### Uploading Results to HuggingFace Dataset
You can automatically upload benchmark results to a HuggingFace Dataset for tracking and analysis:
```bash
# Upload to a public dataset with auto-generated run ID
python run_benchmarks.py --upload-to-hub username/benchmark-results
# Upload with a custom run ID for easy identification
python run_benchmarks.py --upload-to-hub username/benchmark-results --run-id experiment_v1
```
**Dataset Directory Structure:**
```
dataset_name/
├── 2025-01-15/
│ ├── runs/ # Non-scheduled runs (manual, PR, etc.)
│ │ └── 123-1245151651/ # GitHub run number and ID
│ │ └── benchmark_results/
│ │ ├── benchmark_summary_20250115_143022.json
│ │ └── model-name/
│ │ └── model-name_benchmark_20250115_143022.json
│ └── benchmark_results_abc123de/ # Scheduled runs (daily CI)
│ ├── benchmark_summary_20250115_143022.json
│ └── model-name/
│ └── model-name_benchmark_20250115_143022.json
└── 2025-01-16/
└── ...
```
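To pull results back for local analysis, the same `huggingface_hub` library used for uploading can mirror a single date folder. The sketch below is illustrative only; the dataset name and date are placeholders:

```python
# Download one day's benchmark results from the dataset for local inspection.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="username/benchmark-results",  # dataset the results were uploaded to (placeholder)
    repo_type="dataset",
    allow_patterns=["2025-01-15/*"],       # restrict the download to one date folder
)
print(f"Results downloaded to {local_dir}")
```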
### Running Specific Benchmarks
```bash


@@ -20,7 +20,6 @@ import torch
from benchmark_framework import ModelBenchmark
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "1"
torch.set_float32_matmul_precision("high")


@@ -3,4 +3,5 @@ psutil>=5.8.0
gpustat>=1.0.0
torch>=2.0.0
transformers>=4.30.0
datasets>=2.10.0
huggingface_hub>=0.16.0


@@ -24,6 +24,7 @@ import json
import logging
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
@@ -160,7 +161,12 @@ def run_single_benchmark(
return None
def generate_summary_report(
output_dir: str,
benchmark_results: dict[str, Any],
logger: logging.Logger,
benchmark_run_uuid: Optional[str] = None,
) -> str:
"""Generate a summary report of all benchmark runs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_file = os.path.join(output_dir, f"benchmark_summary_{timestamp}.json")
@@ -168,6 +174,7 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
summary_data = {
"run_metadata": {
"timestamp": datetime.utcnow().isoformat(),
"benchmark_run_uuid": benchmark_run_uuid,
"total_benchmarks": len(benchmark_results),
"successful_benchmarks": len([r for r in benchmark_results.values() if r is not None]),
"failed_benchmarks": len([r for r in benchmark_results.values() if r is None]),
@@ -183,9 +190,115 @@ def generate_summary_report(output_dir: str, benchmark_results: dict[str, Any],
return summary_file
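The `run_metadata` block above defines the schema of the summary file. As an illustration, a minimal sketch (the output directory name is hypothetical) of reading the latest summary back:

```python
# Load the newest summary report and print its run metadata.
import json
from pathlib import Path

summary_path = max(Path("benchmark_results").glob("benchmark_summary_*.json"))  # hypothetical output dir
with open(summary_path) as f:
    summary = json.load(f)

meta = summary["run_metadata"]
print(f"Run UUID: {meta['benchmark_run_uuid']}")
print(f"{meta['successful_benchmarks']}/{meta['total_benchmarks']} benchmarks succeeded")
```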
def upload_results_to_hf_dataset(
output_dir: str,
summary_file: str,
dataset_name: str,
run_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
) -> Optional[str]:
"""
Upload benchmark results to a HuggingFace Dataset.
Based on upload_collated_report() from utils/collated_reports.py
Args:
output_dir: Local output directory containing results
summary_file: Path to the summary file
dataset_name: Name of the HuggingFace dataset to upload to
run_id: Unique run identifier (if None, will generate one)
logger: Logger instance
Returns:
The run_id used for the upload, None if upload failed
"""
if logger is None:
logger = logging.getLogger(__name__)
import os
from huggingface_hub import HfApi
api = HfApi()
if run_id is None:
github_run_number = os.getenv("GITHUB_RUN_NUMBER")
github_run_id = os.getenv("GITHUB_RUN_ID")
if github_run_number and github_run_id:
run_id = f"{github_run_number}-{github_run_id}"
date_folder = datetime.now().strftime("%Y-%m-%d")
github_event_name = os.getenv("GITHUB_EVENT_NAME")
if github_event_name != "schedule":
# Non-scheduled runs go under a runs subfolder
repo_path = f"{date_folder}/runs/{run_id}/benchmark_results"
else:
# Scheduled runs go directly under the date
repo_path = f"{date_folder}/{run_id}/benchmark_results"
logger.info(f"Uploading benchmark results to dataset '{dataset_name}' at path '{repo_path}'")
try:
# Get the authentication token (prioritize specific token, fallback to HF_TOKEN)
token = os.getenv("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN") or os.getenv("HF_TOKEN")
# Upload all files in the output directory
from pathlib import Path
output_path = Path(output_dir)
for file_path in output_path.rglob("*"):
if file_path.is_file():
# Calculate relative path from output_dir
relative_path = file_path.relative_to(output_path)
path_in_repo = f"{repo_path}/{relative_path}"
logger.debug(f"Uploading {file_path} to {path_in_repo}")
api.upload_file(
path_or_fileobj=str(file_path),
path_in_repo=path_in_repo,
repo_id=dataset_name,
repo_type="dataset",
token=token,
commit_message=f"Upload benchmark results for run {run_id}",
)
logger.info(
f"Successfully uploaded results to: https://huggingface.co/datasets/{dataset_name}/tree/main/{repo_path}"
)
return run_id
except Exception as upload_error:
logger.error(f"Failed to upload results: {upload_error}")
import traceback
logger.debug(traceback.format_exc())
return None
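A hedged usage sketch of the helper above outside of CI; the paths and dataset name are placeholders, and authentication falls back to the `HF_TOKEN` environment variable as in the function body:

```python
# Upload an existing local results directory under an explicit run ID.
import logging

logging.basicConfig(level=logging.INFO)
uploaded_run_id = upload_results_to_hf_dataset(
    output_dir="benchmark_results",  # local results folder (placeholder)
    summary_file="benchmark_results/benchmark_summary_20250115_143022.json",  # placeholder
    dataset_name="username/benchmark-results",  # target dataset (placeholder)
    run_id="experiment_v1",  # skip the GITHUB_RUN_NUMBER/GITHUB_RUN_ID fallback
)
if uploaded_run_id is None:
    print("Upload failed; see debug logs for the traceback")
```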
def main():
"""Main entry point for the benchmarking script."""
# Generate a unique UUID for this benchmark run
benchmark_run_uuid = str(uuid.uuid4())[:8]
parser = argparse.ArgumentParser(
description="Run all benchmarks in the ./benches directory",
epilog="""
Examples:
# Run all available benchmarks
python3 run_benchmarks.py
# Run with specific model and upload to HuggingFace Dataset
python3 run_benchmarks.py --model-id meta-llama/Llama-2-7b-hf --upload-to-hub username/benchmark-results
# Run with custom run ID and upload to HuggingFace Dataset
python3 run_benchmarks.py --run-id experiment_v1 --upload-to-hub org/benchmarks
# Run only specific benchmarks with file logging
python3 run_benchmarks.py --include llama --enable-file-logging
""", # noqa: W293
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--output-dir",
@@ -228,20 +341,29 @@ def main():
parser.add_argument("--exclude", type=str, nargs="*", help="Exclude benchmarks matching these names")
parser.add_argument("--enable-mock", action="store_true", help="Enable mock benchmark (skipped by default)")
parser.add_argument("--enable-file-logging", action="store_true", help="Enable file logging (disabled by default)")
parser.add_argument(
"--commit-id", type=str, help="Git commit ID for metadata (if not provided, will auto-detect from git)"
)
parser.add_argument(
"--upload-to-hub",
type=str,
help="Upload results to HuggingFace Dataset (provide dataset name, e.g., 'username/benchmark-results')",
)
parser.add_argument(
"--run-id", type=str, help="Custom run ID for organizing results (if not provided, will generate a unique ID)"
)
args = parser.parse_args()
# Setup logging
logger = setup_logging(args.log_level, args.enable_file_logging)
logger.info("Starting benchmark discovery and execution")
logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Benches directory: {args.benches_dir}")
@@ -286,9 +408,6 @@ def main():
if args.model_id:
benchmark_kwargs["model_id"] = args.model_id
# Add enable_mock flag for mock benchmark
benchmark_kwargs["enable_mock"] = args.enable_mock
# Add commit_id if provided
if args.commit_id:
benchmark_kwargs["commit_id"] = args.commit_id
@@ -306,7 +425,27 @@ def main():
successful_count += 1
# Generate summary report
summary_file = generate_summary_report(args.output_dir, benchmark_results, logger, benchmark_run_uuid)
# Upload results to HuggingFace Dataset if requested
upload_run_id = None
if args.upload_to_hub:
logger.info("=" * 60)
logger.info("UPLOADING TO HUGGINGFACE DATASET")
logger.info("=" * 60)
# Use provided run_id or fall back to the benchmark run UUID
effective_run_id = args.run_id or benchmark_run_uuid
upload_run_id = upload_results_to_hf_dataset(
output_dir=args.output_dir,
summary_file=summary_file,
dataset_name=args.upload_to_hub,
run_id=effective_run_id,
logger=logger,
)
if upload_run_id:
logger.info(f"Upload completed with run ID: {upload_run_id}")
else:
logger.warning("Upload failed - continuing with local results")
# Final summary
total_benchmarks = len(filtered_benchmarks)
@@ -321,6 +460,16 @@ def main():
logger.info(f"Output directory: {args.output_dir}")
logger.info(f"Summary report: {summary_file}")
if args.upload_to_hub:
if upload_run_id:
logger.info(f"HuggingFace Dataset: {args.upload_to_hub}")
logger.info(f"Run ID: {upload_run_id}")
logger.info(
f"View results: https://huggingface.co/datasets/{args.upload_to_hub}/tree/main/{datetime.now().strftime('%Y-%m-%d')}/runs/{upload_run_id}"
)
else:
logger.warning("Upload to HuggingFace Dataset failed")
if failed_count > 0:
logger.warning(f"{failed_count} benchmark(s) failed. Check logs for details.")
return 1