# transformers/benchmark_v2/benches/llama.py

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from typing import Any

import torch
from benchmark_framework import ModelBenchmark


# Import-time process configuration. Both environment variables are written
# unconditionally, so any value already present in the environment is replaced.
os.environ.update(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "TOKENIZERS_PARALLELISM": "1",
    }
)
# Select PyTorch's "high" float32 matmul precision setting for the whole process.
torch.set_float32_matmul_precision("high")


class LLaMABenchmark(ModelBenchmark):
    """LLaMA benchmark expressed as a thin specialisation of ``ModelBenchmark``.

    This class only supplies the LLaMA-specific choices (prompt, scenario
    variants, generation settings, model loading kwargs, default dtype and
    device); everything else is inherited from the base class.
    """

    def __init__(self, logger: logging.Logger):
        super().__init__(logger)
        # Prompt used for this benchmark instead of the base-class default.
        self._default_prompt = "Why dogs are so cute?"

    def get_scenario_configs(self) -> list[dict[str, Any]]:
        """
        Build the list of LLaMA scenario configurations.

        Every entry has the keys ``variant``, ``compile_mode``, ``use_cache``
        and ``description``. The kernelized entry is always included here;
        this method does not consult ``_is_kernelization_available``.

        Returns:
            A new list of scenario configuration dictionaries
        """
        # (variant, compile_mode, description) for each scenario, in order.
        variant_table = (
            ("eager", None, "Eager execution with cache"),
            ("compiled", "max-autotune", "Compiled with max autotune"),
            ("kernelized", "max-autotune", "Kernelized execution"),
        )
        return [
            {
                "variant": variant_name,
                "compile_mode": compile_mode,
                "use_cache": True,
                "description": summary,
            }
            for variant_name, compile_mode, summary in variant_table
        ]

    def _is_kernelization_available(self) -> bool:
        """Report whether the ``kernels`` package can be imported."""
        try:
            from kernels import Mode, kernelize  # noqa: F401
        except ImportError:
            self.logger.debug("Kernelization not available: kernels module not found")
            return False
        return True

    def get_default_generation_config(self) -> dict[str, Any]:
        """Return the generation settings used for LLaMA (greedy, neutral sampling knobs)."""
        generation_settings: dict[str, Any] = {
            "do_sample": False,
            "top_p": 1.0,
            "temperature": 1.0,
            "repetition_penalty": 1.0,
            # Placeholder; expected to be filled in per scenario.
            "max_new_tokens": None,
        }
        return generation_settings

    def get_model_init_kwargs(self, config) -> dict[str, Any]:
        """
        Return the keyword arguments used to initialise the LLaMA model.

        ``config.torch_dtype`` is a dtype name that is looked up as an
        attribute on the ``torch`` module (e.g. ``"float16"``).
        """
        resolved_dtype = getattr(torch, config.torch_dtype)
        return {
            "torch_dtype": resolved_dtype,
            "attn_implementation": config.attn_implementation,
            "use_cache": True,
        }

    def get_default_torch_dtype(self) -> str:
        """Return the name of the default torch dtype for LLaMA."""
        # Half precision is the default for this benchmark.
        return "float16"

    def get_default_device(self) -> str:
        """Return the default device for LLaMA."""
        # CUDA is the default target for this benchmark.
        return "cuda"


def run_llama(logger, output_dir, **kwargs):
    """
    Run LLaMA benchmark with the given configuration.

    Args:
        logger: Logger instance used for progress and error reporting
        output_dir: Output directory for results (passed to ``BenchmarkRunner``)
        **kwargs: Additional configuration options. Recognised keys and their
            defaults when absent:
                model_id: "meta-llama/Llama-2-7b-hf"
                warmup_iterations: 3
                measurement_iterations: 5
                num_tokens_to_generate: 100
                include_sdpa_variants: True
                device: "cuda"
                torch_dtype: "float16"
                batch_size: 1
                commit_id: None

    Returns:
        The value returned by ``runner.save_results`` (the output file) if
        successful, or ``None`` when the runner produced no results

    Raises:
        Exception: Any exception raised while creating or running the
            benchmark is logged and then re-raised unchanged
    """
    from benchmark_framework import BenchmarkRunner

    # Extract parameters with defaults
    model_id = kwargs.get("model_id", "meta-llama/Llama-2-7b-hf")
    warmup_iterations = kwargs.get("warmup_iterations", 3)
    measurement_iterations = kwargs.get("measurement_iterations", 5)
    num_tokens_to_generate = kwargs.get("num_tokens_to_generate", 100)
    include_sdpa_variants = kwargs.get("include_sdpa_variants", True)
    device = kwargs.get("device", "cuda")
    torch_dtype = kwargs.get("torch_dtype", "float16")
    batch_size = kwargs.get("batch_size", 1)
    commit_id = kwargs.get("commit_id")

    logger.info(f"Starting LLaMA benchmark for model: {model_id}")
    logger.info(
        f"Configuration: warmup={warmup_iterations}, measurement={measurement_iterations}, tokens={num_tokens_to_generate}"
    )

    try:
        # Create benchmark instance
        benchmark = LLaMABenchmark(logger)

        # Create scenarios
        scenarios = benchmark.create_scenarios(
            model_id=model_id,
            warmup_iterations=warmup_iterations,
            measurement_iterations=measurement_iterations,
            num_tokens_to_generate=num_tokens_to_generate,
            include_sdpa_variants=include_sdpa_variants,
            device=device,
            torch_dtype=torch_dtype,
            batch_size=batch_size,
        )

        logger.info(f"Created {len(scenarios)} benchmark scenarios")

        # Create runner and execute benchmarks
        runner = BenchmarkRunner(logger, output_dir)
        results = runner.run_benchmark(benchmark, scenarios, commit_id=commit_id)

        if not results:
            logger.warning("No successful benchmark results")
            return None

        # Save results
        # Use the last path component of the ID as the model name. Trailing
        # slashes are stripped first so that a local checkpoint path such as
        # "models/llama/" yields "llama" instead of an empty name.
        model_name = model_id.rstrip("/").split("/")[-1]
        output_file = runner.save_results(model_name, results)

        logger.info(f"LLaMA benchmark completed successfully. Results saved to: {output_file}")
        return output_file

    except Exception as e:
        # Short message at error level; the full traceback is only emitted at
        # debug level before the exception is propagated to the caller.
        logger.error(f"LLaMA benchmark failed: {e}")
        import traceback

        logger.debug(traceback.format_exc())
        raise