[CI/Build] Replace lm-eval gsm8k tests with faster implementation (#23002)
Signed-off-by: mgoin <mgoin64@gmail.com>
@@ -451,13 +451,11 @@ steps:
 - label: LM Eval Small Models # 53min
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

 - label: OpenAI API correctness
   mirror_hardwares: [amdexperimental]
tests/evals/gsm8k/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@
# GSM8K Accuracy Evaluation

This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and a vLLM server for better performance and control.

## Usage

### Run tests with pytest (as in Buildkite)

```bash
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
```

### Run the standalone evaluation script

```bash
# Start the vLLM server first
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000

# Run the evaluation
python tests/evals/gsm8k/gsm8k_eval.py --port 8000
```

## Configuration Format

Model configs in the `configs/` directory use this YAML format:

```yaml
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold: 0.54  # Minimum expected accuracy
num_questions: 1319       # Number of questions (default: full test set)
num_fewshot: 5            # Few-shot examples from the train set
max_model_len: 4096       # Model context length
```
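For reference, a minimal sketch of loading and sanity-checking one of these configs with PyYAML; the `required` key set below is an assumption based on the keys the test code reads, not part of the suite:

```python
# Minimal sketch: load one of the configs above and check its keys.
import yaml

with open("tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml") as f:
    cfg = yaml.safe_load(f)

required = {"model_name", "accuracy_threshold", "num_questions",
            "num_fewshot", "max_model_len"}
missing = required - cfg.keys()
assert not missing, f"Config is missing keys: {missing}"
print(cfg["model_name"], cfg["accuracy_threshold"])
```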
tests/evals/gsm8k/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold: 0.74
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold: 0.31
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold: 0.45
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold: 0.60
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml (new file, 5 lines)
@@ -0,0 +1,5 @@
model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold: 0.375
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
tests/evals/gsm8k/configs/models-small.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
Qwen3-0.6B-FP8.yaml
Llama-3.2-1B-Instruct-INT8-CT.yaml
Llama-3-8B-Instruct-nonuniform-CT.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/conftest.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from pathlib import Path


def pytest_addoption(parser):
    """Add custom command line options."""
    parser.addoption("--config-list-file",
                     default="configs/models-small.txt",
                     help="File containing list of config files to test")
    parser.addoption("--tp-size",
                     default=1,
                     type=int,
                     help="Tensor parallel size")


def pytest_generate_tests(metafunc):
    """Generate test parameters from config files."""
    if "config_filename" in metafunc.fixturenames:
        config_list_file = metafunc.config.getoption("--config-list-file")
        tp_size = metafunc.config.getoption("--tp-size")

        # Handle both relative and absolute paths
        config_list_path = Path(config_list_file)
        if not config_list_path.is_absolute():
            # If relative, try relative to test directory first
            test_dir_path = Path(__file__).parent / config_list_file
            if test_dir_path.exists():
                config_list_path = test_dir_path
            else:
                # Try relative to current working directory
                config_list_path = Path.cwd() / config_list_file

        print(f"Looking for config list at: {config_list_path}")

        config_files = []
        if config_list_path.exists():
            # Determine config directory (same directory as the list file)
            config_dir = config_list_path.parent

            with open(config_list_path) as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#"):
                        config_path = config_dir / line
                        print(f"Checking config file: {config_path}")
                        if config_path.exists():
                            config_files.append(config_path)
                            print(f"  ✓ Found: {config_path}")
                        else:
                            print(f"  ✗ Missing: {config_path}")
        else:
            print(f"Config list file not found: {config_list_path}")

        # Generate test parameters
        if config_files:
            metafunc.parametrize(["config_filename", "tp_size"],
                                 [(config_file, int(tp_size))
                                  for config_file in config_files],
                                 ids=[
                                     f"{config_file.stem}-tp{tp_size}"
                                     for config_file in config_files
                                 ])
        else:
            print("No config files found, test will be skipped")
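A quick illustration of what this hook produces: each test ID is the config file's stem plus the TP size. A standalone sketch, using file names from models-small.txt above:

```python
from pathlib import Path

# Mirrors the id scheme used by pytest_generate_tests above.
config_files = [
    Path("configs/Qwen3-0.6B-FP8.yaml"),
    Path("configs/Qwen1.5-MoE-W4A16-CT.yaml"),
]
tp_size = 1
ids = [f"{cfg.stem}-tp{tp_size}" for cfg in config_files]
print(ids)  # ['Qwen3-0.6B-FP8-tp1', 'Qwen1.5-MoE-W4A16-CT-tp1']
```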
tests/evals/gsm8k/gsm8k_eval.py (new file, 252 lines)
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Isolated GSM8K evaluation script for vLLM serve endpoint.
"""

import argparse
import ast
import asyncio
import json
import os
import time
from collections.abc import Generator
from typing import Optional, Union

import aiohttp
import numpy as np
import regex as re
import requests
from tqdm.asyncio import tqdm

INVALID = -9999999


def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
    """Download and cache a file from a URL."""
    if filename is None:
        filename = os.path.join("/tmp", url.split("/")[-1])

    if os.path.exists(filename):
        return filename

    print(f"Downloading from {url} to {filename}")
    response = requests.get(url, stream=True)
    response.raise_for_status()

    with open(filename, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)

    return filename


def load_gsm8k_data() -> tuple[list[dict], list[dict]]:
    """Load GSM8K train and test data."""
    train_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl"
    test_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"

    train_file = download_and_cache_file(train_url)
    test_file = download_and_cache_file(test_url)

    train_data = list(read_jsonl(train_file))
    test_data = list(read_jsonl(test_file))

    return train_data, test_data


def read_jsonl(filename: str) -> Generator[dict, None, None]:
    """Read a JSONL file."""
    with open(filename) as fin:
        for line in fin:
            if not line.startswith("#"):
                yield json.loads(line)


def get_answer_value(answer_str: str) -> int:
    """Extract the final numerical answer from a response string."""
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if len(numbers) < 1:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except SyntaxError:
        return INVALID

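# Example behavior (illustrative; GSM8K reference answers end in
# "#### <number>", so taking the last number matches that format):
#   get_answer_value("5 + 7 = 12 #### 12")   -> 12
#   get_answer_value("about 12,000 people")  -> 12000  (commas stripped first)
#   get_answer_value("no digits at all")     -> INVALID
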
async def call_vllm_api(session: aiohttp.ClientSession,
                        prompt: str,
                        temperature: float,
                        max_tokens: int,
                        stop: Optional[list[str]] = None,
                        url: Optional[str] = None,
                        seed: Optional[int] = None) -> str:
    """Call vLLM's OpenAI-compatible completions endpoint."""
    data = {
        "prompt": prompt,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stop": stop,
    }
    if seed is not None:
        data["seed"] = seed

    try:
        async with session.post(f"{url}/v1/completions",
                                json=data) as response:
            response.raise_for_status()
            result = await response.json()
            return result["choices"][0]["text"]
    except Exception as e:
        print(f"Error calling vLLM API: {e}")
        return ""


def evaluate_gsm8k(num_questions: int = 1319,
                   num_shots: int = 5,
                   max_tokens: int = 256,
                   host: str = "http://127.0.0.1",
                   port: int = 8000,
                   temperature: float = 0.0,
                   seed: Optional[int] = 42) -> dict[str, Union[float, int]]:
    """
    Evaluate GSM8K accuracy using a vLLM serve endpoint.

    Returns a dict with accuracy, invalid_rate, latency, etc.
    """
    base_url = f"{host}:{port}"

    # Load GSM8K train and test data
    train_data, test_data = load_gsm8k_data()

    # Limit to available test questions
    num_questions = min(num_questions, len(test_data))

    # Build few-shot examples from the train split (like lm-eval does)
    few_shot_examples = ""
    for i in range(num_shots):
        few_shot_examples += (f"Question: {train_data[i]['question']}\n"
                              f"Answer: {train_data[i]['answer']}\n\n")

    # Prepare test questions and labels from the test split
    questions = []
    labels = []
    for i in range(num_questions):
        questions.append(f"Question: {test_data[i]['question']}\nAnswer:")
        labels.append(get_answer_value(test_data[i]["answer"]))

    assert all(label != INVALID for label in labels), "Some labels are invalid"

    # Run evaluation
    async def run_async_evaluation():
        states: list[str] = [""] * num_questions

        async def get_answer(session: aiohttp.ClientSession, i: int) -> str:
            prompt = few_shot_examples + questions[i]
            answer = await call_vllm_api(
                session=session,
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=["Question", "Assistant:", "<|separator|>"],
                url=base_url,
                seed=seed,
            )
            states[i] = answer
            return answer

        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
                total=600)) as session:
            tasks = [get_answer(session, i) for i in range(num_questions)]
            await tqdm.gather(*tasks, desc="Evaluating")

        return states

    print(f"Running GSM8K evaluation: {num_questions} questions, "
          f"{num_shots}-shot")

    tic = time.perf_counter()
    states = asyncio.run(run_async_evaluation())
    latency = time.perf_counter() - tic

    # Compute metrics
    preds = [get_answer_value(state) for state in states]
    accuracy = np.mean(np.array(preds) == np.array(labels))
    invalid_rate = np.mean(np.array(preds) == INVALID)

    result = {
        "accuracy": accuracy,
        "invalid_rate": invalid_rate,
        "latency": latency,
        "questions_per_second": num_questions / latency,
        "num_questions": num_questions,
        "num_shots": num_shots,
        "max_tokens": max_tokens,
        "timestamp": time.time(),
    }

    return result


def main() -> None:
    parser = argparse.ArgumentParser(
        description="GSM8K evaluation for vLLM serve")
    parser.add_argument("--num-shots",
                        type=int,
                        default=5,
                        help="Number of few-shot examples")
    parser.add_argument("--num-questions",
                        type=int,
                        default=1319,
                        help="Number of questions to evaluate")
    parser.add_argument("--max-tokens",
                        type=int,
                        default=256,
                        help="Max tokens for generation")
    parser.add_argument("--host",
                        type=str,
                        default="http://127.0.0.1",
                        help="Host URL")
    parser.add_argument("--port", type=int, default=8000, help="Port number")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.0,
                        help="Temperature for generation")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="Random seed for reproducibility")
    parser.add_argument("--save-results",
                        type=str,
                        help="Save results to JSON file")

    args = parser.parse_args()

    result = evaluate_gsm8k(
        num_questions=args.num_questions,
        num_shots=args.num_shots,
        max_tokens=args.max_tokens,
        host=args.host,
        port=args.port,
        temperature=args.temperature,
        seed=args.seed,
    )

    # Print results to terminal
    print("\nResults:")
    print(f"Accuracy: {result['accuracy']:.3f}")
    print(f"Invalid responses: {result['invalid_rate']:.3f}")
    print(f"Total latency: {result['latency']:.3f} s")
    print(f"Questions per second: {result['questions_per_second']:.3f}")

    # Optional file saving
    if args.save_results:
        with open(args.save_results, "w") as f:
            json.dump(result, f, indent=2)
        print(f"Results saved to {args.save_results}")


if __name__ == "__main__":
    main()
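The evaluator can also be driven programmatically; a minimal sketch, assuming a vLLM server is already listening on port 8000 and the snippet is run from within `tests/evals/gsm8k/`:

```python
from gsm8k_eval import evaluate_gsm8k

# Assumes `vllm serve <model> --port 8000` is already running.
result = evaluate_gsm8k(num_questions=100, num_shots=5, port=8000)
print(f"accuracy={result['accuracy']:.3f}, "
      f"invalid_rate={result['invalid_rate']:.3f}")
```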
tests/evals/gsm8k/test_gsm8k_correctness.py (new file, 90 lines)
@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
GSM8K evaluation using a vLLM server and the isolated GSM8K script.
Replacement for lm-eval-harness with better performance and control.

Usage:
pytest -s -v test_gsm8k_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import yaml

from tests.utils import RemoteOpenAIServer

from .gsm8k_eval import evaluate_gsm8k

RTOL = 0.08  # Accuracy tolerance (an absolute offset below the threshold)


def launch_gsm8k_eval(eval_config, server_url, tp_size):
    """Launch GSM8K evaluation using our isolated script."""
    # Extract host and port from the server URL
    if "://" in server_url:
        server_url = server_url.split("://")[1]

    host_port = server_url.split("/")[0]  # Remove path if present
    if ":" in host_port:
        host, port = host_port.split(":")
        port = int(port)
    else:
        host = host_port
        port = 8000

    # Add http:// prefix if not present
    if not host.startswith("http"):
        host = f"http://{host}"

    # Run GSM8K evaluation
    results = evaluate_gsm8k(
        num_questions=eval_config["num_questions"],
        num_shots=eval_config["num_fewshot"],
        host=host,
        port=port,
    )

    return results


def test_gsm8k_correctness_param(config_filename, tp_size):
    """Test GSM8K correctness for a given model configuration."""
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    # Server arguments
    server_args = [
        "--max-model-len",
        str(eval_config.get("max_model_len", 4096)),
        "--enforce-eager",
        "--trust-remote-code",
        "--tensor-parallel-size",
        str(tp_size),
    ]

    # Launch server and run evaluation
    with RemoteOpenAIServer(eval_config["model_name"],
                            server_args,
                            max_wait_seconds=480) as remote_server:
        server_url = remote_server.url_for("v1")

        results = launch_gsm8k_eval(eval_config, server_url, tp_size)

        # Check accuracy against threshold
        measured_accuracy = results["accuracy"]
        expected_accuracy = eval_config["accuracy_threshold"]

        print(f"GSM8K Results for {eval_config['model_name']}:")
        print(f"  Accuracy: {measured_accuracy:.3f}")
        print(f"  Expected: {expected_accuracy:.3f}")
        print(f"  Questions: {results['num_questions']}")
        print(f"  Invalid rate: {results['invalid_rate']:.3f}")
        print(f"  Latency: {results['latency']:.1f}s")
        print(f"  QPS: {results['questions_per_second']:.1f}")

        # Verify accuracy is within tolerance
        assert measured_accuracy >= expected_accuracy - RTOL, (
            f"Accuracy too low: {measured_accuracy:.3f} < "
            f"{expected_accuracy:.3f} - {RTOL:.3f}")

        print(f"✅ GSM8K test passed for {eval_config['model_name']}")
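Note that the pass condition is an absolute offset below the configured threshold, not a ratio; a worked instance with hypothetical numbers:

```python
RTOL = 0.08
expected_accuracy = 0.54   # accuracy_threshold from the model's YAML config
measured_accuracy = 0.47   # hypothetical measurement
# Passes because 0.47 >= 0.54 - 0.08 = 0.46
assert measured_accuracy >= expected_accuracy - RTOL
```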