### What this PR does / why we need it?

This is a follow-up patch to #1014 with some convenience optimizations:

- Set a cached dataset path for speed
- Use PyPI to install escli-tool
- Add a benchmark-results convert script to produce a developer-friendly result (see the sketch right after this description)
- Patch `benchmark_dataset.py` to disable streaming dataset loading from the internet
- Add more trigger modes for different purposes: `pr` for debugging, `schedule` for the daily test, `dispatch` and `pr-labled` for manual testing of a single (current) commit
- Disable the latency test for `qwen-2.5-vl` (the latency script does not support multi-modal models yet)

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
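For orientation, the rest of this page is the convert script itself. The sketch below shows, on a single latency result, roughly what the conversion does; the input dict is invented, and only its keys (`test_name`, `avg_latency`, `percentiles`) mirror what the script reads, so treat it as an illustration rather than the script's actual I/O.

```python
# Illustrative only: the input dict is fabricated; only its keys follow what
# the convert script below expects from a benchmark_latency.py result.
import pandas as pd
from tabulate import tabulate

latency_result = {
    "test_name": "latency_example",             # hypothetical test name
    "avg_latency": 1.234,                       # seconds
    "percentiles": {"50": 1.201, "99": 1.899},  # seconds, keyed by percentile
}

row = {
    "Test name": latency_result["test_name"],
    "Mean latency (ms)": latency_result["avg_latency"] * 1000,
    "Median latency (ms)": latency_result["percentiles"]["50"] * 1000,
    "P99 latency (ms)": latency_result["percentiles"]["99"] * 1000,
}
print(tabulate(pd.DataFrame([row]), headers="keys", tablefmt="pipe",
               showindex=False))
```

The script below performs the same mapping through `latency_column_mapping` and handles throughput and serving results analogously.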
import argparse
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

CUR_PATH = Path(__file__).parent.resolve()
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "avg_latency": "Mean latency (ms)",
    "P50": "Median latency (ms)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "num_requests": "Num of reqs",
    "total_num_tokens": "Total num of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "request_rate": "Request rate (req/s)",
    "request_throughput": "Tput (req/s)",
    "output_throughput": "Output Tput (tok/s)",
    "median_ttft_ms": "TTFT (ms)",
    "median_tpot_ms": "TPOT (ms)",
    "median_itl_ms": "ITL (ms)",
}

def read_markdown(file):
    """Return the markdown template contents (plus a trailing newline), or a
    placeholder message if the template file is missing."""
    if os.path.exists(file):
        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    """Serialize the three result DataFrames into a single JSON string."""
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })

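# Note: results_to_json() is used twice in the main block below: once on the
# raw DataFrames (original benchmark keys) and once after the columns are
# renamed, so the JSON embedded in the report matches the table headers.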
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process the results of the benchmark tests.")
    parser.add_argument(
        "--results_folder",
        type=str,
        default="../results/",
        help="The folder where the benchmark results are stored.")
    parser.add_argument(
        "--output_folder",
        type=str,
        default="../results/",
        help="The folder where the markdown report will be written.")
    parser.add_argument("--markdown_template",
                        type=str,
                        default="./perf_result_template.md",
                        help="The template file for the markdown report.")
    parser.add_argument("--tag",
                        default="main",
                        help="Tag to be used for release message.")
    parser.add_argument("--commit_id",
                        default="",
                        help="Commit ID to be used for release message.")
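    # --tag and --commit_id are accepted but not referenced elsewhere in this
    # script; presumably they are consumed by the calling workflow when it
    # assembles the release message.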
    args = parser.parse_args()
    results_folder = (CUR_PATH / args.results_folder).resolve()
    output_folder = (CUR_PATH / args.output_folder).resolve()
    markdown_template = (CUR_PATH / args.markdown_template).resolve()

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to serving_results
            serving_results.append(raw_result)
            continue

        elif "latency" in str(test_file):
            # this result is generated via `benchmark_latency.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to latency_results
            latency_results.append(raw_result)
            continue

        elif "throughput" in str(test_file):
            # this result is generated via `benchmark_throughput.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to throughput_results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")
    serving_results.sort(key=lambda x: (len(x['test_name']), x['test_name']))

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remap the keys for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # write the markdown report
    print(output_folder)
    with open(output_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(markdown_template)
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)
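For reference, a hypothetical way to drive the script programmatically is sketched below; the script filename is an assumption, and the flags are the ones defined by the argparse section above.

```python
# Hypothetical driver: the script filename is an assumption; the flags are
# taken from the argparse definitions in the script above.
import subprocess

subprocess.run(
    [
        "python3",
        "convert_json_to_markdown.py",  # assumed filename of the script above
        "--results_folder", "../results/",
        "--output_folder", "../results/",
        "--markdown_template", "./perf_result_template.md",
        "--tag", "main",
    ],
    check=True,
)
```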