Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Constant-time access of the first value in a collection. Accessing the first value directly is a constant-time operation, whereas converting the collection to a list just to take the first item is linear. The lint rule is turned on, which automatically autofixes and enforces this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/115507 Approved by: https://github.com/malfet
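For illustration, a minimal sketch of the pattern this refers to (the dictionary below is made up; the script in this file applies the same idea when it picks the first benchmark key with next(iter(data.keys()))):

d = {"torchbench": 1.0, "huggingface": 2.0}
first_linear = list(d.keys())[0]        # materializes every key first: O(n)
first_constant = next(iter(d.keys()))   # stops after the first key: O(1)
assert first_linear == first_constant == "torchbench"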
145 lines
4.4 KiB
Python
import logging
import os
import re
from collections import defaultdict

import click
import pandas as pd
from tabulate import tabulate


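# Aggregates multiplicative metrics (speedups, compression ratios) via the
# geometric mean, e.g. gmean(pd.Series([2.0, 8.0])) == 4.0.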
def gmean(s):
    return s.product() ** (1 / len(s))


def find_csv_files(path, perf_compare):
    """
    Recursively search for all CSV files in directory and subdirectories whose
    name contains a target string.
    """

    def is_csv(f):
        if perf_compare:
            regex = r"training_(torchbench|huggingface|timm_models)\.csv"
            return re.match(regex, f) is not None
        else:
            return f.endswith("_performance.csv")

    csv_files = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if is_csv(file):
                csv_files.append(os.path.join(root, file))
    return csv_files


@click.command()
@click.argument("directory", default="artifacts")
@click.option("--amp", is_flag=True)
@click.option("--float32", is_flag=True)
@click.option(
    "--perf-compare",
    is_flag=True,
    help="Set if the CSVs were generated by running the action manually rather than by picking them up from the nightly job",
)
def main(directory, amp, float32, perf_compare):
    """
    Given a directory containing multiple CSVs from --performance benchmark
    runs, aggregates and generates summary statistics similar to the web UI at
    https://torchci-git-fork-huydhn-add-compilers-bench-74abf8-fbopensource.vercel.app/benchmark/compilers

    This is most useful if you've downloaded CSVs from CI and need to quickly
    look at aggregate stats. The CSVs are expected to follow exactly the same
    naming convention that is used in CI.

    You may also be interested in
    https://docs.google.com/document/d/1DQQxIgmKa3eF0HByDTLlcJdvefC4GwtsklJUgLs09fQ/edit#
    which explains how to interpret the raw csv data.
    """
    dtypes = ["amp", "float32"]
    if amp and not float32:
        dtypes = ["amp"]
    if float32 and not amp:
        dtypes = ["float32"]

    dfs = defaultdict(list)
    for f in find_csv_files(directory, perf_compare):
        try:
            dfs[os.path.basename(f)].append(pd.read_csv(f))
        except Exception:
            logging.warning("failed parsing %s", f)
            raise

    # dtype -> statistic -> benchmark -> compiler -> value
    results = defaultdict(  # dtype
        lambda: defaultdict(  # statistic
            lambda: defaultdict(dict)  # benchmark -> compiler -> value
        )
    )
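    # Illustration of the resulting nesting (the compiler key and the value are hypothetical):
    #   results["cuda amp training"]["speedup"]["torchbench"]["inductor"] = 1.23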

    for k, v in sorted(dfs.items()):
        if perf_compare:
            regex = r"training_(torchbench|huggingface|timm_models)\.csv"
            m = re.match(regex, k)
            assert m is not None, k
            compiler = "inductor"
            benchmark = m.group(1)
            dtype = "float32"
            mode = "training"
            device = "cuda"
        else:
            regex = (
                "(.+)_"
                "(torchbench|huggingface|timm_models)_"
                "(float32|amp)_"
                "(inference|training)_"
                "(cpu|cuda)_"
                r"performance\.csv"
            )
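            # A matching filename looks like
            #   "<compiler>_huggingface_amp_training_cuda_performance.csv",
            # where <compiler> is whatever prefix the CI job used.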
            m = re.match(regex, k)
            compiler = m.group(1)
            benchmark = m.group(2)
            dtype = m.group(3)
            mode = m.group(4)
            device = m.group(5)

        df = pd.concat(v)
        df = df.dropna().query("speedup != 0")

        statistics = {
            "speedup": gmean(df["speedup"]),
            "comptime": df["compilation_latency"].mean(),
            "memory": gmean(df["compression_ratio"]),
        }
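        # Speedup and memory compression are ratio metrics, so they are summarized
        # with a geometric mean; compilation latency uses a plain arithmetic mean.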

        if dtype not in dtypes:
            continue

        for statistic, v in statistics.items():
            results[f"{device} {dtype} {mode}"][statistic][benchmark][compiler] = v

    descriptions = {
        "speedup": "Geometric mean speedup",
        "comptime": "Mean compilation time",
        "memory": "Peak memory compression ratio",
    }

    for dtype_mode, r in results.items():
        print(f"# {dtype_mode} performance results")
        for statistic, data in r.items():
            print(f"## {descriptions[statistic]}")

            table = []
            for row_name in data[next(iter(data.keys()))]:
                row = [row_name]
                for col_name in data:
                    row.append(round(data[col_name][row_name], 2))
                table.append(row)

            headers = list(data.keys())
            print(tabulate(table, headers=headers))
            print()


main()