import csv
import os
import re
import sys

# This script takes the logs produced by the benchmark scripts (e.g.,
# torchbench.py) and parses them into a CSV file that summarizes what
# is failing and why. It is kept separate from having the benchmark script
# emit more structured output, as it is often more convenient to iterate
# quickly on log files offline instead of having to change the benchmark
# script and then do a full sweep to see the updates.
#
# This script is not very well written; feel free to rewrite it as necessary.
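#
# Usage sketch ("parse_logs.py" is an assumed file name, not given here;
# the CSV summary goes to stdout):
#   python parse_logs.py benchmark_sweep.log > summary.csv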

assert len(sys.argv) == 2

full_log = open(sys.argv[1]).read()

# If the log contains a gist URL, extract it so we can include it in the CSV
gist_url = ""
m = re.search(r"https://gist.github.com/[a-f0-9]+", full_log)
if m is not None:
    gist_url = m.group(0)

# Split the log into an entry per benchmark
entries = re.split(
    r"(?:cuda (?:train|eval) +([^ ]+)|WARNING:root:([^ ]+) failed to load)", full_log
)[1:]
# Entries schema example:
# `['hf_Bert', None, '
# PASS\nTIMING: entire_frame_compile:1.80925 backend_compile:6e-05\nDynamo produced 1 graph(s) covering 367 ops\n']`
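# The split regex above keys on status lines shaped like these (the model name
# is just the one from the example above):
#   "cuda train hf_Bert"
#   "WARNING:root:hf_Bert failed to load"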


def chunker(seq, size):
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))
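

# For example, chunker([1, 2, 3, 4, 5, 6], 3) yields [1, 2, 3] then [4, 5, 6];
# the loop below uses it to walk `entries` in (name, name2, log) triples.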


c = 0
i = 0

out = csv.DictWriter(
    sys.stdout,
    [
        "bench",
        "name",
        "result",
        "component",
        "context",
        "explain",
        "frame_time",
        "backend_time",
        "graph_count",
        "op_count",
        "graph_breaks",
        "unique_graph_breaks",
    ],
    dialect="excel",
)
out.writeheader()
out.writerow({"explain": gist_url})
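# The row just written carries only the gist URL (if any) in the "explain"
# column; every row after it describes one benchmark entry.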


# Sometimes backtraces will be in third party code, which results
# in very long file names. Delete the absolute path in this case.
def normalize_file(f):
    if "site-packages/" in f:
        return f.split("site-packages/", 2)[1]
    else:
        return os.path.relpath(f)
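

# e.g. "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py"
# -> "torch/_dynamo/utils.py"  (illustrative path, not taken from a real log)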


# Assume we run torchbench, huggingface, timm_models in that order
# (as the output doesn't say which suite a benchmark is part of)
# TODO: make this more robust

bench = "torchbench"

# chunk size 3 = the two capture groups in the split regex, plus the log text
# that follows them
for name, name2, log in chunker(entries, 3):
    if name is None:
        name = name2
    if name.startswith("Albert"):
        bench = "huggingface"
    elif name.startswith("adv_inc"):
        bench = "timm_models"
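    # (Presumably "Albert..." and "adv_inc..." are the first model names each of
    # those suites emits, which is what lets this heuristic detect the switch.)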

    # Payload that will go into the csv
    r = "UNKNOWN"
    explain = ""
    component = ""
    context = ""

    if "PASS" in log:
        r = "PASS"
    if "TIMEOUT" in log:
        r = "FAIL TIMEOUT"
    if "Accuracy failed" in log:
        r = "FAIL ACCURACY"

    # Attempt to extract out useful information from the traceback

    log = log.split(
        "The above exception was the direct cause of the following exception"
    )[0]
    split = log.split("Traceback (most recent call last)", maxsplit=1)
    if len(split) == 2:
        log = split[1]
    log = log.split("Original traceback:")[0]
    m = re.search(
        r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\n([A-Za-z]+(?:Error|Exception|NotImplementedError): ?.*)',
        log,
    )
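    # The regex targets the frame immediately preceding the exception message,
    # i.e. the innermost frame, e.g. (illustrative, not from a real log):
    #   File ".../site-packages/torch/_dynamo/convert_frame.py", line 123, in transform
    #     tracer.run()
    #   NotImplementedError: some unsupported construct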

    if m is not None:
        r = "FAIL"
        component = f"{normalize_file(m.group(1))}:{m.group(2)}"
        context = m.group(3)
        explain = m.group(4)
    else:
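        # Presumably here to catch bare AssertionErrors: with no ": message"
        # suffix they don't match the pattern above.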
        m = re.search(
            r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\nAssertionError', log
        )
        if m is not None:
            r = "FAIL"
            component = f"{normalize_file(m.group(1))}:{m.group(2)}"
            context = m.group(3)
            explain = "AssertionError"

    # Sometimes, the benchmark will say FAIL without any useful info
    # See https://github.com/pytorch/torchdynamo/issues/1910
    if "FAIL" in log:
        r = "FAIL"

    if r == "UNKNOWN":
        c += 1

    backend_time = None
    frame_time = None
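    # Expected shape (taken from the schema example near the top of this file):
    #   "TIMING: entire_frame_compile:1.80925 backend_compile:6e-05"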
    if "TIMING:" in log:
        result = re.search("TIMING:(.*)\n", log).group(1)
        split_str = result.split("backend_compile:")
        if len(split_str) == 2:
            backend_time = float(split_str[1])
            frame_time = float(split_str[0].split("entire_frame_compile:")[1])

    if "STATS:" in log:
        result = re.search("STATS:(.*)\n", log).group(1)
        # call_* op count: 970 | FakeTensor.__torch_dispatch__:35285 | ProxyTorchDispatchMode.__torch_dispatch__:13339
        split_all = result.split("|")
        # TODO: rewrite this to work with arbitrarily many stats

    graph_count = None
    op_count = None
    graph_breaks = None
    unique_graph_breaks = None
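    # Expected log line shape (numbers illustrative):
    #   "Dynamo produced 3 graphs covering 367 ops with 2 graph breaks (1 unique)"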
    if m := re.search(
        r"Dynamo produced (\d+) graphs covering (\d+) ops with (\d+) graph breaks \((\d+) unique\)",
        log,
    ):
        graph_count = m.group(1)
        op_count = m.group(2)
        graph_breaks = m.group(3)
        unique_graph_breaks = m.group(4)

    # If the context string is too long, don't put it in the CSV.
    # This is a hack to try to make it more likely that Google Sheets will
    # offer to split columns.
    if len(context) > 78:
        context = ""

    # Temporary file names are meaningless; report it as generated code in
    # this case.
    if "/tmp/" in component:
        component = "generated code"
        context = ""

    out.writerow(
        {
            "bench": bench,
            "name": name,
            "result": r,
            "component": component,
            "context": context,
            "explain": explain,
            "frame_time": frame_time,
            "backend_time": backend_time,
            "graph_count": graph_count,
            "op_count": op_count,
            "graph_breaks": graph_breaks,
            "unique_graph_breaks": unique_graph_breaks,
        }
    )
    i += 1  # noqa: SIM113


if c:
    print(f"failed to classify {c} entries", file=sys.stderr)