Add standard deviation of metrics over runs to inference benchmark (#113309)

Run each `(batch_size, compile)` benchmark 10 times in `./runner.sh` and report the mean and standard deviation of the metrics in the output table

Only report `warmup_latency`, `average_latency`, `throughput`, and `gpu_util`

Break the `output.md` file into a separate markdown file per `(batch_size, compile)` configuration. Further runs of `./runner.sh` will append one row to the table in each file for easy comparison.
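For reference, the per-configuration aggregation reduces to a mean/standard deviation over the rows of that configuration's csv; a minimal sketch, assuming the csv path and column names written by `server.py` and `process_metrics.py` in this PR:
```
# Minimal sketch of the aggregation; assumes results/output_32_true.csv holds
# one row per run with the metric columns written by server.py.
import pandas as pd

df = pd.read_csv("results/output_32_true.csv")
for metric in ["warmup_latency", "average_latency", "throughput", "gpu_util"]:
    print(f"{metric}: {df[metric].mean():.3f} +/- {df[metric].std():.3f}")
```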

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113309
Approved by: https://github.com/albanD
This commit is contained in:
Mikayla Gawarecki
2023-11-08 14:43:06 -08:00
committed by PyTorch MergeBot
parent d977f118ad
commit b0c9ccdc4b
15 changed files with 129 additions and 69 deletions

View File

@ -27,6 +27,7 @@ The togglable command line arguments to the script are as follows:
- `model_dir` (default: '.'): the directory to load the checkpoint from
- `compile` (default: `--compile`) or `--no-compile`: whether to `torch.compile()`
the model
- `output_file` (default: output.csv): The name of the csv file to write the outputs to in the `results/` directory.
e.g. A sample command to run the benchmark
@ -34,31 +35,17 @@ e.g. A sample command to run the benchmark
python -W ignore server.py --num_iters 1000 --batch_size 32
```
A sample output is
The results will be found in `results/output.csv`, which will be appended to if the file already exists.
```
torch.load() time: 3.95351 s
m.compile() time (not actual first compilation): 3.41085 s
Warmup latency: 15.92736 s
Average latency (exclude warmup): 0.09556 +/- 0.07029 s
Max latency: 0.60715 s
Min latency: 0.05200 s
Throughput (exclude warmup): 334.85437 samples per second
Average GPU utilization: 20.74092
```
Note that `m.compile()` time above is not the time for the model to be compiled,
Note that `m.compile()` time in the csv file is not the time for the model to be compiled,
which happens during the first iteration, but rather the time for PT2 components
to be lazily imported (e.g. triton).
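To make the distinction concrete, a minimal sketch (not part of the benchmark scripts; assumes a CUDA device and the torchvision `resnet18` this benchmark uses):
```
# Hedged illustration: torch.compile() returns almost immediately; the real
# compilation cost is paid on the first forward pass with actual inputs.
import time

import torch
import torchvision

model = torchvision.models.resnet18().eval().cuda()
x = torch.randn(32, 3, 224, 224, device="cuda")

start = time.perf_counter()
model = torch.compile(model)  # lazy: mostly imports/setup, no compilation yet
compile_call = time.perf_counter() - start

start = time.perf_counter()
with torch.no_grad():
    model(x)  # first iteration triggers the actual compilation
torch.cuda.synchronize()
first_iter = time.perf_counter() - start

print(f"torch.compile() call: {compile_call:.3f} s, first iteration: {first_iter:.3f} s")
```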
### Running a sweep
The script `runner.sh` will run a sweep of the benchmark over different batch
sizes with compile on and off. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark.
To run the script
```
./runner.sh <filename>.md
```
will create `results/<filename>/md`.
sizes with compile on and off, and collect the mean and standard deviation of warmup latency,
average latency, throughput, and GPU utilization for each configuration. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark: `results/output_{batch_size}_{compile}.md`
will contain the mean and standard deviation of results for a given batch size and compile setting.
If the file already exists, the metrics from the run will be appended as a new row in the markdown table.
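As a quick way to eyeball a sweep, a minimal sketch (not part of the benchmark scripts, assuming the file layout written by `process_metrics.py`) that prints the newest row of each per-config file:
```
# Hedged sketch: show the header and the most recent row of every
# results/output_{batch_size}_{compile}.md written by process_metrics.py.
import glob

for path in sorted(glob.glob("results/output_*_*.md")):
    with open(path) as f:
        lines = [line for line in f.read().splitlines() if line.strip()]
    # lines[0] is "## Batch Size {bs} Compile {compile}", lines[-1] is the latest row
    print(f"{lines[0]}: {lines[-1]}")
```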

View File

@ -0,0 +1,49 @@
"""
This file will take the csv outputs from server.py, calculate the mean and
variance of the warmup_latency, average_latency, throughput and gpu_util
and write these to the corresponding `results/output_{batch_size}_{compile}.md`
file, appending to the file if it exists or creatng a new one otherwise.
"""
import argparse
import os

import pandas as pd

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parse output files")
    parser.add_argument("--csv", type=str, help="Path to csv file")
    args = parser.parse_args()

    input_csv = "./results/" + args.csv
    df = pd.read_csv(input_csv)
    batch_size = int(os.path.basename(args.csv).split("_")[1])
    compile = os.path.basename(args.csv).split("_")[-1].split(".")[0]

    # Calculate mean and standard deviation for a subset of metrics
    metrics = ["warmup_latency", "average_latency", "throughput", "gpu_util"]
    means = dict()
    stds = dict()
    for metric in metrics:
        means[metric] = df[metric].mean()
        stds[metric] = df[metric].std()

    output_md = f"results/output_{batch_size}_{compile}.md"
    write_header = os.path.isfile(output_md) is False

    with open(output_md, "a+") as f:
        if write_header:
            f.write(f"## Batch Size {batch_size} Compile {compile}\n\n")
            f.write(
                "| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |\n"
            )
            f.write(
                "| ------------------ | ------------------- | ------------------------ | ------------------- |\n"
            )
        line = "|"
        for metric in metrics:
            line += f" {means[metric]:.3f} +/- {stds[metric]:.3f} |"
        f.write(line + "\n")

View File

@ -1,12 +0,0 @@
| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |
| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |
| 1, true | 4.96503 | 17.48787 | 0.13005 | 0.29250 | 0.03008 | 7.68938 | 1.56557 |
| 1, false | 5.10042 | 9.25073 | 0.29875 | 0.53307 | 0.03849 | 3.34732 | 2.17808 |
| 32, true | 4.63273 | 16.52005 | 0.07233 | 0.17758 | 0.05606 | 442.38822 | 9.59890 |
| 32, false | 4.56469 | 7.40872 | 0.09007 | 0.32917 | 0.05411 | 355.27932 | 12.41176 |
| 64, true | 6.38863 | 19.92130 | 0.16717 | 0.36933 | 0.10566 | 382.83639 | 11.85784 |
| 64, false | 6.55728 | 11.15943 | 0.15646 | 0.51614 | 0.09775 | 409.05319 | 15.72000 |
| 128, true | 3.96179 | 15.54395 | 0.22426 | 0.41163 | 0.17372 | 570.75645 | 19.71206 |
| 128, false | 4.07400 | 7.41854 | 0.22373 | 0.31546 | 0.17344 | 572.11860 | 22.69027 |
| 256, true | 5.87015 | 18.38878 | 0.46226 | 0.68052 | 0.35923 | 553.79917 | 27.11622 |
| 256, false | 4.68391 | 8.14502 | 0.45867 | 0.69048 | 0.34811 | 558.12956 | 30.28707 |

View File

@ -0,0 +1,5 @@
## Batch Size 128 Compile false
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.895 +/- 1.863 | 0.221 +/- 0.005 | 579.469 +/- 13.452 | 22.797 +/- 0.948 |

View File

@ -0,0 +1,5 @@
## Batch Size 128 Compile true
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 19.006 +/- 2.697 | 0.221 +/- 0.003 | 579.121 +/- 7.831 | 19.304 +/- 2.208 |

View File

@ -0,0 +1,5 @@
## Batch Size 1 Compile false
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 7.317 +/- 0.602 | 0.268 +/- 0.057 | 3.864 +/- 0.739 | 2.560 +/- 0.245 |

View File

@ -0,0 +1,5 @@
## Batch Size 1 Compile true
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 18.353 +/- 1.888 | 0.172 +/- 0.081 | 7.080 +/- 3.199 | 1.414 +/- 0.154 |

View File

@ -0,0 +1,5 @@
## Batch Size 256 Compile false
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.555 +/- 1.259 | 0.447 +/- 0.007 | 573.260 +/- 8.471 | 31.546 +/- 0.530 |

View File

@ -0,0 +1,5 @@
## Batch Size 256 Compile true
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.986 +/- 1.737 | 0.454 +/- 0.008 | 564.066 +/- 9.441 | 26.642 +/- 1.730 |

View File

@ -0,0 +1,5 @@
## Batch Size 32 Compile false
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.215 +/- 1.670 | 0.096 +/- 0.020 | 343.878 +/- 63.660 | 11.233 +/- 1.837 |

View File

@ -0,0 +1,5 @@
## Batch Size 32 Compile true
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 16.586 +/- 2.838 | 2.092 +/- 6.316 | 314.174 +/- 123.214 | 44.014 +/- 113.006 |

View File

@ -0,0 +1,5 @@
## Batch Size 64 Compile false
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.704 +/- 1.577 | 0.152 +/- 0.014 | 425.039 +/- 38.873 | 16.389 +/- 0.836 |

View File

@ -0,0 +1,5 @@
## Batch Size 64 Compile true
| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.767 +/- 1.145 | 0.142 +/- 0.014 | 455.207 +/- 44.402 | 13.422 +/- 0.912 |

View File

@ -1,15 +1,9 @@
#!/bin/bash
if [ $# -ne 1 ]; then
    echo "Usage: $0 <output_filename>.md"
    exit 1
fi
output_markdown="results/$1"
benchmark_script="server.py"
output_file="results/temp_output.txt"
checkpoint_file="resnet18-f37072fd.pth"
downloaded_checkpoint=false
num_iters=10
batch_size_values=(1 32 64 128 256)
compile_values=(true false)
@ -22,39 +16,25 @@ else
    wget https://download.pytorch.org/models/resnet18-f37072fd.pth
fi
if [ -e "$output_file" ]; then
    rm "$output_file"
fi
touch $output_file
for batch_size in "${batch_size_values[@]}"; do
    for compile in "${compile_values[@]}"; do
        echo "Running benchmark for batch size ${batch_size} and compile=${compile}..."
        output_file="output_${batch_size}_${compile}.csv"
        if [ -e "./results/$output_file" ]; then
            rm "./results/$output_file"
        fi
        for i in $(seq 1 $num_iters); do
            if [ "$compile" = true ]; then
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile >> $output_file
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --output_file "$output_file" --compile
            else
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --no-compile >> $output_file
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --output_file "$output_file" --no-compile
            fi
        done
        python process_metrics.py --csv "$output_file"
        rm "./results/$output_file"
    done
done
echo "| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |" > $output_markdown
echo "| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |" >> $output_markdown
while IFS= read -r line; do
    batch_size=$(echo "$line" | jq -r '.batch_size')
    compile=$(echo "$line" | jq -r '.compile')
    torch_load=$(echo "$line" | jq -r '.torch_load_time' | awk '{printf "%.5f", $0}')
    warmup_latency=$(echo "$line" | jq -r '.warmup_latency' | awk '{printf "%.5f", $0}')
    avg_latency=$(echo "$line" | jq -r '.average_latency' | awk '{printf "%.5f", $0}')
    max_latency=$(echo "$line" | jq -r '.max_latency' | awk '{printf "%.5f", $0}')
    min_latency=$(echo "$line" | jq -r '.min_latency' | awk '{printf "%.5f", $0}')
    throughput=$(echo "$line" | jq -r '.throughput' | awk '{printf "%.5f", $0}')
    gpu_util=$(echo "$line" | jq -r '.GPU_utilization' | awk '{printf "%.5f", $0}')
    echo "| $batch_size, $compile | $torch_load | $warmup_latency | $avg_latency | $max_latency | $min_latency | $throughput | $gpu_util |"
done < $output_file >> $output_markdown
rm "$output_file"
if [ "$downloaded_checkpoint" = true ]; then
echo "Cleaning up checkpoint..."
rm "$checkpoint_file"

View File

@ -1,11 +1,11 @@
import argparse
import json
import os.path
import subprocess
import time
from queue import Empty
import numpy as np
import pandas as pd
import torch
import torch.multiprocessing as mp
@ -86,7 +86,7 @@ class FrontendWorker(mp.Process):
            gpu_utilizations.append(float(gpu_utilization))
            time.sleep(0.1)

        self.metrics_dict["GPU_utilization"] = np.array(gpu_utilizations).mean()
        self.metrics_dict["gpu_util"] = np.array(gpu_utilizations).mean()

    def _send_requests(self):
        """
@ -199,6 +199,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--compile", default=True, action=argparse.BooleanOptionalAction
    )
    parser.add_argument("--output_file", type=str, default="output.csv")
    args = parser.parse_args()

    downloaded_checkpoint = False
@ -241,8 +242,13 @@ if __name__ == "__main__":
        frontend.join()
        backend.join()

        output_str = json.dumps(metrics_dict._getvalue())
        print(output_str)
        metrics_dict = {k: [v] for k, v in metrics_dict._getvalue().items()}
        output = pd.DataFrame.from_dict(metrics_dict, orient="columns")

        output_file = "./results/" + args.output_file
        is_empty = not os.path.isfile(output_file)
        with open(output_file, "a+", newline="") as file:
            output.to_csv(file, header=is_empty, index=False)

    finally:
        # Cleanup checkpoint file if we downloaded it