Tabulate outputs in inference benchmark (#112900)

- Fix error where the script was always compiling the model (see the sketch below)
- Make `runner.sh` parse outputs into a nice `.md` format
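
For context on the first fix: `argparse` with `type=bool` calls `bool()` on the raw argument string, and any non-empty string (including `"False"`) is truthy, so the model was compiled regardless of what was passed. A minimal sketch of the old and new flag handling (standard-library behavior only, nothing benchmark-specific; `BooleanOptionalAction` needs Python 3.9+):

```
import argparse

# Old behavior (the bug): type=bool just calls bool() on the string,
# so "--compile False" still parses as True and the model always compiles.
old = argparse.ArgumentParser()
old.add_argument("--compile", type=bool, default=True)
print(old.parse_args(["--compile", "False"]).compile)  # True

# New behavior: BooleanOptionalAction adds paired --compile/--no-compile flags.
new = argparse.ArgumentParser()
new.add_argument("--compile", default=True, action=argparse.BooleanOptionalAction)
print(new.parse_args(["--no-compile"]).compile)  # False
print(new.parse_args(["--compile"]).compile)     # True
```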

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112900
Approved by: https://github.com/albanD
ghstack dependencies: #112582, #112863
This commit is contained in:
Mikayla Gawarecki
2023-11-03 13:23:29 -07:00
committed by PyTorch MergeBot
parent 6ba2748690
commit df149581bc
4 changed files with 96 additions and 27 deletions

View File

@ -25,7 +25,8 @@ The toggleable command-line arguments to the script are as follows:
excluding the first warmup request
- `batch_size` (default: 32): the batch size of the requests.
- `model_dir` (default: '.'): the directory to load the checkpoint from
- `compile` (default: True): whether to `torch.compile()` the model
- `compile` (default: `--compile`): whether to `torch.compile()` the model; toggle with `--compile` / `--no-compile`
e.g., a sample command to run the benchmark:
@ -55,3 +56,9 @@ to be lazily imported (e.g. triton).
The script `runner.sh` will run a sweep of the benchmark over different batch
sizes with compile on and off. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark.
To run the script:
```
./runner.sh <filename>.md
```
This will create `results/<filename>.md`.

View File

@ -0,0 +1,12 @@
| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |
| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |
| 1, true | 4.96503 | 17.48787 | 0.13005 | 0.29250 | 0.03008 | 7.68938 | 1.56557 |
| 1, false | 5.10042 | 9.25073 | 0.29875 | 0.53307 | 0.03849 | 3.34732 | 2.17808 |
| 32, true | 4.63273 | 16.52005 | 0.07233 | 0.17758 | 0.05606 | 442.38822 | 9.59890 |
| 32, false | 4.56469 | 7.40872 | 0.09007 | 0.32917 | 0.05411 | 355.27932 | 12.41176 |
| 64, true | 6.38863 | 19.92130 | 0.16717 | 0.36933 | 0.10566 | 382.83639 | 11.85784 |
| 64, false | 6.55728 | 11.15943 | 0.15646 | 0.51614 | 0.09775 | 409.05319 | 15.72000 |
| 128, true | 3.96179 | 15.54395 | 0.22426 | 0.41163 | 0.17372 | 570.75645 | 19.71206 |
| 128, false | 4.07400 | 7.41854 | 0.22373 | 0.31546 | 0.17344 | 572.11860 | 22.69027 |
| 256, true | 5.87015 | 18.38878 | 0.46226 | 0.68052 | 0.35923 | 553.79917 | 27.11622 |
| 256, false | 4.68391 | 8.14502 | 0.45867 | 0.69048 | 0.34811 | 558.12956 | 30.28707 |

View File

@ -1,30 +1,60 @@
#!/bin/bash
batch_size_values=(1 32 64 128 256)
compile_values=(True False)
if [ $# -ne 1 ]; then
echo "Usage: $0 <output_filename>.md"
exit 1
fi
output_markdown="results/$1"
benchmark_script="server.py"
output_file="results/temp_output.txt"
checkpoint_file="resnet18-f37072fd.pth"
downloaded_checkpoint=false
batch_size_values=(1 32 64 128 256)
compile_values=(true false)
if [ -f $checkpoint_file ]; then
echo "Checkpoint exists."
else
downloaded_checkpoint=true
echo "Downloading checkpoint..."
wget https://download.pytorch.org/models/resnet18-f37072fd.pth
echo "============================================================================="
fi
echo "Starting benchmark..."
if [ -e "$output_file" ]; then
rm "$output_file"
fi
touch $output_file
for batch_size in "${batch_size_values[@]}"; do
for compile in "${compile_values[@]}"; do
echo "Running benchmark with batch_size=$batch_size, compile=$compile..."
python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile "$compile"
echo "============================================================================="
if [ "$compile" = true ]; then
python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile >> $output_file
else
python -W ignore "$benchmark_script" --batch_size "$batch_size" --no-compile >> $output_file
fi
done
done
echo "| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |" > $output_markdown
echo "| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |" >> $output_markdown
while IFS= read -r line; do
batch_size=$(echo "$line" | jq -r '.batch_size')
compile=$(echo "$line" | jq -r '.compile')
torch_load=$(echo "$line" | jq -r '.torch_load_time' | awk '{printf "%.5f", $0}')
warmup_latency=$(echo "$line" | jq -r '.warmup_latency' | awk '{printf "%.5f", $0}')
avg_latency=$(echo "$line" | jq -r '.average_latency' | awk '{printf "%.5f", $0}')
max_latency=$(echo "$line" | jq -r '.max_latency' | awk '{printf "%.5f", $0}')
min_latency=$(echo "$line" | jq -r '.min_latency' | awk '{printf "%.5f", $0}')
throughput=$(echo "$line" | jq -r '.throughput' | awk '{printf "%.5f", $0}')
gpu_util=$(echo "$line" | jq -r '.GPU_utilization' | awk '{printf "%.5f", $0}')
echo "| $batch_size, $compile | $torch_load | $warmup_latency | $avg_latency | $max_latency | $min_latency | $throughput | $gpu_util |"
done < $output_file >> $output_markdown
rm "$output_file"
if [ "$downloaded_checkpoint" = true ]; then
echo "Cleaning up checkpoint..."
rm "$checkpoint_file"

View File

@ -1,4 +1,5 @@
import argparse
import json
import os.path
import subprocess
import time
@ -16,8 +17,11 @@ class FrontendWorker(mp.Process):
throughput and latency of those requests as well as GPU utilization.
"""
def __init__(self, request_queue, response_queue, batch_size, num_iters=10):
def __init__(
self, metrics_dict, request_queue, response_queue, batch_size, num_iters=10
):
super().__init__()
self.metrics_dict = metrics_dict
self.request_queue = request_queue
self.response_queue = response_queue
self.warmup_event = mp.Event()
@ -45,15 +49,12 @@ class FrontendWorker(mp.Process):
self.poll_gpu = False
response_times = np.array(response_times)
print(f"Warmup latency: {warmup_response_time:.5f} s")
print(
f"Average latency (exclude warmup): {response_times.mean():.5f} +/- {response_times.std():.5f} s"
)
print(f"Max latency: {response_times.max():.5f} s")
print(f"Min latency: {response_times.min():.5f} s")
print(
"Throughput (exclude warmup): "
f"{(self.num_iters * self.batch_size) / response_times.sum():.5f} samples per second"
self.metrics_dict["warmup_latency"] = warmup_response_time
self.metrics_dict["average_latency"] = response_times.mean()
self.metrics_dict["max_latency"] = response_times.max()
self.metrics_dict["min_latency"] = response_times.min()
self.metrics_dict["throughput"] = (
self.num_iters * self.batch_size / response_times.sum()
)
def _run_gpu_utilization(self):
@ -84,7 +85,8 @@ class FrontendWorker(mp.Process):
if gpu_utilization != "N/A":
gpu_utilizations.append(float(gpu_utilization))
time.sleep(0.1)
print(f"Average GPU utilization: {np.array(gpu_utilizations).mean():.5f}")
self.metrics_dict["GPU_utilization"] = np.array(gpu_utilizations).mean()
def _send_requests(self):
"""
@ -128,10 +130,16 @@ class BackendWorker(mp.Process):
"""
def __init__(
self, request_queue, response_queue, model_dir=".", compile_model=True
self,
metrics_dict,
request_queue,
response_queue,
model_dir=".",
compile_model=True,
):
super().__init__()
self.device = "cuda:0"
self.metrics_dict = metrics_dict
self.request_queue = request_queue
self.response_queue = response_queue
self.model_dir = model_dir
@ -155,7 +163,7 @@ class BackendWorker(mp.Process):
mmap=True,
map_location=self.device,
)
print(f"torch.load() time: {time.time() - start_load_time:.5f} s")
self.metrics_dict["torch_load_time"] = time.time() - start_load_time
m.load_state_dict(state_dict, assign=True)
m.eval()
@ -163,9 +171,7 @@ class BackendWorker(mp.Process):
start_compile_time = time.time()
m.compile()
end_compile_time = time.time()
print(
f"m.compile() time (not actual first compilation): {end_compile_time - start_compile_time:.5f} s"
)
self.metrics_dict["m_compile_time"] = end_compile_time - start_compile_time
return m
def run(self):
@ -190,7 +196,9 @@ if __name__ == "__main__":
parser.add_argument("--num_iters", type=int, default=100)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--model_dir", type=str, default=".")
parser.add_argument("--compile", type=bool, default=True)
parser.add_argument(
"--compile", default=True, action=argparse.BooleanOptionalAction
)
args = parser.parse_args()
downloaded_checkpoint = False
@ -211,11 +219,20 @@ if __name__ == "__main__":
request_queue = mp.Queue()
response_queue = mp.Queue()
manager = mp.Manager()
metrics_dict = manager.dict()
metrics_dict["batch_size"] = args.batch_size
metrics_dict["compile"] = args.compile
frontend = FrontendWorker(
request_queue, response_queue, args.batch_size, num_iters=args.num_iters
metrics_dict,
request_queue,
response_queue,
args.batch_size,
num_iters=args.num_iters,
)
backend = BackendWorker(
request_queue, response_queue, args.model_dir, args.compile
metrics_dict, request_queue, response_queue, args.model_dir, args.compile
)
frontend.start()
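
The sharing pattern introduced here, as a standalone sketch: a `multiprocessing.Manager().dict()` proxy is handed to each worker process, the workers write their measurements into it, and after `join()` the parent serializes a plain copy with `json.dumps` (as the next hunk does). The `worker` function below is a hypothetical stand-in for `FrontendWorker`/`BackendWorker`:

```
import json
import multiprocessing as mp


def worker(metrics):
    # Stand-in for the benchmark workers: each process records its
    # measurements into the shared proxy dict.
    metrics["warmup_latency"] = 16.52
    metrics["average_latency"] = 0.072


if __name__ == "__main__":
    manager = mp.Manager()
    metrics_dict = manager.dict()
    metrics_dict["batch_size"] = 32
    metrics_dict["compile"] = True

    p = mp.Process(target=worker, args=(metrics_dict,))
    p.start()
    p.join()

    # _getvalue() (or dict(metrics_dict)) turns the proxy into a plain dict
    # that json.dumps can serialize.
    print(json.dumps(metrics_dict._getvalue()))
```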
@ -224,6 +241,9 @@ if __name__ == "__main__":
frontend.join()
backend.join()
output_str = json.dumps(metrics_dict._getvalue())
print(output_str)
finally:
# Cleanup checkpoint file if we downloaded it
if downloaded_checkpoint: