Add standard deviation of metrics over runs to inference benchmark (#113309)
Run each `(batch_size, compile)` benchmark 10 times in `./runner.sh` and report the mean and standard deviation of the metrics in the output table. Only report `warmup latency`, `average_latency`, `throughput` and `gpu_util`. Break the `output.md` file into a single markdown file per `(batch_size, compile)` configuration. Further runs of `./runner.sh` will append one row to the table in each file for easy comparison.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113309
Approved by: https://github.com/albanD
committed by PyTorch MergeBot
parent d977f118ad
commit b0c9ccdc4b
@ -27,6 +27,7 @@ The togglable command line arguments to the script are as follows:
- `model_dir` (default: '.'): the directory to load the checkpoint from
- `compile` (default: compile): or `--no-compile` whether to `torch.compile()`
the model
- `output_file` (default: output.csv): The name of the csv file to write the outputs to in the `results/` directory.

e.g. A sample command to run the benchmark

@ -34,31 +35,17 @@ e.g. A sample command to run the benchmark
```
python -W ignore server.py --num_iters 1000 --batch_size 32
```

A sample output is
the results will be found in `results/output.csv`, which will be appended to if the file already exists.

```
torch.load() time: 3.95351 s
m.compile() time (not actual first compilation): 3.41085 s
Warmup latency: 15.92736 s
Average latency (exclude warmup): 0.09556 +/- 0.07029 s
Max latency: 0.60715 s
Min latency: 0.05200 s
Throughput (exclude warmup): 334.85437 samples per second
Average GPU utilization: 20.74092
```

Note that `m.compile()` time above is not the time for the model to be compiled,
Note that `m.compile()` time in the csv file is not the time for the model to be compiled,
which happens during the first iteration, but rather the time for PT2 components
to be lazily imported (e.g. triton).

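For a quick look at the accumulated runs, the CSV can be loaded with pandas. A minimal sketch, assuming the metric column names used elsewhere in this benchmark (`warmup_latency`, `average_latency`, `throughput`, `gpu_util`):

```
# Illustrative sketch: summarize the per-run metrics appended to results/output.csv.
# Column names are assumed to match the metrics written by server.py in this PR.
import pandas as pd

df = pd.read_csv("results/output.csv")
print(df[["warmup_latency", "average_latency", "throughput", "gpu_util"]].describe())
```
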
### Running a sweep

The script `runner.sh` will run a sweep of the benchmark over different batch
sizes with compile on and off. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark.

To run the script
```
./runner.sh <filename>.md
```
will create `results/<filename>/md`.
sizes with compile on and off and collect the mean and standard deviation of warmup latency,
average latency, throughput and GPU utilization for each. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark, where `results/output_{batch_size}_{compile}.md`
will contain the mean and standard deviation of results for a given batch size and compile setting.
If the file already exists, the metrics from the run will be appended as a new row in the markdown table.

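Since each further run of `./runner.sh` appends one row per file, the per-configuration tables can be read back for comparison. A minimal, illustrative sketch (the file name and column layout follow the `results/output_{batch_size}_{compile}.md` files added in this PR; the helper name is ours):

```
# Illustrative only: parse one of the appended markdown tables into a DataFrame
# so that successive runs of the sweep can be compared side by side.
import pandas as pd

def read_results_md(path):
    with open(path) as f:
        rows = [line.strip() for line in f if line.strip().startswith("|")]
    header = [c.strip() for c in rows[0].strip("|").split("|")]
    data = [[c.strip() for c in r.strip("|").split("|")] for r in rows[2:]]  # skip separator row
    return pd.DataFrame(data, columns=header)

print(read_results_md("results/output_32_true.md"))
```
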
49 benchmarks/inference/process_metrics.py Normal file
@ -0,0 +1,49 @@
"""
This file will take the csv outputs from server.py, calculate the mean and
standard deviation of the warmup_latency, average_latency, throughput and gpu_util
and write these to the corresponding `results/output_{batch_size}_{compile}.md`
file, appending to the file if it exists or creating a new one otherwise.
"""

import argparse
import os

import pandas as pd

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parse output files")
    parser.add_argument("--csv", type=str, help="Path to csv file")
    args = parser.parse_args()

    input_csv = "./results/" + args.csv
    df = pd.read_csv(input_csv)

    batch_size = int(os.path.basename(args.csv).split("_")[1])
    compile = os.path.basename(args.csv).split("_")[-1].split(".")[0]

    # Calculate mean and standard deviation for a subset of metrics
    metrics = ["warmup_latency", "average_latency", "throughput", "gpu_util"]
    means = dict()
    stds = dict()

    for metric in metrics:
        means[metric] = df[metric].mean()
        stds[metric] = df[metric].std()

    output_md = f"results/output_{batch_size}_{compile}.md"
    write_header = os.path.isfile(output_md) is False

    with open(output_md, "a+") as f:
        if write_header:
            f.write(f"## Batch Size {batch_size} Compile {compile}\n\n")
            f.write(
                "| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |\n"
            )
            f.write(
                "| ------------------ | ------------------- | ------------------------ | ------------------- |\n"
            )

        line = "|"
        for metric in metrics:
            line += f" {means[metric]:.3f} +/- {stds[metric]:.3f} |"
        f.write(line + "\n")
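Note that the script derives `batch_size` and `compile` from the csv file name, so the input is expected to be named `output_{batch_size}_{compile}.csv`, which is how `runner.sh` below names it. A toy sketch of the expected input shape (values are made up):

```
# Illustrative only: one row per run, with the metric columns aggregated above.
import io

import pandas as pd

toy = io.StringIO(
    "warmup_latency,average_latency,throughput,gpu_util\n"
    "16.1,0.095,340.2,11.0\n"
    "17.0,0.097,335.8,11.5\n"
)
df = pd.read_csv(toy)
print(df.mean())  # column-wise means
print(df.std())   # column-wise sample standard deviations
```
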
@ -1,12 +0,0 @@
| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |
| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |
| 1, true | 4.96503 | 17.48787 | 0.13005 | 0.29250 | 0.03008 | 7.68938 | 1.56557 |
| 1, false | 5.10042 | 9.25073 | 0.29875 | 0.53307 | 0.03849 | 3.34732 | 2.17808 |
| 32, true | 4.63273 | 16.52005 | 0.07233 | 0.17758 | 0.05606 | 442.38822 | 9.59890 |
| 32, false | 4.56469 | 7.40872 | 0.09007 | 0.32917 | 0.05411 | 355.27932 | 12.41176 |
| 64, true | 6.38863 | 19.92130 | 0.16717 | 0.36933 | 0.10566 | 382.83639 | 11.85784 |
| 64, false | 6.55728 | 11.15943 | 0.15646 | 0.51614 | 0.09775 | 409.05319 | 15.72000 |
| 128, true | 3.96179 | 15.54395 | 0.22426 | 0.41163 | 0.17372 | 570.75645 | 19.71206 |
| 128, false | 4.07400 | 7.41854 | 0.22373 | 0.31546 | 0.17344 | 572.11860 | 22.69027 |
| 256, true | 5.87015 | 18.38878 | 0.46226 | 0.68052 | 0.35923 | 553.79917 | 27.11622 |
| 256, false | 4.68391 | 8.14502 | 0.45867 | 0.69048 | 0.34811 | 558.12956 | 30.28707 |
5 benchmarks/inference/results/output_128_false.md Normal file
@ -0,0 +1,5 @@
## Batch Size 128 Compile false

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.895 +/- 1.863 | 0.221 +/- 0.005 | 579.469 +/- 13.452 | 22.797 +/- 0.948 |

5 benchmarks/inference/results/output_128_true.md Normal file
@ -0,0 +1,5 @@
## Batch Size 128 Compile true

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 19.006 +/- 2.697 | 0.221 +/- 0.003 | 579.121 +/- 7.831 | 19.304 +/- 2.208 |

5 benchmarks/inference/results/output_1_false.md Normal file
@ -0,0 +1,5 @@
## Batch Size 1 Compile false

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 7.317 +/- 0.602 | 0.268 +/- 0.057 | 3.864 +/- 0.739 | 2.560 +/- 0.245 |

5 benchmarks/inference/results/output_1_true.md Normal file
@ -0,0 +1,5 @@
## Batch Size 1 Compile true

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 18.353 +/- 1.888 | 0.172 +/- 0.081 | 7.080 +/- 3.199 | 1.414 +/- 0.154 |

5 benchmarks/inference/results/output_256_false.md Normal file
@ -0,0 +1,5 @@
## Batch Size 256 Compile false

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.555 +/- 1.259 | 0.447 +/- 0.007 | 573.260 +/- 8.471 | 31.546 +/- 0.530 |

5 benchmarks/inference/results/output_256_true.md Normal file
@ -0,0 +1,5 @@
## Batch Size 256 Compile true

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.986 +/- 1.737 | 0.454 +/- 0.008 | 564.066 +/- 9.441 | 26.642 +/- 1.730 |

5 benchmarks/inference/results/output_32_false.md Normal file
@ -0,0 +1,5 @@
## Batch Size 32 Compile false

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.215 +/- 1.670 | 0.096 +/- 0.020 | 343.878 +/- 63.660 | 11.233 +/- 1.837 |

5 benchmarks/inference/results/output_32_true.md Normal file
@ -0,0 +1,5 @@
## Batch Size 32 Compile true

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 16.586 +/- 2.838 | 2.092 +/- 6.316 | 314.174 +/- 123.214 | 44.014 +/- 113.006 |

5 benchmarks/inference/results/output_64_false.md Normal file
@ -0,0 +1,5 @@
## Batch Size 64 Compile false

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 8.704 +/- 1.577 | 0.152 +/- 0.014 | 425.039 +/- 38.873 | 16.389 +/- 0.836 |

5 benchmarks/inference/results/output_64_true.md Normal file
@ -0,0 +1,5 @@
## Batch Size 64 Compile true

| Warmup_latency (s) | Average_latency (s) | Throughput (samples/sec) | GPU Utilization (%) |
| ------------------ | ------------------- | ------------------------ | ------------------- |
| 17.767 +/- 1.145 | 0.142 +/- 0.014 | 455.207 +/- 44.402 | 13.422 +/- 0.912 |
benchmarks/inference/runner.sh
@ -1,15 +1,9 @@
#!/bin/bash

if [ $# -ne 1 ]; then
    echo "Usage: $0 <output_filename>.md"
    exit 1
fi

output_markdown="results/$1"
benchmark_script="server.py"
output_file="results/temp_output.txt"
checkpoint_file="resnet18-f37072fd.pth"
downloaded_checkpoint=false
num_iters=10

batch_size_values=(1 32 64 128 256)
compile_values=(true false)
@ -22,39 +16,25 @@ else
    wget https://download.pytorch.org/models/resnet18-f37072fd.pth
fi

if [ -e "$output_file" ]; then
    rm "$output_file"
fi
touch $output_file

for batch_size in "${batch_size_values[@]}"; do
    for compile in "${compile_values[@]}"; do
        echo "Running benchmark for batch size ${batch_size} and compile=${compile}..."
        output_file="output_${batch_size}_${compile}.csv"
        if [ -e "./results/$output_file" ]; then
            rm "./results/$output_file"
        fi
        for i in $(seq 1 $num_iters); do
            if [ "$compile" = true ]; then
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile >> $output_file
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --output_file "$output_file" --compile
            else
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --no-compile >> $output_file
                python -W ignore "$benchmark_script" --batch_size "$batch_size" --output_file "$output_file" --no-compile
            fi
        done
        python process_metrics.py --csv "$output_file"
        rm "./results/$output_file"
    done
done

echo "| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |" > $output_markdown
echo "| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |" >> $output_markdown

while IFS= read -r line; do
    batch_size=$(echo "$line" | jq -r '.batch_size')
    compile=$(echo "$line" | jq -r '.compile')
    torch_load=$(echo "$line" | jq -r '.torch_load_time' | awk '{printf "%.5f", $0}')
    warmup_latency=$(echo "$line" | jq -r '.warmup_latency' | awk '{printf "%.5f", $0}')
    avg_latency=$(echo "$line" | jq -r '.average_latency' | awk '{printf "%.5f", $0}')
    max_latency=$(echo "$line" | jq -r '.max_latency' | awk '{printf "%.5f", $0}')
    min_latency=$(echo "$line" | jq -r '.min_latency' | awk '{printf "%.5f", $0}')
    throughput=$(echo "$line" | jq -r '.throughput' | awk '{printf "%.5f", $0}')
    gpu_util=$(echo "$line" | jq -r '.GPU_utilization' | awk '{printf "%.5f", $0}')
    echo "| $batch_size, $compile | $torch_load | $warmup_latency | $avg_latency | $max_latency | $min_latency | $throughput | $gpu_util |"
done < $output_file >> $output_markdown

rm "$output_file"

if [ "$downloaded_checkpoint" = true ]; then
    echo "Cleaning up checkpoint..."
    rm "$checkpoint_file"
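For readers who prefer Python, the updated sweep loop above is roughly equivalent to the following sketch (illustrative only; it mirrors the `runner.sh` logic rather than replacing it):

```
# Rough Python equivalent of the sweep: benchmark each (batch_size, compile) pair
# num_iters times, let server.py append one CSV row per run, reduce the CSV to a
# mean/std row with process_metrics.py, then delete the per-run CSV.
import os
import subprocess

num_iters = 10
for batch_size in (1, 32, 64, 128, 256):
    for compile_flag in ("--compile", "--no-compile"):
        mode = "true" if compile_flag == "--compile" else "false"
        csv_name = f"output_{batch_size}_{mode}.csv"
        for _ in range(num_iters):
            subprocess.run(
                ["python", "-W", "ignore", "server.py",
                 "--batch_size", str(batch_size),
                 "--output_file", csv_name, compile_flag],
                check=True,
            )
        subprocess.run(["python", "process_metrics.py", "--csv", csv_name], check=True)
        os.remove(os.path.join("results", csv_name))
```
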
benchmarks/inference/server.py
@ -1,11 +1,11 @@
import argparse
import json
import os.path
import subprocess
import time
from queue import Empty

import numpy as np
import pandas as pd

import torch
import torch.multiprocessing as mp
@ -86,7 +86,7 @@ class FrontendWorker(mp.Process):
            gpu_utilizations.append(float(gpu_utilization))
            time.sleep(0.1)

        self.metrics_dict["GPU_utilization"] = np.array(gpu_utilizations).mean()
        self.metrics_dict["gpu_util"] = np.array(gpu_utilizations).mean()

    def _send_requests(self):
        """
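The utilization samples averaged into `gpu_util` are collected while requests are in flight. As a hedged sketch of one common way to sample utilization (an assumption for illustration, not necessarily how `server.py` queries it):

```
# Hedged sketch: poll instantaneous GPU utilization via nvidia-smi and average
# the samples, mirroring the mean taken in the metrics thread above.
import subprocess

import numpy as np

def sample_gpu_util(gpu_index=0):
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"]
    )
    return float(out.decode().strip().splitlines()[gpu_index])

samples = [sample_gpu_util() for _ in range(5)]
print(np.array(samples).mean())
```
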
@ -199,6 +199,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--compile", default=True, action=argparse.BooleanOptionalAction
    )
    parser.add_argument("--output_file", type=str, default="output.csv")
    args = parser.parse_args()

    downloaded_checkpoint = False
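For reference, `argparse.BooleanOptionalAction` (Python 3.9+) is what makes both `--compile` and `--no-compile` available from the single argument definition above; a small standalone example:

```
# Standalone illustration of the --compile / --no-compile toggle.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--compile", default=True, action=argparse.BooleanOptionalAction)
print(parser.parse_args([]).compile)                # True (default)
print(parser.parse_args(["--no-compile"]).compile)  # False
```
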
@ -241,8 +242,13 @@ if __name__ == "__main__":
        frontend.join()
        backend.join()

        output_str = json.dumps(metrics_dict._getvalue())
        print(output_str)
        metrics_dict = {k: [v] for k, v in metrics_dict._getvalue().items()}
        output = pd.DataFrame.from_dict(metrics_dict, orient="columns")
        output_file = "./results/" + args.output_file
        is_empty = not os.path.isfile(output_file)

        with open(output_file, "a+", newline="") as file:
            output.to_csv(file, header=is_empty, index=False)

    finally:
        # Cleanup checkpoint file if we downloaded it
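The append above writes the CSV header only when the file does not exist yet, so repeated runs with the same `--output_file` accumulate one row per run. A minimal standalone sketch of the same pattern (file name and values are made up for illustration):

```
# Illustrative only: append a one-row DataFrame to a CSV, writing the header once.
import os

import pandas as pd

os.makedirs("./results", exist_ok=True)

row = {"warmup_latency": [15.9], "average_latency": [0.096],
       "throughput": [334.9], "gpu_util": [20.7]}
output = pd.DataFrame.from_dict(row, orient="columns")

output_file = "./results/output.csv"
is_empty = not os.path.isfile(output_file)
with open(output_file, "a+", newline="") as f:
    output.to_csv(f, header=is_empty, index=False)
```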