Tabulate outputs in inference benchmark (#112900)

- Fix error where the script was always compiling the model (see the sketch below)
- Make `runner.sh` parse outputs into a nice `.md` format
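
For context on the first fix: `argparse` with `type=bool` calls `bool()` on the raw argument string, and any non-empty string (including `"False"`) is truthy, so the model was compiled regardless of what was passed. A minimal sketch of the old and new flag handling (standard-library behavior only, nothing benchmark-specific; `BooleanOptionalAction` needs Python 3.9+):

```
import argparse

# Old behavior (the bug): type=bool just calls bool() on the string,
# so "--compile False" still parses as True and the model always compiles.
old = argparse.ArgumentParser()
old.add_argument("--compile", type=bool, default=True)
print(old.parse_args(["--compile", "False"]).compile)  # True

# New behavior: BooleanOptionalAction adds paired --compile/--no-compile flags.
new = argparse.ArgumentParser()
new.add_argument("--compile", default=True, action=argparse.BooleanOptionalAction)
print(new.parse_args(["--no-compile"]).compile)  # False
print(new.parse_args(["--compile"]).compile)     # True
```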

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112900
Approved by: https://github.com/albanD
ghstack dependencies: #112582, #112863
This commit is contained in:
Mikayla Gawarecki
2023-11-03 13:23:29 -07:00
committed by PyTorch MergeBot
parent 6ba2748690
commit df149581bc
4 changed files with 96 additions and 27 deletions

View File

@ -25,7 +25,8 @@ The toggleable command-line arguments to the script are as follows:
excluding the first warmup request
- `batch_size` (default: 32): the batch size of the requests.
- `model_dir` (default: '.'): the directory to load the checkpoint from
- `compile` (default: True): whether to `torch.compile()` the model
- `compile` (default: `--compile`): whether to `torch.compile()` the model; toggle with `--compile` / `--no-compile`
e.g., a sample command to run the benchmark:
@ -55,3 +56,9 @@ to be lazily imported (e.g. triton).
The script `runner.sh` will run a sweep of the benchmark over different batch
sizes with compile on and off. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark.
To run the script:
```
./runner.sh <filename>.md
```
This will create `results/<filename>.md`.

View File

@ -0,0 +1,12 @@
| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |
| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |
| 1, true | 4.96503 | 17.48787 | 0.13005 | 0.29250 | 0.03008 | 7.68938 | 1.56557 |
| 1, false | 5.10042 | 9.25073 | 0.29875 | 0.53307 | 0.03849 | 3.34732 | 2.17808 |
| 32, true | 4.63273 | 16.52005 | 0.07233 | 0.17758 | 0.05606 | 442.38822 | 9.59890 |
| 32, false | 4.56469 | 7.40872 | 0.09007 | 0.32917 | 0.05411 | 355.27932 | 12.41176 |
| 64, true | 6.38863 | 19.92130 | 0.16717 | 0.36933 | 0.10566 | 382.83639 | 11.85784 |
| 64, false | 6.55728 | 11.15943 | 0.15646 | 0.51614 | 0.09775 | 409.05319 | 15.72000 |
| 128, true | 3.96179 | 15.54395 | 0.22426 | 0.41163 | 0.17372 | 570.75645 | 19.71206 |
| 128, false | 4.07400 | 7.41854 | 0.22373 | 0.31546 | 0.17344 | 572.11860 | 22.69027 |
| 256, true | 5.87015 | 18.38878 | 0.46226 | 0.68052 | 0.35923 | 553.79917 | 27.11622 |
| 256, false | 4.68391 | 8.14502 | 0.45867 | 0.69048 | 0.34811 | 558.12956 | 30.28707 |

View File

@ -1,30 +1,60 @@
#!/bin/bash
batch_size_values=(1 32 64 128 256)
compile_values=(True False)
if [ $# -ne 1 ]; then
echo "Usage: $0 <output_filename>.md"
exit 1
fi
output_markdown="results/$1"
benchmark_script="server.py"
output_file="results/temp_output.txt"
checkpoint_file="resnet18-f37072fd.pth"
downloaded_checkpoint=false
batch_size_values=(1 32 64 128 256)
compile_values=(true false)
if [ -f $checkpoint_file ]; then
echo "Checkpoint exists."
else
downloaded_checkpoint=true
echo "Downloading checkpoint..."
wget https://download.pytorch.org/models/resnet18-f37072fd.pth
echo "============================================================================="
fi
echo "Starting benchmark..."
if [ -e "$output_file" ]; then
rm "$output_file"
fi
touch $output_file
for batch_size in "${batch_size_values[@]}"; do
for compile in "${compile_values[@]}"; do
echo "Running benchmark with batch_size=$batch_size, compile=$compile..."
python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile "$compile"
echo "============================================================================="
if [ "$compile" = true ]; then
python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile >> $output_file
else
python -W ignore "$benchmark_script" --batch_size "$batch_size" --no-compile >> $output_file
fi
done
done
echo "| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |" > $output_markdown
echo "| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |" >> $output_markdown
while IFS= read -r line; do
batch_size=$(echo "$line" | jq -r '.batch_size')
compile=$(echo "$line" | jq -r '.compile')
torch_load=$(echo "$line" | jq -r '.torch_load_time' | awk '{printf "%.5f", $0}')
warmup_latency=$(echo "$line" | jq -r '.warmup_latency' | awk '{printf "%.5f", $0}')
avg_latency=$(echo "$line" | jq -r '.average_latency' | awk '{printf "%.5f", $0}')
max_latency=$(echo "$line" | jq -r '.max_latency' | awk '{printf "%.5f", $0}')
min_latency=$(echo "$line" | jq -r '.min_latency' | awk '{printf "%.5f", $0}')
throughput=$(echo "$line" | jq -r '.throughput' | awk '{printf "%.5f", $0}')
gpu_util=$(echo "$line" | jq -r '.GPU_utilization' | awk '{printf "%.5f", $0}')
echo "| $batch_size, $compile | $torch_load | $warmup_latency | $avg_latency | $max_latency | $min_latency | $throughput | $gpu_util |"
done < $output_file >> $output_markdown
rm "$output_file"
if [ "$downloaded_checkpoint" = true ]; then
echo "Cleaning up checkpoint..."
rm "$checkpoint_file"

View File

@ -1,4 +1,5 @@
import argparse
import json
import os.path
import subprocess
import time
@ -16,8 +17,11 @@ class FrontendWorker(mp.Process):
throughput and latency of those requests as well as GPU utilization.
"""
def __init__(self, request_queue, response_queue, batch_size, num_iters=10):
def __init__(
self, metrics_dict, request_queue, response_queue, batch_size, num_iters=10
):
super().__init__()
self.metrics_dict = metrics_dict
self.request_queue = request_queue
self.response_queue = response_queue
self.warmup_event = mp.Event()
@ -45,15 +49,12 @@ class FrontendWorker(mp.Process):
self.poll_gpu = False
response_times = np.array(response_times)
print(f"Warmup latency: {warmup_response_time:.5f} s")
print(
f"Average latency (exclude warmup): {response_times.mean():.5f} +/- {response_times.std():.5f} s"
)
print(f"Max latency: {response_times.max():.5f} s")
print(f"Min latency: {response_times.min():.5f} s")
print(
"Throughput (exclude warmup): "
f"{(self.num_iters * self.batch_size) / response_times.sum():.5f} samples per second"
self.metrics_dict["warmup_latency"] = warmup_response_time
self.metrics_dict["average_latency"] = response_times.mean()
self.metrics_dict["max_latency"] = response_times.max()
self.metrics_dict["min_latency"] = response_times.min()
self.metrics_dict["throughput"] = (
self.num_iters * self.batch_size / response_times.sum()
)
def _run_gpu_utilization(self):
@ -84,7 +85,8 @@ class FrontendWorker(mp.Process):
if gpu_utilization != "N/A":
gpu_utilizations.append(float(gpu_utilization))
time.sleep(0.1)
print(f"Average GPU utilization: {np.array(gpu_utilizations).mean():.5f}")
self.metrics_dict["GPU_utilization"] = np.array(gpu_utilizations).mean()
def _send_requests(self):
"""
@ -128,10 +130,16 @@ class BackendWorker(mp.Process):
"""
def __init__(
self, request_queue, response_queue, model_dir=".", compile_model=True
self,
metrics_dict,
request_queue,
response_queue,
model_dir=".",
compile_model=True,
):
super().__init__()
self.device = "cuda:0"
self.metrics_dict = metrics_dict
self.request_queue = request_queue
self.response_queue = response_queue
self.model_dir = model_dir
@ -155,7 +163,7 @@ class BackendWorker(mp.Process):
mmap=True,
map_location=self.device,
)
print(f"torch.load() time: {time.time() - start_load_time:.5f} s")
self.metrics_dict["torch_load_time"] = time.time() - start_load_time
m.load_state_dict(state_dict, assign=True)
m.eval()
@ -163,9 +171,7 @@ class BackendWorker(mp.Process):
start_compile_time = time.time()
m.compile()
end_compile_time = time.time()
print(
f"m.compile() time (not actual first compilation): {end_compile_time - start_compile_time:.5f} s"
)
self.metrics_dict["m_compile_time"] = end_compile_time - start_compile_time
return m
def run(self):
@ -190,7 +196,9 @@ if __name__ == "__main__":
parser.add_argument("--num_iters", type=int, default=100)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--model_dir", type=str, default=".")
parser.add_argument("--compile", type=bool, default=True)
parser.add_argument(
"--compile", default=True, action=argparse.BooleanOptionalAction
)
args = parser.parse_args()
downloaded_checkpoint = False
@ -211,11 +219,20 @@ if __name__ == "__main__":
request_queue = mp.Queue()
response_queue = mp.Queue()
manager = mp.Manager()
metrics_dict = manager.dict()
metrics_dict["batch_size"] = args.batch_size
metrics_dict["compile"] = args.compile
frontend = FrontendWorker(
request_queue, response_queue, args.batch_size, num_iters=args.num_iters
metrics_dict,
request_queue,
response_queue,
args.batch_size,
num_iters=args.num_iters,
)
backend = BackendWorker(
request_queue, response_queue, args.model_dir, args.compile
metrics_dict, request_queue, response_queue, args.model_dir, args.compile
)
frontend.start()
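
The sharing pattern introduced here, as a standalone sketch: a `multiprocessing.Manager().dict()` proxy is handed to each worker process, the workers write their measurements into it, and after `join()` the parent serializes a plain copy with `json.dumps` (as the next hunk does). The `worker` function below is a hypothetical stand-in for `FrontendWorker`/`BackendWorker`:

```
import json
import multiprocessing as mp


def worker(metrics):
    # Stand-in for the benchmark workers: each process records its
    # measurements into the shared proxy dict.
    metrics["warmup_latency"] = 16.52
    metrics["average_latency"] = 0.072


if __name__ == "__main__":
    manager = mp.Manager()
    metrics_dict = manager.dict()
    metrics_dict["batch_size"] = 32
    metrics_dict["compile"] = True

    p = mp.Process(target=worker, args=(metrics_dict,))
    p.start()
    p.join()

    # _getvalue() (or dict(metrics_dict)) turns the proxy into a plain dict
    # that json.dumps can serialize.
    print(json.dumps(metrics_dict._getvalue()))
```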
@ -224,6 +241,9 @@ if __name__ == "__main__":
frontend.join()
backend.join()
output_str = json.dumps(metrics_dict._getvalue())
print(output_str)
finally:
# Cleanup checkpoint file if we downloaded it
if downloaded_checkpoint: