Tabulate outputs in inference benchmark (#112900)
- Fix error where the script was always compiling the model
- Make `runner.sh` parse outputs into a nice `.md` format

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112900
Approved by: https://github.com/albanD
ghstack dependencies: #112582, #112863
Committed by: PyTorch MergeBot
Parent: 6ba2748690
Commit: df149581bc
@@ -25,7 +25,8 @@ The togglable command line arguments to the script are as follows:
  excluding the first warmup request
- `batch_size` (default: 32): the batch size of the requests.
- `model_dir` (default: '.'): the directory to load the checkpoint from
- `compile` (default: True): whether to `torch.compile()` the model
- `compile` (default: `--compile`) or `--no-compile`: whether to `torch.compile()` the model

e.g. A sample command to run the benchmark
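The move from a `--compile True/False` value to the paired `--compile`/`--no-compile` flags is the fix mentioned in the commit message: with `argparse` and `type=bool`, any non-empty string (including `"False"`) parses as `True`, so the old script effectively always compiled the model. With the new flags, an invocation such as `python -W ignore server.py --batch_size 32 --no-compile` (the form used by `runner.sh` below) genuinely disables compilation.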
@@ -55,3 +56,9 @@ to be lazily imported (e.g. triton).
The script `runner.sh` will run a sweep of the benchmark over different batch
sizes with compile on and off. The `results/` directory will contain the metrics
from running a sweep as we develop this benchmark.

Running the script with
```
./runner.sh <filename>.md
```
will create `results/<filename>.md`.
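For example, with a hypothetical file name, `./runner.sh resnet18_sweep.md` runs the batch-size × compile sweep and writes the parsed table to `results/resnet18_sweep.md`; the checked-in `results/output.md` below is presumably one such run.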
benchmarks/inference/results/output.md (new file, 12 lines)
@@ -0,0 +1,12 @@
| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |
| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |
| 1, true | 4.96503 | 17.48787 | 0.13005 | 0.29250 | 0.03008 | 7.68938 | 1.56557 |
| 1, false | 5.10042 | 9.25073 | 0.29875 | 0.53307 | 0.03849 | 3.34732 | 2.17808 |
| 32, true | 4.63273 | 16.52005 | 0.07233 | 0.17758 | 0.05606 | 442.38822 | 9.59890 |
| 32, false | 4.56469 | 7.40872 | 0.09007 | 0.32917 | 0.05411 | 355.27932 | 12.41176 |
| 64, true | 6.38863 | 19.92130 | 0.16717 | 0.36933 | 0.10566 | 382.83639 | 11.85784 |
| 64, false | 6.55728 | 11.15943 | 0.15646 | 0.51614 | 0.09775 | 409.05319 | 15.72000 |
| 128, true | 3.96179 | 15.54395 | 0.22426 | 0.41163 | 0.17372 | 570.75645 | 19.71206 |
| 128, false | 4.07400 | 7.41854 | 0.22373 | 0.31546 | 0.17344 | 572.11860 | 22.69027 |
| 256, true | 5.87015 | 18.38878 | 0.46226 | 0.68052 | 0.35923 | 553.79917 | 27.11622 |
| 256, false | 4.68391 | 8.14502 | 0.45867 | 0.69048 | 0.34811 | 558.12956 | 30.28707 |
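A quick consistency check on these numbers: `server.py` computes throughput as `num_iters * batch_size / response_times.sum()`, and `runner.sh` uses the default `num_iters=100`; for the `32, true` row this gives roughly (100 × 32) / (100 × 0.07233 s) ≈ 442 samples/s, matching the reported 442.38822.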
benchmarks/inference/runner.sh
@@ -1,30 +1,60 @@
#!/bin/bash

batch_size_values=(1 32 64 128 256)
compile_values=(True False)
if [ $# -ne 1 ]; then
  echo "Usage: $0 <output_filename>.md"
  exit 1
fi

output_markdown="results/$1"
benchmark_script="server.py"
output_file="results/temp_output.txt"
checkpoint_file="resnet18-f37072fd.pth"
downloaded_checkpoint=false

batch_size_values=(1 32 64 128 256)
compile_values=(true false)

if [ -f $checkpoint_file ]; then
  echo "Checkpoint exists."
else
  downloaded_checkpoint=true
  echo "Downloading checkpoint..."
  wget https://download.pytorch.org/models/resnet18-f37072fd.pth
  echo "============================================================================="
fi

echo "Starting benchmark..."
if [ -e "$output_file" ]; then
  rm "$output_file"
fi
touch $output_file

for batch_size in "${batch_size_values[@]}"; do
  for compile in "${compile_values[@]}"; do
    echo "Running benchmark with batch_size=$batch_size, compile=$compile..."
    python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile "$compile"
    echo "============================================================================="
    if [ "$compile" = true ]; then
      python -W ignore "$benchmark_script" --batch_size "$batch_size" --compile >> $output_file
    else
      python -W ignore "$benchmark_script" --batch_size "$batch_size" --no-compile >> $output_file
    fi
  done
done

echo "| bs, compile | torch.load() / s | warmup latency / s | avg latency / s | max latency / s | min latency / s | throughput samples/s | GPU util % |" > $output_markdown
echo "| ----------- | ---------------- | ------------------ | --------------- | --------------- | --------------- | -------------------- | ---------- |" >> $output_markdown

while IFS= read -r line; do
  batch_size=$(echo "$line" | jq -r '.batch_size')
  compile=$(echo "$line" | jq -r '.compile')
  torch_load=$(echo "$line" | jq -r '.torch_load_time' | awk '{printf "%.5f", $0}')
  warmup_latency=$(echo "$line" | jq -r '.warmup_latency' | awk '{printf "%.5f", $0}')
  avg_latency=$(echo "$line" | jq -r '.average_latency' | awk '{printf "%.5f", $0}')
  max_latency=$(echo "$line" | jq -r '.max_latency' | awk '{printf "%.5f", $0}')
  min_latency=$(echo "$line" | jq -r '.min_latency' | awk '{printf "%.5f", $0}')
  throughput=$(echo "$line" | jq -r '.throughput' | awk '{printf "%.5f", $0}')
  gpu_util=$(echo "$line" | jq -r '.GPU_utilization' | awk '{printf "%.5f", $0}')
  echo "| $batch_size, $compile | $torch_load | $warmup_latency | $avg_latency | $max_latency | $min_latency | $throughput | $gpu_util |"
done < $output_file >> $output_markdown

rm "$output_file"

if [ "$downloaded_checkpoint" = true ]; then
  echo "Cleaning up checkpoint..."
  rm "$checkpoint_file"
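For reference, the `jq` loop above consumes one JSON object per benchmark run, printed by `server.py` to stdout. A minimal Python sketch of that line format (the field names match the keys queried with `jq`; the numbers are illustrative only, copied from the `32, true` row of `output.md`):

```python
# Sketch of the metrics line emitted by server.py and parsed by runner.sh.
# The keys mirror the jq queries above; the values are illustrative only.
import json

metrics = {
    "batch_size": 32,
    "compile": True,
    "torch_load_time": 4.63273,
    "warmup_latency": 16.52005,
    "average_latency": 0.07233,
    "max_latency": 0.17758,
    "min_latency": 0.05606,
    "throughput": 442.38822,
    "GPU_utilization": 9.59890,
}

# One JSON object per line, which `while IFS= read -r line` then turns into a table row.
print(json.dumps(metrics))
```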
benchmarks/inference/server.py
@@ -1,4 +1,5 @@
import argparse
import json
import os.path
import subprocess
import time
@@ -16,8 +17,11 @@ class FrontendWorker(mp.Process):
    throughput and latency of those requests as well as GPU utilization.
    """

    def __init__(self, request_queue, response_queue, batch_size, num_iters=10):
    def __init__(
        self, metrics_dict, request_queue, response_queue, batch_size, num_iters=10
    ):
        super().__init__()
        self.metrics_dict = metrics_dict
        self.request_queue = request_queue
        self.response_queue = response_queue
        self.warmup_event = mp.Event()
@@ -45,15 +49,12 @@ class FrontendWorker(mp.Process):
                self.poll_gpu = False

        response_times = np.array(response_times)
        print(f"Warmup latency: {warmup_response_time:.5f} s")
        print(
            f"Average latency (exclude warmup): {response_times.mean():.5f} +/- {response_times.std():.5f} s"
        )
        print(f"Max latency: {response_times.max():.5f} s")
        print(f"Min latency: {response_times.min():.5f} s")
        print(
            "Throughput (exclude warmup): "
            f"{(self.num_iters * self.batch_size) / response_times.sum():.5f} samples per second"
        self.metrics_dict["warmup_latency"] = warmup_response_time
        self.metrics_dict["average_latency"] = response_times.mean()
        self.metrics_dict["max_latency"] = response_times.max()
        self.metrics_dict["min_latency"] = response_times.min()
        self.metrics_dict["throughput"] = (
            self.num_iters * self.batch_size / response_times.sum()
        )

    def _run_gpu_utilization(self):
@@ -84,7 +85,8 @@ class FrontendWorker(mp.Process):
            if gpu_utilization != "N/A":
                gpu_utilizations.append(float(gpu_utilization))
            time.sleep(0.1)
        print(f"Average GPU utilization: {np.array(gpu_utilizations).mean():.5f}")

        self.metrics_dict["GPU_utilization"] = np.array(gpu_utilizations).mean()

    def _send_requests(self):
        """
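The `_run_gpu_utilization` loop polls GPU utilization in the background while requests are in flight and now records the mean into `metrics_dict` instead of printing it. A standalone sketch of that kind of polling, assuming the standard `nvidia-smi` query flags (the exact command used by `server.py` is outside this hunk):

```python
# Hedged sketch: poll GPU utilization roughly every 100 ms via nvidia-smi.
# Requires an NVIDIA GPU and nvidia-smi on PATH; the query flags are the
# standard --query-gpu/--format options, assumed rather than taken from server.py.
import subprocess
import time

import numpy as np

gpu_utilizations = []
for _ in range(5):
    gpu_utilization = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"],
        encoding="utf-8",
    ).strip()
    if gpu_utilization != "N/A":
        gpu_utilizations.append(float(gpu_utilization))
    time.sleep(0.1)

print(f"Average GPU utilization: {np.array(gpu_utilizations).mean():.5f}")
```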
@@ -128,10 +130,16 @@ class BackendWorker(mp.Process):
    """

    def __init__(
        self, request_queue, response_queue, model_dir=".", compile_model=True
        self,
        metrics_dict,
        request_queue,
        response_queue,
        model_dir=".",
        compile_model=True,
    ):
        super().__init__()
        self.device = "cuda:0"
        self.metrics_dict = metrics_dict
        self.request_queue = request_queue
        self.response_queue = response_queue
        self.model_dir = model_dir
@@ -155,7 +163,7 @@ class BackendWorker(mp.Process):
            mmap=True,
            map_location=self.device,
        )
        print(f"torch.load() time: {time.time() - start_load_time:.5f} s")
        self.metrics_dict["torch_load_time"] = time.time() - start_load_time
        m.load_state_dict(state_dict, assign=True)
        m.eval()
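For context, the load path being timed combines `torch.load(..., mmap=True)`, which memory-maps the checkpoint instead of reading it eagerly, with `load_state_dict(..., assign=True)`, which assigns the loaded tensors to the module rather than copying them into pre-allocated parameters. A condensed, standalone sketch of the same pattern (the torchvision import, checkpoint path, and device are assumptions for illustration):

```python
# Standalone sketch of the timed load path; model, path, and device are illustrative.
import time

import torch
from torchvision.models import resnet18

m = resnet18()
start_load_time = time.time()
state_dict = torch.load(
    "resnet18-f37072fd.pth",  # downloaded by runner.sh if missing
    mmap=True,                # memory-map the checkpoint file
    map_location="cuda:0",    # requires a CUDA device
)
torch_load_time = time.time() - start_load_time
m.load_state_dict(state_dict, assign=True)  # reuse the loaded tensors directly
m.eval()
print(f"torch.load() time: {torch_load_time:.5f} s")
```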
@@ -163,9 +171,7 @@ class BackendWorker(mp.Process):
            start_compile_time = time.time()
            m.compile()
            end_compile_time = time.time()
            print(
                f"m.compile() time (not actual first compilation): {end_compile_time - start_compile_time:.5f} s"
            )
            self.metrics_dict["m_compile_time"] = end_compile_time - start_compile_time
        return m

    def run(self):
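As the original message noted, `m.compile()` only wraps the module; the real compilation work is deferred until the first forward pass, so it shows up in the warmup latency rather than in `m_compile_time`, which is why the warmup latencies in `output.md` are markedly higher with compile on than off.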
@@ -190,7 +196,9 @@ if __name__ == "__main__":
    parser.add_argument("--num_iters", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--model_dir", type=str, default=".")
    parser.add_argument("--compile", type=bool, default=True)
    parser.add_argument(
        "--compile", default=True, action=argparse.BooleanOptionalAction
    )
    args = parser.parse_args()

    downloaded_checkpoint = False
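`argparse.BooleanOptionalAction` (Python 3.9+) registers both `--compile` and `--no-compile` from a single `add_argument` call, avoiding the old `type=bool` parsing pitfall. A minimal standalone sketch (separate from `server.py`) of the resulting behavior:

```python
# Minimal demo of argparse.BooleanOptionalAction; not part of server.py.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--compile", default=True, action=argparse.BooleanOptionalAction)

print(parser.parse_args([]).compile)                # True (the default)
print(parser.parse_args(["--compile"]).compile)     # True
print(parser.parse_args(["--no-compile"]).compile)  # False
```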
@@ -211,11 +219,20 @@ if __name__ == "__main__":
    request_queue = mp.Queue()
    response_queue = mp.Queue()

    manager = mp.Manager()
    metrics_dict = manager.dict()
    metrics_dict["batch_size"] = args.batch_size
    metrics_dict["compile"] = args.compile

    frontend = FrontendWorker(
        request_queue, response_queue, args.batch_size, num_iters=args.num_iters
        metrics_dict,
        request_queue,
        response_queue,
        args.batch_size,
        num_iters=args.num_iters,
    )
    backend = BackendWorker(
        request_queue, response_queue, args.model_dir, args.compile
        metrics_dict, request_queue, response_queue, args.model_dir, args.compile
    )

    frontend.start()
@@ -224,6 +241,9 @@ if __name__ == "__main__":
        frontend.join()
        backend.join()

        output_str = json.dumps(metrics_dict._getvalue())
        print(output_str)

    finally:
        # Cleanup checkpoint file if we downloaded it
        if downloaded_checkpoint:
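The pattern above, a `multiprocessing.Manager().dict()` handed to both worker processes and serialized once the workers have joined, is what turns the previously printed metrics into the single JSON line that `runner.sh` parses. A minimal, self-contained sketch of that pattern (the worker name and values are illustrative, not from the benchmark):

```python
# Minimal sketch of sharing a Manager dict across processes and dumping it as JSON.
import json
import multiprocessing as mp


def worker(metrics_dict):
    # A worker process records its results in the shared proxy dict.
    metrics_dict["average_latency"] = 0.07233


if __name__ == "__main__":
    manager = mp.Manager()
    metrics_dict = manager.dict()
    metrics_dict["batch_size"] = 32

    p = mp.Process(target=worker, args=(metrics_dict,))
    p.start()
    p.join()

    # A DictProxy is not directly JSON-serializable; copy it to a plain dict first
    # (server.py does this with metrics_dict._getvalue()).
    print(json.dumps(dict(metrics_dict)))
```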