From b5a10eb0ef68f45c7dbdef2917e02bebca780d1a Mon Sep 17 00:00:00 2001
From: kg6-sleipnir <45186108+kg6-sleipnir@users.noreply.github.com>
Date: Sun, 1 Oct 2023 00:04:03 -0400
Subject: [PATCH] Added `dtype` arg to benchmarks (#1228)

---
 benchmarks/benchmark_latency.py    | 10 ++++++++++
 benchmarks/benchmark_throughput.py | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index be50a5f484..5b45894105 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -23,6 +23,7 @@ def main(args: argparse.Namespace):
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
         trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
     )
 
     sampling_params = SamplingParams(
@@ -87,5 +88,14 @@ if __name__ == '__main__':
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c200deb64d..5f8026ed3b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
     n: int,
     use_beam_search: bool,
     trust_remote_code: bool,
+    dtype: str,
 ) -> float:
     llm = LLM(
         model=model,
@@ -72,6 +73,7 @@
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
+        dtype=dtype,
     )
 
     # Add the requests to the engine.
@@ -171,7 +173,7 @@ def main(args: argparse.Namespace):
         elapsed_time = run_vllm(requests, args.model, args.tokenizer,
                                 args.quantization, args.tensor_parallel_size,
                                 args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code)
+                                args.trust_remote_code, args.dtype)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -219,6 +221,15 @@ if __name__ == "__main__":
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
     args = parser.parse_args()
 
     if args.backend == "vllm":
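
Usage note (reviewer's sketch, not part of the commit): the standalone snippet
below reproduces the argparse pattern this patch adds to both benchmark
scripts, so the flag's behavior can be checked without loading a model. The
file name dtype_flag_demo.py and the final print are hypothetical; the
--dtype argument definition itself is taken verbatim from the patch.

    # dtype_flag_demo.py (hypothetical) -- minimal sketch of the --dtype flag
    import argparse

    parser = argparse.ArgumentParser(description='demo of the --dtype flag')
    # Same definition the patch adds to benchmark_latency.py and
    # benchmark_throughput.py.
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    args = parser.parse_args()

    # With no flag given, args.dtype == 'auto', so vLLM resolves the precision
    # from the model config; values outside `choices` are rejected by argparse
    # before any model is loaded.
    print(f'value forwarded to LLM(dtype=...): {args.dtype}')

With the patch applied, the flag is forwarded to LLM(dtype=...) in both
scripts, e.g. `python benchmarks/benchmark_latency.py --dtype bfloat16`
(all other flags as before).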