From b5a10eb0ef68f45c7dbdef2917e02bebca780d1a Mon Sep 17 00:00:00 2001
From: kg6-sleipnir <45186108+kg6-sleipnir@users.noreply.github.com>
Date: Sun, 1 Oct 2023 00:04:03 -0400
Subject: [PATCH] Added `dtype` arg to benchmarks (#1228)

---
 benchmarks/benchmark_latency.py    | 10 ++++++++++
 benchmarks/benchmark_throughput.py | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index be50a5f484..5b45894105 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -23,6 +23,7 @@ def main(args: argparse.Namespace):
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
         trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
     )
 
     sampling_params = SamplingParams(
@@ -87,5 +88,14 @@ if __name__ == '__main__':
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c200deb64d..5f8026ed3b 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def run_vllm(
     n: int,
     use_beam_search: bool,
     trust_remote_code: bool,
+    dtype: str,
 ) -> float:
     llm = LLM(
         model=model,
@@ -72,6 +73,7 @@
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
         trust_remote_code=trust_remote_code,
+        dtype=dtype,
     )
 
     # Add the requests to the engine.
@@ -171,7 +173,7 @@ def main(args: argparse.Namespace):
         elapsed_time = run_vllm(requests, args.model, args.tokenizer,
                                 args.quantization, args.tensor_parallel_size,
                                 args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code)
+                                args.trust_remote_code, args.dtype)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -219,6 +221,15 @@ if __name__ == "__main__":
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
     args = parser.parse_args()
 
     if args.backend == "vllm":
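
Usage note (reviewer's sketch, not part of the commit): the standalone snippet
below reproduces the argparse pattern this patch adds to both benchmark
scripts, so the flag's behavior can be checked without loading a model. The
file name dtype_flag_demo.py and the final print are hypothetical; the
--dtype argument definition itself is taken verbatim from the patch.

    # dtype_flag_demo.py (hypothetical) -- minimal sketch of the --dtype flag
    import argparse

    parser = argparse.ArgumentParser(description='demo of the --dtype flag')
    # Same definition the patch adds to benchmark_latency.py and
    # benchmark_throughput.py.
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    args = parser.parse_args()

    # With no flag given, args.dtype == 'auto', so vLLM resolves the precision
    # from the model config; values outside `choices` are rejected by argparse
    # before any model is loaded.
    print(f'value forwarded to LLM(dtype=...): {args.dtype}')

With the patch applied, the flag is forwarded to LLM(dtype=...) in both
scripts, e.g. `python benchmarks/benchmark_latency.py --dtype bfloat16`
(all other flags as before).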