vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
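In short, the change threads a new failed count end to end: both metrics dataclasses gain a failed field, calculate_metrics_for_embeddings and calculate_metrics tally it, and the count surfaces in the console summary ("Failed requests:") and in the result dict built in benchmark().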
@@ -64,6 +64,7 @@ class TaskType(Enum):
 @dataclass
 class BenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     total_output: int
     request_throughput: float
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     request_throughput: float
     total_token_throughput: float
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
     """
     total_input = 0
     completed = 0
+    failed = 0
     e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             e2els.append(outputs[i].latency)
             completed += 1
             total_input += outputs[i].prompt_len
+        else:
+            failed += 1
 
     if completed == 0:
         warnings.warn(
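For readers outside the diff, here is the counting pattern in isolation: a minimal, self-contained sketch, where RequestResult is a hypothetical stand-in for the benchmark's per-request output type (only the fields the loop touches are modeled).

from dataclasses import dataclass

@dataclass
class RequestResult:
    # Hypothetical stand-in for the benchmark's per-request output object.
    success: bool
    latency: float = 0.0
    prompt_len: int = 0

def tally(outputs: list[RequestResult]) -> tuple[int, int, int, list[float]]:
    """Return (completed, failed, total_input, end-to-end latencies)."""
    completed = failed = total_input = 0
    e2els: list[float] = []
    for out in outputs:
        if out.success:
            e2els.append(out.latency)
            completed += 1
            total_input += out.prompt_len
        else:
            failed += 1
    return completed, failed, total_input, e2els

# completed + failed always equals len(outputs):
print(tally([RequestResult(True, 0.12, 8), RequestResult(False)]))
# -> (1, 1, 8, [0.12])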
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
     )
     metrics = EmbedBenchmarkMetrics(
         completed=completed,
+        failed=failed,
         total_input=total_input,
         request_throughput=completed / dur_s,
         total_token_throughput=total_input / dur_s,
@@ -366,6 +372,7 @@ def calculate_metrics(
 
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
     if successful_outputs:
         min_start_time = min(output.start_time for output in successful_outputs)
         max_end_time = max(
@@ -427,6 +434,7 @@ def calculate_metrics(
 
     metrics = BenchmarkMetrics(
         completed=completed,
+        failed=len(failed_outputs),
         total_input=total_input,
         total_output=sum(actual_output_lens),
         request_throughput=completed / dur_s,
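Note that the two call sites arrive at the failure count differently: calculate_metrics_for_embeddings keeps a running failed counter inside its loop, while calculate_metrics partitions the outputs into successful_outputs and failed_outputs up front and reports len(failed_outputs). Either way, completed plus failed equals the total number of requests, so a run with errors no longer looks like a smaller fully successful run.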
@@ -734,6 +742,7 @@ async def benchmark(
 
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
     if max_concurrency is not None:
         print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
     if request_rate != float("inf"):
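As a quick illustration (not part of the diff), Python's "<40" and "<10" format specs left-align each field, so the new line slots into the existing summary table:

# Illustration only: how the new summary line renders. The numbers are
# made up; the format strings are the ones used in the diff above.
print("{:<40} {:<10}".format("Successful requests:", 98))
print("{:<40} {:<10}".format("Failed requests:", 2))
# Output (values align at column 42):
# Successful requests:                     98
# Failed requests:                         2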
@@ -779,6 +788,7 @@ async def benchmark(
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
+        "failed": metrics.failed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "request_throughput": metrics.request_throughput,