vllm bench serve shows number of failed requests (#26478)

Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
Author: Tomas Ruiz
Date: 2025-10-17 04:55:09 +02:00
Committed by: GitHub
Parent: 4d055ef465
Commit: 965c5f4914


@@ -64,6 +64,7 @@ class TaskType(Enum):
 @dataclass
 class BenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     total_output: int
     request_throughput: float
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     request_throughput: float
     total_token_throughput: float
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
     """
     total_input = 0
     completed = 0
+    failed = 0
     e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             e2els.append(outputs[i].latency)
             completed += 1
             total_input += outputs[i].prompt_len
+        else:
+            failed += 1
     if completed == 0:
         warnings.warn(
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
     )
     metrics = EmbedBenchmarkMetrics(
         completed=completed,
+        failed=failed,
         total_input=total_input,
         request_throughput=completed / dur_s,
         total_token_throughput=total_input / dur_s,
@@ -366,6 +372,7 @@ def calculate_metrics(
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
     if successful_outputs:
         min_start_time = min(output.start_time for output in successful_outputs)
         max_end_time = max(
@@ -427,6 +434,7 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
+        failed=len(failed_outputs),
         total_input=total_input,
         total_output=sum(actual_output_lens),
         request_throughput=completed / dur_s,
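
For context, the success/failure split used above can be exercised on its own. A minimal sketch, assuming a simplified stand-in record with a success flag rather than the benchmark script's actual per-request output class:

from dataclasses import dataclass

@dataclass
class FakeOutput:
    # Hypothetical stand-in for the benchmark's per-request output record.
    success: bool
    prompt_len: int = 0
    latency: float = 0.0

def count_outcomes(outputs: list[FakeOutput]) -> tuple[int, int]:
    """Count successful and failed requests, mirroring the counting in the diff."""
    completed = sum(1 for o in outputs if o.success)
    failed = sum(1 for o in outputs if not o.success)
    return completed, failed

outputs = [FakeOutput(success=True), FakeOutput(success=False), FakeOutput(success=True)]
completed, failed = count_outcomes(outputs)
print(f"Successful requests: {completed}")  # 2
print(f"Failed requests: {failed}")         # 1
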
@@ -734,6 +742,7 @@ async def benchmark(
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
     if max_concurrency is not None:
         print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
     if request_rate != float("inf"):
@@ -779,6 +788,7 @@ async def benchmark(
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
+        "failed": metrics.failed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "request_throughput": metrics.request_throughput,