[V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Author: Mark McLoughlin
Date: 2025-05-28 19:54:12 +01:00
Committed by: GitHub
parent c68b5c63eb
commit 0e98964e94
6 changed files with 1 addition and 156 deletions

View File

@@ -35,19 +35,6 @@ The following metrics are exposed:
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
The following metrics are deprecated and due to be removed in a future version:

- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
  `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
  used in V1.
- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
  counters in V1 (see the sketch below).
- `vllm:time_in_queue_requests` because it duplicates
  `vllm:request_queue_time_seconds`.
- `vllm:model_forward_time_milliseconds` and
  `vllm:model_execute_time_milliseconds` because
  prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
and are then removed in version `X.Y+2`.
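
The `vllm:gpu_prefix_cache_hit_rate` replacement is a pair of V1 counters, so the hit rate has to be derived on the consumer side (in PromQL, or in whatever scrapes `/metrics`). A minimal sketch of that derivation, assuming a server exposing `/metrics` at `localhost:8000` and the counter names `vllm:gpu_prefix_cache_queries` / `vllm:gpu_prefix_cache_hits` (the hits name is inferred from the "queries+hits" wording above, and the exposed samples may carry a `_total` suffix); `counter_sum` is a helper written for this sketch, not a vLLM API:

```python
# Sketch only: recover a GPU prefix cache hit rate from the V1 counters
# instead of the removed vllm:gpu_prefix_cache_hit_rate gauge.
# Assumptions: a local vLLM server at METRICS_URL; counter names follow the
# "queries+hits" wording above and may differ slightly in your deployment.
import urllib.request

METRICS_URL = "http://localhost:8000/metrics"  # hypothetical local endpoint


def counter_sum(exposition: str, name: str) -> float:
    """Sum every sample of `name` across label sets, accepting either the
    bare name or a `_total`-suffixed sample, skipping comment lines."""
    total = 0.0
    for line in exposition.splitlines():
        if line.startswith("#") or not line.strip():
            continue
        sample_name = line.split("{", 1)[0].split(" ", 1)[0]
        if sample_name in (name, name + "_total"):
            total += float(line.rsplit(" ", 1)[-1])
    return total


with urllib.request.urlopen(METRICS_URL) as resp:
    body = resp.read().decode()

queries = counter_sum(body, "vllm:gpu_prefix_cache_queries")
hits = counter_sum(body, "vllm:gpu_prefix_cache_hits")
print(f"GPU prefix cache hit rate: {hits / queries:.2%}" if queries
      else "no prefix cache queries recorded yet")
```

Until removal, the old gauges and histograms can still be re-enabled on the server with the `--show-hidden-metrics-for-version=0.8` escape hatch described in the note above.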

View File

@@ -577,23 +577,6 @@
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Swapped",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
@@ -874,19 +857,6 @@
"legendFormat": "GPU Cache Usage",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
"hide": false,
"instant": false,
"legendFormat": "CPU Cache Usage",
"range": true,
"refId": "B"
}
],
"title": "Cache Utilization",

View File

@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_swapped", # deprecated
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:cpu_cache_usage_perc", # deprecated
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count",
]
HIDDEN_DEPRECATED_METRICS = [
"vllm:num_requests_swapped",
"vllm:cpu_cache_usage_perc",
]
HIDDEN_DEPRECATED_METRICS: list[str] = []
@pytest.mark.asyncio

View File

@@ -1680,9 +1680,6 @@ class LLMEngine:
time_inference_requests: List[float] = []
time_prefill_requests: List[float] = []
time_decode_requests: List[float] = []
time_in_queue_requests: List[float] = []
model_forward_time_requests: List[float] = []
model_execute_time_requests: List[float] = []
# Metadata
num_prompt_tokens_requests: List[int] = []
num_generation_tokens_requests: List[int] = []
@@ -1790,15 +1787,6 @@
now - seq_group.metrics.first_token_time)
time_inference_requests.append(
now - seq_group.metrics.first_scheduled_time)
if seq_group.metrics.time_in_queue is not None:
time_in_queue_requests.append(
seq_group.metrics.time_in_queue)
if seq_group.metrics.model_forward_time is not None:
model_forward_time_requests.append(
seq_group.metrics.model_forward_time)
if seq_group.metrics.model_execute_time is not None:
model_execute_time_requests.append(
seq_group.metrics.model_execute_time * 1000)
# Metadata
num_prompt_tokens_requests.append(
len(seq_group.prompt_token_ids))
@@ -1867,9 +1855,6 @@
time_inference_requests=time_inference_requests,
time_prefill_requests=time_prefill_requests,
time_decode_requests=time_decode_requests,
time_in_queue_requests=time_in_queue_requests,
model_forward_time_requests=model_forward_time_requests,
model_execute_time_requests=model_execute_time_requests,
# Metadata
num_prompt_tokens_requests=num_prompt_tokens_requests,
num_generation_tokens_requests=num_generation_tokens_requests,

View File

@@ -80,17 +80,6 @@ class Metrics:
multiprocess_mode="livemostrecent",
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped",
documentation=(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# KV Cache Usage in %
self.gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc",
@@ -98,35 +87,6 @@
labelnames=labelnames,
multiprocess_mode="sum")
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_cpu_cache_usage = self._gauge_cls(
name="vllm:cpu_cache_usage_perc",
documentation=(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:cpu_prefix_cache_hit_rate",
documentation=(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:gpu_prefix_cache_hit_rate",
documentation=("GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries "
"and vllm:gpu_prefix_cache_queries in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# Iteration stats
self.counter_num_preemption = self._counter_cls(
name="vllm:num_preemptions_total",
@@ -200,36 +160,6 @@
"Histogram of time spent in DECODE phase for request.",
labelnames=labelnames,
buckets=request_latency_buckets)
# Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.histogram_time_in_queue_request = self._histogram_cls(
name="vllm:time_in_queue_requests",
documentation=
("Histogram of time the request spent in the queue in seconds. "
"DEPRECATED: use vllm:request_queue_time_seconds instead."),
labelnames=labelnames,
buckets=request_latency_buckets)
# Deprecated in 0.8 - use prefill/decode/inference time metrics
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.histogram_model_forward_time_request = self._histogram_cls(
name="vllm:model_forward_time_milliseconds",
documentation=
("Histogram of time spent in the model forward pass in ms. "
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
self.histogram_model_execute_time_request = self._histogram_cls(
name="vllm:model_execute_time_milliseconds",
documentation=
("Histogram of time spent in the model execute function in ms."
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
# Metadata
self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
# System state data
self._log_gauge(self.metrics.gauge_scheduler_running,
stats.num_running_sys)
if self.metrics.show_hidden_metrics:
self._log_gauge(self.metrics.gauge_scheduler_swapped,
stats.num_swapped_sys)
self._log_gauge(self.metrics.gauge_scheduler_waiting,
stats.num_waiting_sys)
self._log_gauge(self.metrics.gauge_gpu_cache_usage,
stats.gpu_cache_usage_sys)
if self.metrics.show_hidden_metrics:
self._log_gauge(self.metrics.gauge_cpu_cache_usage,
stats.cpu_cache_usage_sys)
self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)
# Including max-lora in the metric; in future this property of the lora
# config may be extended to be dynamic.
lora_info = {
@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
stats.time_prefill_requests)
self._log_histogram(self.metrics.histogram_decode_time_request,
stats.time_decode_requests)
if self.metrics.show_hidden_metrics:
self._log_histogram(self.metrics.histogram_time_in_queue_request,
stats.time_in_queue_requests)
self._log_histogram(
self.metrics.histogram_model_forward_time_request,
stats.model_forward_time_requests)
self._log_histogram(
self.metrics.histogram_model_execute_time_request,
stats.model_execute_time_requests)
# Metadata
finished_reason_counter = CollectionsCounter(
stats.finished_reason_requests)
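
The blocks removed above all follow the same gating pattern: a deprecated metric is only instantiated, and only logged, when the operator opts back in via the show-hidden-metrics escape hatch. A minimal, self-contained sketch of that pattern using plain `prometheus_client` (illustrative metric and class names, not vLLM's wrapper API):

```python
# Minimal sketch of the show_hidden_metrics gating pattern removed above.
# Illustrative names only; vLLM wraps prometheus_client in its own classes.
from prometheus_client import Gauge


class DemoMetrics:
    def __init__(self, show_hidden_metrics: bool, labelnames: list[str]):
        self.show_hidden_metrics = show_hidden_metrics
        if self.show_hidden_metrics:
            # Deprecated gauge: only registered behind the escape hatch,
            # so opted-out deployments never export it at all.
            self.gauge_swapped = Gauge(
                "demo:num_requests_swapped",
                "Number of requests swapped to CPU. DEPRECATED.",
                labelnames=labelnames)

    def log(self, num_swapped: int, labels: dict[str, str]) -> None:
        # Logging is gated on the same flag, so the attribute access is safe.
        if self.show_hidden_metrics:
            self.gauge_swapped.labels(**labels).set(num_swapped)


metrics = DemoMetrics(show_hidden_metrics=False, labelnames=["model_name"])
metrics.log(num_swapped=0, labels={"model_name": "demo-model"})
```

When the escape-hatch window closes, both the declaration branch and the logging branch disappear, which is what this commit does for the metrics deprecated in 0.8.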

View File

@@ -53,9 +53,6 @@ class Stats:
time_inference_requests: List[float]
time_prefill_requests: List[float]
time_decode_requests: List[float]
time_in_queue_requests: List[float]
model_forward_time_requests: List[float]
model_execute_time_requests: List[float]
# Metadata
num_prompt_tokens_requests: List[int]
num_generation_tokens_requests: List[int]