[V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
@@ -35,19 +35,6 @@ The following metrics are exposed:

     --8<-- "vllm/engine/metrics.py:metrics-definitions"
     ```
 
-The following metrics are deprecated and due to be removed in a future version:
-
-- vllm:num_requests_swapped, vllm:cpu_cache_usage_perc, and
-  vllm:cpu_prefix_cache_hit_rate because KV cache offloading is not
-  used in V1.
-- vllm:gpu_prefix_cache_hit_rate is replaced by queries+hits
-  counters in V1.
-- vllm:time_in_queue_requests because it duplicates
-  vllm:request_queue_time_seconds.
-- vllm:model_forward_time_milliseconds and
-  vllm:model_execute_time_milliseconds because
-  prefill/decode/inference time metrics should be used instead.
-
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
 and are then removed in version `X.Y+2`.

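The removed documentation above points operators at the V1 replacements: the prefix-cache hit rate is now derived from queries and hits counters instead of being read from a gauge. As a minimal illustration (not part of this commit), the sketch below scrapes a vLLM `/metrics` endpoint and computes the rate. It assumes a server on `localhost:8000`, the counter name `vllm:gpu_prefix_cache_queries` (named in the removed docstring) and `vllm:gpu_prefix_cache_hits` as the presumed companion counter.

```python
# Hypothetical illustration: derive the V1 prefix-cache hit rate from the
# replacement counters instead of the removed vllm:gpu_prefix_cache_hit_rate
# gauge. Endpoint and the *_hits counter name are assumptions.
import urllib.request

from prometheus_client.parser import text_string_to_metric_families

METRICS_URL = "http://localhost:8000/metrics"  # assumed vLLM server endpoint
QUERIES = "vllm:gpu_prefix_cache_queries"       # named in the removed docstring
HITS = "vllm:gpu_prefix_cache_hits"             # presumed companion counter


def counter_total(families, name):
    """Sum all samples of a counter family, tolerating the _total suffix."""
    total = 0.0
    for family in families:
        if family.name == name:
            for sample in family.samples:
                if sample.name in (name, name + "_total"):
                    total += sample.value
    return total


if __name__ == "__main__":
    text = urllib.request.urlopen(METRICS_URL).read().decode()
    families = list(text_string_to_metric_families(text))
    queries = counter_total(families, QUERIES)
    hits = counter_total(families, HITS)
    if queries:
        print(f"GPU prefix cache hit rate: {hits / queries:.2%}")
    else:
        print("No prefix cache queries recorded yet.")
```

On a 0.9 release the hidden gauges can still be re-enabled temporarily via the documented escape hatch, e.g. `--show-hidden-metrics-for-version=0.8`, until the removal in this commit lands.
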
@@ -577,23 +577,6 @@
           "refId": "A",
           "useBackend": false
         },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
-          "fullMetaSearch": false,
-          "hide": false,
-          "includeNullMetadata": true,
-          "instant": false,
-          "legendFormat": "Num Swapped",
-          "range": true,
-          "refId": "B",
-          "useBackend": false
-        },
         {
           "datasource": {
             "type": "prometheus",

@@ -874,19 +857,6 @@
           "legendFormat": "GPU Cache Usage",
           "range": true,
           "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "${DS_PROMETHEUS}"
-          },
-          "editorMode": "code",
-          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
-          "hide": false,
-          "instant": false,
-          "legendFormat": "CPU Cache Usage",
-          "range": true,
-          "refId": "B"
         }
       ],
       "title": "Cache Utilization",

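The two dashboard hunks above drop the Grafana panel targets that charted `vllm:num_requests_swapped` and `vllm:cpu_cache_usage_perc`. A quick way to confirm a dashboard export carries no stale references is to walk its JSON for `expr` strings; the sketch below is illustrative only, and the `grafana.json` path is a placeholder.

```python
# Minimal sketch: verify a Grafana dashboard JSON no longer references the
# metrics removed by this commit. File path is a placeholder.
import json

REMOVED_METRICS = [
    "vllm:num_requests_swapped",
    "vllm:cpu_cache_usage_perc",
    "vllm:cpu_prefix_cache_hit_rate",
    "vllm:gpu_prefix_cache_hit_rate",
    "vllm:time_in_queue_requests",
    "vllm:model_forward_time_milliseconds",
    "vllm:model_execute_time_milliseconds",
]


def exprs(node):
    """Yield every 'expr' string found anywhere in the dashboard JSON."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "expr" and isinstance(value, str):
                yield value
            else:
                yield from exprs(value)
    elif isinstance(node, list):
        for item in node:
            yield from exprs(item)


if __name__ == "__main__":
    with open("grafana.json") as f:  # placeholder path
        dashboard = json.load(f)
    stale = [e for e in exprs(dashboard)
             if any(name in e for name in REMOVED_METRICS)]
    assert not stale, f"dashboard still references removed metrics: {stale}"
    print("dashboard is clean")
```
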
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",

@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
-HIDDEN_DEPRECATED_METRICS = [
-    "vllm:num_requests_swapped",
-    "vllm:cpu_cache_usage_perc",
-]
+HIDDEN_DEPRECATED_METRICS: list[str] = []
 
 
 @pytest.mark.asyncio

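With nothing left to hide, the test now sets `HIDDEN_DEPRECATED_METRICS` to an empty list. The standalone sketch below shows the style of assertion such a test makes against a running server's `/metrics` output; it is not the vLLM test itself, and the URL and metric lists are trimmed for illustration.

```python
# Standalone sketch (not the vLLM test): scrape /metrics and assert that
# expected names are present and removed names are gone.
import urllib.request

METRICS_URL = "http://localhost:8000/metrics"  # assumed server address

EXPECTED = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
]
REMOVED = [
    "vllm:num_requests_swapped",
    "vllm:cpu_cache_usage_perc",
]


def check_metrics(url: str = METRICS_URL) -> None:
    body = urllib.request.urlopen(url).read().decode()
    for name in EXPECTED:
        assert name in body, f"missing expected metric: {name}"
    for name in REMOVED:
        assert name not in body, f"removed metric still exposed: {name}"


if __name__ == "__main__":
    check_metrics()
    print("metrics endpoint matches expectations")
```
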
@@ -1680,9 +1680,6 @@ class LLMEngine:
         time_inference_requests: List[float] = []
         time_prefill_requests: List[float] = []
         time_decode_requests: List[float] = []
-        time_in_queue_requests: List[float] = []
-        model_forward_time_requests: List[float] = []
-        model_execute_time_requests: List[float] = []
         # Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []

@@ -1790,15 +1787,6 @@
                         now - seq_group.metrics.first_token_time)
                     time_inference_requests.append(
                         now - seq_group.metrics.first_scheduled_time)
-                    if seq_group.metrics.time_in_queue is not None:
-                        time_in_queue_requests.append(
-                            seq_group.metrics.time_in_queue)
-                    if seq_group.metrics.model_forward_time is not None:
-                        model_forward_time_requests.append(
-                            seq_group.metrics.model_forward_time)
-                    if seq_group.metrics.model_execute_time is not None:
-                        model_execute_time_requests.append(
-                            seq_group.metrics.model_execute_time * 1000)
                     # Metadata
                     num_prompt_tokens_requests.append(
                         len(seq_group.prompt_token_ids))

@@ -1867,9 +1855,6 @@
             time_inference_requests=time_inference_requests,
             time_prefill_requests=time_prefill_requests,
             time_decode_requests=time_decode_requests,
-            time_in_queue_requests=time_in_queue_requests,
-            model_forward_time_requests=model_forward_time_requests,
-            model_execute_time_requests=model_execute_time_requests,
             # Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,

@@ -80,17 +80,6 @@ class Metrics:
             multiprocess_mode="livemostrecent",
         )
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_scheduler_swapped = self._gauge_cls(
-                name="vllm:num_requests_swapped",
-                documentation=(
-                    "Number of requests swapped to CPU. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",

@@ -98,35 +87,6 @@ class Metrics:
             labelnames=labelnames,
             multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_cpu_cache_usage = self._gauge_cls(
-                name="vllm:cpu_cache_usage_perc",
-                documentation=(
-                    "CPU KV-cache usage. 1 means 100 percent usage. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:cpu_prefix_cache_hit_rate",
-                documentation=(
-                    "CPU prefix cache block hit rate. "
-                    "DEPRECATED: KV cache offloading is not used in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-                name="vllm:gpu_prefix_cache_hit_rate",
-                documentation=("GPU prefix cache block hit rate. "
-                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-                               "and vllm:gpu_prefix_cache_queries in V1"),
-                labelnames=labelnames,
-                multiprocess_mode="sum")
-
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
             name="vllm:num_preemptions_total",

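Every block deleted above follows the same gating pattern: a deprecated gauge is only registered when `show_hidden_metrics` is true. The self-contained sketch below reproduces that pattern with `prometheus_client` directly; vLLM's `_gauge_cls` wrapper and `multiprocess_mode` handling are deliberately omitted, so treat this as an outline rather than the engine's code.

```python
# Sketch of the gating pattern the deleted blocks used: a deprecated gauge
# exists only behind the --show-hidden-metrics-for-version escape hatch.
from prometheus_client import CollectorRegistry, Gauge, generate_latest


def build_metrics(show_hidden_metrics: bool,
                  registry: CollectorRegistry) -> dict:
    labelnames = ["model_name"]
    metrics = {
        "gpu_cache_usage": Gauge(
            "vllm:gpu_cache_usage_perc",
            "GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames, registry=registry),
    }
    if show_hidden_metrics:
        # Hidden deprecated gauge: registered only when the operator opts in.
        metrics["cpu_cache_usage"] = Gauge(
            "vllm:cpu_cache_usage_perc",
            "DEPRECATED: KV cache offloading is not used in V1",
            labelnames, registry=registry)
    return metrics


if __name__ == "__main__":
    registry = CollectorRegistry()
    metrics = build_metrics(show_hidden_metrics=False, registry=registry)
    metrics["gpu_cache_usage"].labels(model_name="demo").set(0.42)
    print(generate_latest(registry).decode())
```

Run as-is it exposes only the non-deprecated gauge, which is exactly the behaviour this commit makes permanent by deleting the gated registrations.
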
@@ -200,36 +160,6 @@ class Metrics:
             "Histogram of time spent in DECODE phase for request.",
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_time_in_queue_request = self._histogram_cls(
-                name="vllm:time_in_queue_requests",
-                documentation=
-                ("Histogram of time the request spent in the queue in seconds. "
-                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-                labelnames=labelnames,
-                buckets=request_latency_buckets)
-
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # Hidden in 0.9, due to be removed in 0.10
-        if self.show_hidden_metrics:
-            self.histogram_model_forward_time_request = self._histogram_cls(
-                name="vllm:model_forward_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model forward pass in ms. "
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
-            self.histogram_model_execute_time_request = self._histogram_cls(
-                name="vllm:model_execute_time_milliseconds",
-                documentation=
-                ("Histogram of time spent in the model execute function in ms."
-                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-                 ),
-                labelnames=labelnames,
-                buckets=build_1_2_3_5_8_buckets(3000))
 
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(

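The deleted histograms sized their buckets with `build_1_2_3_5_8_buckets(3000)`. The helper itself is not shown in this diff; the sketch below is a reconstruction of what a builder with that name plausibly computes (mantissas 1, 2, 3, 5, 8 repeated across powers of ten up to the cap), offered as an assumption rather than vLLM's exact implementation.

```python
# Reconstruction for illustration: histogram bucket boundaries whose
# mantissas cycle through 1, 2, 3, 5, 8 across successive powers of ten,
# stopping once the cap is exceeded. Not copied from vLLM.
from typing import List


def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    mantissas = [1, 2, 3, 5, 8]
    buckets: List[int] = []
    exponent = 0
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1


if __name__ == "__main__":
    # [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, ..., 2000, 3000]
    print(build_1_2_3_5_8_buckets(3000))
```
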
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        if self.metrics.show_hidden_metrics:
-            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                            stats.cpu_cache_usage_sys)
-            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                            stats.cpu_prefix_cache_hit_rate)
-            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {

@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        if self.metrics.show_hidden_metrics:
-            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                                stats.time_in_queue_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_forward_time_request,
-                stats.model_forward_time_requests)
-            self._log_histogram(
-                self.metrics.histogram_model_execute_time_request,
-                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)

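The removed logging calls route through the `_log_gauge` and `_log_histogram` helpers, whose bodies sit outside this diff. Assuming the obvious behaviour, the sketch below shows a gauge receiving one scalar per logging interval while a histogram observes every per-request value collected in `Stats`, with labels reduced to `model_name` for brevity; it mirrors the pattern in the diff, not vLLM's exact helper code.

```python
# Hedged sketch of the helper pattern used above: gauges take one scalar,
# histograms observe each value from the per-request lists in Stats.
from typing import Iterable

from prometheus_client import Gauge, Histogram

MODEL_NAME = "demo-model"  # assumed label value


def log_gauge(gauge: Gauge, value: float) -> None:
    gauge.labels(model_name=MODEL_NAME).set(value)


def log_histogram(histogram: Histogram, values: Iterable[float]) -> None:
    labeled = histogram.labels(model_name=MODEL_NAME)
    for value in values:
        labeled.observe(value)


if __name__ == "__main__":
    queue_time = Histogram("request_queue_time_seconds",
                           "Time spent waiting in the queue.",
                           ["model_name"])
    log_histogram(queue_time, [0.03, 0.5, 1.2])
```
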
@@ -53,9 +53,6 @@ class Stats:
     time_inference_requests: List[float]
     time_prefill_requests: List[float]
     time_decode_requests: List[float]
-    time_in_queue_requests: List[float]
-    model_forward_time_requests: List[float]
-    model_execute_time_requests: List[float]
     # Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]