[Misc] Fix estimated max model len msg (#18966)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Author: Yong Hoon Shin
Date: 2025-05-31 01:43:44 -07:00
Committed by: GitHub
Parent: dff80b0e42
Commit: 1e123529d7


@@ -544,16 +544,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
                                                    available_memory)
         estimated_msg = ""
         if estimated_max_len > 0:
-            estimated_msg = " Based on the available memory,"
-            f" the estimated maximum model length is {estimated_max_len}."
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}.")
 
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
             f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/GiB_bytes:.2f} GiB)."
+            f"memory ({available_memory/GiB_bytes:.2f} GiB). "
             f"{estimated_msg} "
-            f" Try increasing `gpu_memory_utilization` or decreasing "
+            f"Try increasing `gpu_memory_utilization` or decreasing "
             f"`max_model_len` when initializing the engine.")