[V0 deprecation] Remove _set_default_args_v0 function (#25409)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-20 23:03:52 +08:00 · 2025-09-23 09:52:09 +08:00
parent a0b5617263
commit 1b30043f0d
1 changed files with 11 additions and 72 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -1147,20 +1147,15 @@ class EngineArgs:
        else:
            envs.set_vllm_use_v1(use_v1)

-        # Set default arguments for V0 or V1 Engine.
-        if use_v1:
-            self._set_default_args_v1(usage_context, model_config)
-            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
-            if current_platform.is_cpu(
-            ) and current_platform.get_cpu_architecture() in (
-                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
-                logger.info(
-                    "Chunked prefill is not supported for ARM and POWER "
-                    "and S390X CPUs; "
-                    "disabling it for V1 backend.")
-                self.enable_chunked_prefill = False
-        else:
-            self._set_default_args_v0(model_config)
+        # Set default arguments for V1 Engine.
+        self._set_default_args(usage_context, model_config)
+        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
+        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
+        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
+            logger.info("Chunked prefill is not supported for ARM and POWER "
+                        "and S390X CPUs; "
+                        "disabling it for V1 backend.")
+            self.enable_chunked_prefill = False
        assert self.enable_chunked_prefill is not None

        sliding_window: Optional[int] = None
@ -1528,64 +1523,8 @@ class EngineArgs:

        return True

-    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
-        """Set Default Arguments for V0 Engine."""
-
-        max_model_len = model_config.max_model_len
-        use_long_context = max_model_len > 32768
-        if self.enable_chunked_prefill is None:
-            # Chunked prefill not supported for Multimodal or MLA in V0.
-            if model_config.is_multimodal_model or model_config.use_mla:
-                self.enable_chunked_prefill = False
-
-            # Enable chunked prefill by default for long context (> 32K)
-            # models to avoid OOM errors in initial memory profiling phase.
-            elif use_long_context:
-                is_gpu = current_platform.is_cuda()
-                use_sliding_window = (model_config.get_sliding_window()
-                                      is not None)
-                use_spec_decode = self.speculative_config is not None
-
-                if (is_gpu and not use_sliding_window and not use_spec_decode
-                        and not self.enable_lora):
-                    self.enable_chunked_prefill = True
-                    logger.warning(
-                        "Chunked prefill is enabled by default for models "
-                        "with max_model_len > 32K. Chunked prefill might "
-                        "not work with some features or models. If you "
-                        "encounter any issues, please disable by launching "
-                        "with --enable-chunked-prefill=False.")
-
-            if self.enable_chunked_prefill is None:
-                self.enable_chunked_prefill = False
-
-        if not self.enable_chunked_prefill and use_long_context:
-            logger.warning(
-                "The model has a long context length (%s). This may cause"
-                "OOM during the initial memory profiling phase, or result "
-                "in low performance due to small KV cache size. Consider "
-                "setting --max-model-len to a smaller value.", max_model_len)
-
-        # Disable prefix caching for multimodal models for VLLM_V0.
-        if self.enable_prefix_caching and model_config.is_multimodal_model:
-            logger.warning(
-                "--enable-prefix-caching is not supported for multimodal "
-                "models in V0 and has been disabled.")
-            self.enable_prefix_caching = False
-
-            if self.enable_prompt_embeds:
-                logger.warning(
-                    "--enable-prompt-embeds and --enable-prefix-caching "
-                    "are not supported together in V0. Prefix caching has "
-                    "been disabled.")
-                self.enable_prefix_caching = False
-
-        # Set max_num_seqs to 256 for VLLM_V0.
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 256
-
-    def _set_default_args_v1(self, usage_context: UsageContext,
-                             model_config: ModelConfig) -> None:
+    def _set_default_args(self, usage_context: UsageContext,
+                          model_config: ModelConfig) -> None:
        """Set Default Arguments for V1 Engine."""

        # V1 always uses chunked prefills and prefix caching