diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
index 6e1d472244..d3f69b2e50 100644
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -12,7 +12,7 @@ from typing import Any, Optional, TypeAlias
 import torch
 import tqdm
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, SamplingParams, envs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.profiler import layerwise_profile
 from vllm.utils import FlexibleArgumentParser
@@ -261,8 +261,13 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
 
     decode_profs = []
     for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
-        num_running_seqs = llm.llm_engine.scheduler[
-            0].get_num_unfinished_seq_groups()
+        if envs.VLLM_USE_V1:
+            num_running_seqs = llm.llm_engine.scheduler[
+                0].get_num_unfinished_requests()
+        else:
+            num_running_seqs = llm.llm_engine.scheduler[
+                0].get_num_unfinished_seq_groups()
+
         with layerwise_profile(
                 num_running_seqs=num_running_seqs) as decode_prof:
             llm.llm_engine.step()
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 4c67186f70..e771362cce 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -69,6 +69,8 @@ class LLMEngine:
         self.dp_group = None
         self.should_execute_dummy_batch = False
 
+        self.scheduler_config = vllm_config.scheduler_config
+
         # Tokenizer (+ ensure liveness if running in another process).
         self.tokenizer = init_tokenizer_from_configs(
             model_config=vllm_config.model_config,
@@ -98,6 +100,9 @@ class LLMEngine:
         if not multiprocess_mode:
             # for v0 compatibility
             self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
+            self.scheduler = [
+                self.engine_core.engine_core.scheduler  # type: ignore
+            ]  # type: ignore
 
     @classmethod
     def from_vllm_config(
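
For reference, a minimal sketch (not part of the diff) of the version-agnostic scheduler lookup that profiling.py now performs: with the LLMEngine change above, llm.llm_engine.scheduler[0] resolves on both engines when the V1 engine runs in-process, and only the counting method differs. The helper name below is an illustrative assumption.

# Illustrative sketch, not part of the diff above. It factors the branch added to
# examples/offline_inference/profiling.py into a helper; the function name is an
# assumption, and on V1 it relies on self.scheduler being set, which the diff only
# does when the engine runs in-process (i.e. not multiprocess_mode).
from vllm import LLM, envs


def get_num_running_seqs(llm: LLM) -> int:
    """Count unfinished work on the engine's first scheduler, V0 or V1."""
    scheduler = llm.llm_engine.scheduler[0]
    if envs.VLLM_USE_V1:
        # The V1 scheduler tracks requests rather than sequence groups.
        return scheduler.get_num_unfinished_requests()
    return scheduler.get_num_unfinished_seq_groups()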