Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00
[BugFix] Pad input buffers in _dummy_run (#26209)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
commit f23b4c04fd
parent 93540958b8
committed via GitHub
@@ -3434,26 +3434,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         with self.maybe_dummy_run_with_lora(
             self.lora_config, num_scheduled_tokens, remove_lora
         ):
-            model_kwargs = self._init_model_kwargs(num_tokens)
+            # Make sure padding doesn't exceed max_num_tokens
+            assert num_tokens_after_padding <= self.max_num_tokens
+            model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
                 model_kwargs = {
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
                 }
             elif self.enable_prompt_embeds:
                 input_ids = None
-                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
-                model_kwargs = self._init_model_kwargs(num_tokens)
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
+                model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             else:
-                input_ids = self.input_ids.gpu[:num_tokens]
+                input_ids = self.input_ids.gpu[:num_tokens_after_padding]
                 inputs_embeds = None
 
             if self.uses_mrope:
-                positions = self.mrope_positions.gpu[:, :num_tokens]
+                positions = self.mrope_positions.gpu[:, :num_tokens_after_padding]
             else:
-                positions = self.positions.gpu[:num_tokens]
+                positions = self.positions.gpu[:num_tokens_after_padding]
 
             if get_pp_group().is_first_rank:
                 intermediate_tensors = None
@@ -3468,7 +3470,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 )
 
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
-                num_tokens, None, False
+                num_tokens_after_padding, None, False
             )
 
             # filter out the valid batch descriptor
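For context, a minimal sketch (not vLLM's actual code) of what the fix changes: when a dummy batch is padded, every preallocated input buffer sliced for the forward pass must use the padded token count rather than the raw scheduled count, so the dummy run exercises the same tensor shapes as a real padded run. The buffer sizes, padding granularity, and round_up helper below are illustrative assumptions.

import torch

def round_up(x: int, multiple: int) -> int:
    # Hypothetical helper: round x up to the nearest multiple.
    return -(-x // multiple) * multiple

max_num_tokens = 8192                      # assumed capacity of the buffers
input_ids_buf = torch.zeros(max_num_tokens, dtype=torch.long)
positions_buf = torch.zeros(max_num_tokens, dtype=torch.long)

num_tokens = 100                           # tokens actually scheduled
num_tokens_after_padding = round_up(num_tokens, 8)  # assumed granularity
assert num_tokens_after_padding <= max_num_tokens

# Before the fix: slices used num_tokens, so the shapes seen during the
# dummy run differed from those of a real padded run. After the fix:
# slicing to the padded length keeps both in agreement.
input_ids = input_ids_buf[:num_tokens_after_padding]
positions = positions_buf[:num_tokens_after_padding]
print(input_ids.shape, positions.shape)    # both torch.Size([104])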