mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (#26586)
Signed-off-by: southfreebird <yvorott@gmail.com>
This commit is contained in:
@ -780,7 +780,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
|
|
||||||
# Add spec_token_ids to token_ids_cpu.
|
# Add spec_token_ids to token_ids_cpu.
|
||||||
spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
|
spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
|
||||||
req_id, ()
|
req_id, []
|
||||||
)
|
)
|
||||||
if spec_token_ids:
|
if spec_token_ids:
|
||||||
num_spec_tokens = len(spec_token_ids)
|
num_spec_tokens = len(spec_token_ids)
|
||||||
@ -791,7 +791,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
] = spec_token_ids
|
] = spec_token_ids
|
||||||
# NOTE(woosuk): `num_tokens` here may include spec tokens.
|
# NOTE(woosuk): `num_tokens` here may include spec tokens.
|
||||||
self.input_batch.num_tokens[req_index] += num_spec_tokens
|
self.input_batch.num_tokens[req_index] += num_spec_tokens
|
||||||
self.input_batch.spec_token_ids[req_index] = spec_token_ids
|
|
||||||
|
# When speculative decoding is used with structured output,
|
||||||
|
# the scheduler can drop draft tokens that do not
|
||||||
|
# conform to the schema. This can result in
|
||||||
|
# scheduler_output.scheduled_spec_decode_tokens being empty,
|
||||||
|
# even when speculative decoding is enabled.
|
||||||
|
self.input_batch.spec_token_ids[req_index] = spec_token_ids
|
||||||
|
|
||||||
# Add the new or resumed requests to the persistent batch.
|
# Add the new or resumed requests to the persistent batch.
|
||||||
# The smaller empty indices are filled first.
|
# The smaller empty indices are filled first.
|
||||||
|
Reference in New Issue
Block a user