diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 258f2f460f..9a37803a7f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -780,7 +780,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, ()
+                req_id, []
             )
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
@@ -791,7 +791,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 ] = spec_token_ids
                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
                 self.input_batch.num_tokens[req_index] += num_spec_tokens
-                self.input_batch.spec_token_ids[req_index] = spec_token_ids
+
+            # When speculative decoding is used with structured output,
+            # the scheduler can drop draft tokens that do not
+            # conform to the schema. This can result in
+            # scheduler_output.scheduled_spec_decode_tokens being empty,
+            # even when speculative decoding is enabled.
+            self.input_batch.spec_token_ids[req_index] = spec_token_ids
 
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
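
The behavioral fix is in the second hunk: the `self.input_batch.spec_token_ids[req_index]` assignment moves out of the `if spec_token_ids:` block, so the per-request slot is reset even on steps where the scheduler returns no draft tokens (the first hunk's `()` to `[]` default change keeps the stored value a list either way). Below is a minimal sketch of the stale-token hazard this avoids; `InputBatch` and `update_spec_tokens` are hypothetical stand-ins that model only the field touched by the diff, not the real vLLM classes.

# Hypothetical stand-in for the part of vLLM's InputBatch this diff touches.
class InputBatch:
    def __init__(self, max_num_reqs: int) -> None:
        # One list of draft token ids per request slot.
        self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)]

def update_spec_tokens(batch: InputBatch, req_index: int, req_id: str,
                       scheduled_spec_decode_tokens: dict[str, list[int]]) -> None:
    spec_token_ids = scheduled_spec_decode_tokens.get(req_id, [])
    # Pre-diff, this assignment lived inside `if spec_token_ids:`, so a slot
    # kept stale drafts whenever the scheduler dropped every draft token
    # (e.g. none conformed to a structured-output schema). Post-diff, the
    # assignment is unconditional, so an empty schedule clears the slot.
    batch.spec_token_ids[req_index] = spec_token_ids

batch = InputBatch(max_num_reqs=1)
# Step 1: two draft tokens are scheduled for the request.
update_spec_tokens(batch, 0, "req-0", {"req-0": [7, 8]})
assert batch.spec_token_ids[0] == [7, 8]
# Step 2: structured output rejected all drafts, so nothing is scheduled.
update_spec_tokens(batch, 0, "req-0", {})
assert batch.spec_token_ids[0] == []  # pre-diff, the stale [7, 8] would linger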