Compare commits

...

2 Commits

2 changed files with 9 additions and 3 deletions

View File

@@ -94,7 +94,7 @@ class KVTransferConfig:
         if self.kv_connector is not None and self.kv_role is None:
             raise ValueError(
-                "Please specify kv_disagg_role when kv_connector "
+                "Please specify kv_role when kv_connector "
                 f"is set, supported roles are {get_args(KVRole)}"
             )

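The hunk above is purely an error-message fix: the check reads `self.kv_role`, but the message told users to set `kv_disagg_role`, which does not match the field name. A minimal, self-contained sketch of that validation path; the `KVRole` values and the connector name below are illustrative stand-ins, not taken from this diff:

from dataclasses import dataclass
from typing import Literal, Optional, get_args

# Stand-in for vLLM's KVRole alias; the exact values here are assumptions.
KVRole = Literal["kv_producer", "kv_consumer", "kv_both"]

@dataclass
class KVTransferConfigSketch:
    """Sketch of the validation touched above, not vLLM's actual class."""
    kv_connector: Optional[str] = None
    kv_role: Optional[str] = None

    def __post_init__(self) -> None:
        if self.kv_connector is not None and self.kv_role is None:
            # The message now names the real field, kv_role.
            raise ValueError(
                "Please specify kv_role when kv_connector "
                f"is set, supported roles are {get_args(KVRole)}"
            )

# A connector without a role triggers the corrected message.
try:
    KVTransferConfigSketch(kv_connector="SomeConnector")
except ValueError as err:
    print(err)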
View File

@@ -780,7 +780,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
-                req_id, ()
+                req_id, []
             )
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
@@ -791,7 +791,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 ] = spec_token_ids
                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
                 self.input_batch.num_tokens[req_index] += num_spec_tokens
-                self.input_batch.spec_token_ids[req_index] = spec_token_ids
+            # When speculative decoding is used with structured output,
+            # the scheduler can drop draft tokens that do not
+            # conform to the schema. This can result in
+            # scheduler_output.scheduled_spec_decode_tokens being empty,
+            # even when speculative decoding is enabled.
+            self.input_batch.spec_token_ids[req_index] = spec_token_ids
         # Add the new or resumed requests to the persistent batch.
         # The smaller empty indices are filled first.
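The model-runner change is the actual bug fix. When speculative decoding runs together with structured output, the scheduler can reject every draft token that violates the schema, so `scheduled_spec_decode_tokens` ends up with no entry for the request; previously `input_batch.spec_token_ids[req_index]` was only written inside `if spec_token_ids:`, so drafts from an earlier step stayed in the persistent batch. Hoisting the assignment out of the block (with the `.get` default changed from `()` to `[]`, likely so the stored value stays a list) clears them. A toy sketch of the difference, using made-up helper names that stand in for the model-runner logic and are not vLLM APIs:

# Hypothetical helpers illustrating the before/after behavior of the hunk above.

def update_spec_tokens_old(batch_spec: list[list[int]], req_index: int,
                           scheduled: dict[str, list[int]], req_id: str) -> None:
    spec_token_ids = scheduled.get(req_id, [])
    if spec_token_ids:
        # Old behavior: nothing is written when the scheduler dropped all drafts,
        # so whatever was stored for this slot in the previous step survives.
        batch_spec[req_index] = spec_token_ids

def update_spec_tokens_new(batch_spec: list[list[int]], req_index: int,
                           scheduled: dict[str, list[int]], req_id: str) -> None:
    spec_token_ids = scheduled.get(req_id, [])
    if spec_token_ids:
        pass  # copy drafts into token_ids_cpu and bump num_tokens, as in the hunk above
    # New behavior: always overwrite, so stale drafts are cleared when none were scheduled.
    batch_spec[req_index] = spec_token_ids

batch_spec = [[7, 8, 9]]              # drafts stored for request 0 in the previous step
scheduled: dict[str, list[int]] = {}  # structured output rejected every draft this step

update_spec_tokens_old(batch_spec, 0, scheduled, "req-0")
print(batch_spec[0])  # [7, 8, 9] -- stale drafts are still there

update_spec_tokens_new(batch_spec, 0, scheduled, "req-0")
print(batch_spec[0])  # [] -- matches what the scheduler actually scheduled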