Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
@@ -20,7 +20,6 @@ def _make_model_runner_output(
    req_ids = list(scheduler_output.num_scheduled_tokens.keys())
    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)},
        sampled_token_ids=[[i] for i in range(len(req_ids))],
        logprobs=None,
        prompt_logprobs_dict={},
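The helper above fabricates a plausible ModelRunnerOutput for whatever the scheduler just scheduled. A minimal usage sketch, assuming a `scheduler` instance and the v1 Scheduler calls (schedule / update_from_output) exercised elsewhere in these tests:

    # Drive one scheduling step and feed the fabricated output back so the
    # scheduler advances request state. Call order is an assumption based on
    # how the surrounding tests use the Scheduler.
    scheduler_output = scheduler.schedule()
    model_runner_output = _make_model_runner_output(scheduler_output)
    scheduler.update_from_output(scheduler_output, model_runner_output)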
@@ -163,10 +163,8 @@ def test_schedule_partial_requests():
    # The third request is also scheduled partially.
    # The <img> tokens are not scheduled because of the encoder budget.
    assert output.num_scheduled_tokens[requests[2].request_id] == 100
    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        # Only the first request has a sampled token id because
        # the rest requests are still being prefilled.
        sampled_token_ids=[[0], [], []],
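In these fixtures each inner list of sampled_token_ids holds the tokens a request produced in this step; an empty list means the request generated nothing yet because it is still prefilling. A small self-contained illustration of that shape, with made-up values:

    sampled_token_ids = [[0], [], []]   # only request 0 has finished its prefill
    produced_output = [len(ids) > 0 for ids in sampled_token_ids]
    assert produced_output == [True, False, False]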
@@ -212,10 +210,8 @@ def test_no_mm_input_chunking():
    # We want to only see the 400 text tokens at the start scheduled
    assert output.num_scheduled_tokens[requests[0].request_id] == 400

    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[[] for _ in range(len(requests))],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -272,10 +268,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    assert output.num_scheduled_tokens[requests[1].request_id] == 400
    # The third request is also scheduled partially - 1024 - 400 - 400 = 224.
    assert output.num_scheduled_tokens[requests[2].request_id] == 224
    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[[] for _ in range(len(requests))],
        logprobs=None,
        prompt_logprobs_dict={},
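The arithmetic in the comment above reflects a shared per-step token budget. A sketch with the numbers from the test; the budget value of 1024 is taken from the comment, not from code shown in this hunk:

    token_budget = 1024                 # per-step budget implied by the comment
    first_request = 400
    second_request = 400
    third_request = token_budget - first_request - second_request
    assert third_request == 224         # matches the assert in the hunk above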
@@ -299,7 +293,6 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    # All the remaining tokens in the third request are processed.
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -347,7 +340,6 @@ def test_stop_via_update_from_output():

    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
        sampled_token_ids=[
            [EOS_TOKEN_ID],
            [10, 11],
@@ -395,7 +387,6 @@ def test_stop_via_update_from_output():

    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
        sampled_token_ids=[[10, 42, 12], [13, 14]],  # First request hits stop token
        logprobs=None,
        prompt_logprobs_dict={},
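The "First request hits stop token" case relies on the output being truncated at a stop token. A generic sketch of that rule, using 42 as the stop id from the hunk above; this is an illustration, not vLLM's exact implementation, and whether the stop token itself is kept depends on the sampling parameters:

    def truncate_at_stop(tokens: list[int], stop_id: int, keep_stop: bool = True) -> list[int]:
        # Cut the sampled tokens at the first occurrence of the stop id.
        if stop_id not in tokens:
            return tokens
        cut = tokens.index(stop_id)
        return tokens[: cut + 1] if keep_stop else tokens[:cut]

    assert truncate_at_stop([10, 42, 12], stop_id=42) == [10, 42]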
@@ -441,7 +432,6 @@ def test_stop_via_update_from_output():

    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
        sampled_token_ids=[[10, 11, 12], [13]],  # First request exceeds max_tokens
        logprobs=None,
        prompt_logprobs_dict={},
@@ -482,7 +472,6 @@ def test_stop_via_update_from_output():

    model_output = ModelRunnerOutput(
        req_ids=[requests[0].request_id],
        req_id_to_index={requests[0].request_id: 0},
        sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -623,7 +612,6 @@ def test_schedule_concurrent_batches(
    # Model output of the first request.
    model_runner_output = ModelRunnerOutput(
        req_ids=[requests[0].request_id],
        req_id_to_index={requests[0].request_id: 0},
        sampled_token_ids=[[0]],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -640,7 +628,6 @@ def test_schedule_concurrent_batches(
    # Model output of the second request.
    model_runner_output = ModelRunnerOutput(
        req_ids=[requests[1].request_id],
        req_id_to_index={requests[1].request_id: 0},
        sampled_token_ids=[[0]],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -677,7 +664,6 @@ def test_preempt_during_execution():
    # Get the output of the first request.
    model_runner_output0 = ModelRunnerOutput(
        req_ids=[requests[0].request_id],
        req_id_to_index={requests[0].request_id: 0},
        sampled_token_ids=[[0]],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -694,7 +680,6 @@ def test_preempt_during_execution():

    model_runner_output1 = ModelRunnerOutput(
        req_ids=[requests[1].request_id],
        req_id_to_index={requests[1].request_id: 0},
        sampled_token_ids=[[42]],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -735,11 +720,9 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
    scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens)
    requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
    req_ids = []
    req_to_index = {}
    for i, request in enumerate(requests):
        scheduler.add_request(request)
        req_ids.append(request.request_id)
        req_to_index[request.request_id] = i

    # Schedule a decode, which will also draft speculative tokens
    output = scheduler.schedule()
@@ -752,7 +735,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):

    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[0] for _ in range(len(requests))],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -795,7 +777,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):

    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=output_tokens,
        logprobs=None,
        prompt_logprobs_dict={},
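In the last hunk the test feeds output_tokens back as the verified result for the drafted spec_tokens. A hedged sketch of one common acceptance rule (longest matching prefix); this is only an illustration and not necessarily the scheduler's exact accounting for its spec-decoding stats:

    def accepted_prefix_len(draft: list[int], verified: list[int]) -> int:
        # Count draft tokens that match the verified output until the first mismatch.
        count = 0
        for d, v in zip(draft, verified):
            if d != v:
                break
            count += 1
        return count

    assert accepted_prefix_len([1, 2, 3], [1, 2, 5]) == 2   # hypothetical values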
@@ -927,15 +908,12 @@ def test_kv_connector_basic():
        block_size=BLOCK_SIZE,
    )
    req_ids = []
    req_to_index = {}
    for i, request in enumerate(requests):
        scheduler.add_request(request)
        req_ids.append(request.request_id)
        req_to_index[request.request_id] = i

    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[1000]] * len(req_ids),
        logprobs=None,
        prompt_logprobs_dict={},
@@ -977,15 +955,12 @@ def test_kv_connector_basic():
        block_size=BLOCK_SIZE,
    )
    req_ids = []
    req_to_index = {}
    for i, request in enumerate(requests):
        scheduler.add_request(request)
        req_ids.append(request.request_id)
        req_to_index[request.request_id] = i

    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[1000]] * len(req_ids),
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1052,15 +1027,12 @@ def test_kv_connector_unable_to_allocate():
        block_size=BLOCK_SIZE,
    )
    req_ids = []
    req_to_index = {}
    for i, request in enumerate(requests):
        scheduler.add_request(request)
        req_ids.append(request.request_id)
        req_to_index[request.request_id] = i

    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[1000]] * len(req_ids),
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1137,15 +1109,12 @@ def test_kv_connector_handles_preemption():
        block_size=BLOCK_SIZE,
    )
    req_ids = []
    req_to_index = {}
    for i, request in enumerate(requests):
        scheduler.add_request(request)
        req_ids.append(request.request_id)
        req_to_index[request.request_id] = i

    MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=[[1000]] * len(req_ids),
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1238,7 +1207,6 @@ def test_kv_connector_handles_preemption():
def make_output(scheduler: Scheduler):
    return ModelRunnerOutput(
        req_ids=[req.request_id for req in scheduler.running],
        req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)},
        sampled_token_ids=[[1000]] * len(scheduler.running),
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1586,9 +1554,6 @@ def test_priority_scheduling_preemption():
    # Simulate model execution to move requests to running state
    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in low_priority_requests],
        req_id_to_index={
            req.request_id: i for i, req in enumerate(low_priority_requests)
        },
        sampled_token_ids=[[100] for _ in low_priority_requests],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1655,9 +1620,6 @@ def test_priority_scheduling_no_preemption_when_space_available():
    output = scheduler.schedule()
    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in low_priority_requests],
        req_id_to_index={
            req.request_id: i for i, req in enumerate(low_priority_requests)
        },
        sampled_token_ids=[[100] for _ in low_priority_requests],
        logprobs=None,
        prompt_logprobs_dict={},
@@ -1903,7 +1865,6 @@ def test_priority_scheduling_heap_property():
        # Simulate completion to make room for next request
        model_output = ModelRunnerOutput(
            req_ids=[req.req_id],
            req_id_to_index={req.req_id: 0},
            sampled_token_ids=[[100]],
            logprobs=None,
            prompt_logprobs_dict={},
@@ -1983,7 +1944,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
    # Simulate model execution - 1st decode
    model_output = ModelRunnerOutput(
        req_ids=[request_low.request_id],
        req_id_to_index={request_low.request_id: 0},
        sampled_token_ids=[[100]],
        # spec_token_ids=None,
        logprobs=None,
@@ -2014,7 +1974,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
    requests = [request_low, request_high]
    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
        sampled_token_ids=[[100] for _ in requests],
        # spec_token_ids=None,
        logprobs=None,
@@ -2040,7 +1999,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
    # Simulate model execution - 3rd decode
    model_output = ModelRunnerOutput(
        req_ids=[req.request_id for req in requests],
        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
        sampled_token_ids=[[], [100]],
        # spec_token_ids=None,
        logprobs=None,
@@ -680,7 +680,6 @@ def test_kv_connector_stats_aggregation():
    for i, worker_stats in enumerate([worker1_stats, worker2_stats, worker3_stats]):
        output = ModelRunnerOutput(
            req_ids=[f"req_{i}"],
            req_id_to_index={f"req_{i}": 0},
            sampled_token_ids=[[123]],  # dummy token
            logprobs=None,
            prompt_logprobs_dict={},
@@ -761,7 +760,6 @@ def test_multi_kv_connector_stats_aggregation():
        stats = make_multi_stats(nixl, foo)
        output = ModelRunnerOutput(
            req_ids=[f"req_{i}"],
            req_id_to_index={f"req_{i}": 0},
            sampled_token_ids=[[123]],
            logprobs=None,
            prompt_logprobs_dict={},
@@ -215,7 +215,6 @@ def create_model_runner_output(

    # Make request data.
    req_ids = [req.request_id for req in reqs]
    req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)}

    # Make sampled tokens.
    sampled_token = EOS_TOKEN_ID if use_eos else token_id
@@ -238,7 +237,6 @@ def create_model_runner_output(
    # Make output data structure.
    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_id_to_index,
        sampled_token_ids=sampled_token_ids,
        logprobs=None,
        prompt_logprobs_dict={},
@@ -102,8 +102,6 @@ class KVConnectorOutput:
class ModelRunnerOutput:
    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]

    # num_reqs x num_generated_tokens
    # num_generated_tokens is the number of tokens
@@ -154,7 +152,6 @@ class DraftTokenIds:

EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
    req_ids=[],
    req_id_to_index={},
    sampled_token_ids=[],
    logprobs=None,
    prompt_logprobs_dict={},
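The field comments above pin down the shapes: one req_id per batched request, an index map back into that list, and one list of newly generated token ids per request in the same order. A small sketch of those invariants, with invented values:

    req_ids = ["req-a", "req-b", "req-c"]
    req_id_to_index = {rid: i for i, rid in enumerate(req_ids)}
    sampled_token_ids = [[101], [], [7, 8]]   # per request: zero or more new tokens

    assert len(sampled_token_ids) == len(req_ids)                        # num_reqs rows
    assert all(req_ids[i] == rid for rid, i in req_id_to_index.items())  # index map is consistent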
@@ -2059,7 +2059,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            # NOTE(woosuk): input_batch.req_ids may include requests that are
            # not scheduled in this step. Therefore, we truncate it here.
            req_ids=self.input_batch.req_ids[: self.input_batch.num_reqs],
            req_id_to_index=self.input_batch.req_id_to_index,
            sampled_token_ids=[],
            logprobs=None,
            prompt_logprobs_dict={},
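The NOTE(woosuk) comment explains why the reported req_ids are sliced: the persistent input batch can hold more requests than were scheduled in this step, so only the first num_reqs entries are reported. A minimal stand-alone illustration with invented names and values:

    batch_req_ids = ["r0", "r1", "r2", "r3"]   # slots kept in the persistent batch
    num_reqs = 2                               # requests actually scheduled this step
    reported_req_ids = batch_req_ids[:num_reqs]
    assert reported_req_ids == ["r0", "r1"]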
@@ -2254,7 +2253,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        list[list[int]],
        dict[str, Optional[LogprobsTensors]],
        list[str],
        dict[str, int],
        list[int],
    ]:
        num_nans_in_logits = {}
@@ -2275,7 +2273,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # not scheduled in this step. Therefore, we truncate it here.
        num_reqs = self.input_batch.num_reqs
        req_ids_output_copy = self.input_batch.req_ids[:num_reqs].copy()
        req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()

        # NOTE: GPU -> CPU Sync happens here.
        # Move as many CPU operations as possible before this sync point.
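The sync note above describes a general pattern rather than anything specific to this diff: keep pure CPU bookkeeping ahead of the first device-to-host transfer so the synchronization cost is paid only once. A generic PyTorch sketch of that ordering, with invented shapes and names:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    sampled = torch.randint(0, 32_000, (4, 1), device=device)

    # CPU-only bookkeeping first: none of this forces a device sync.
    req_ids = [f"req_{i}" for i in range(sampled.shape[0])]
    req_id_to_index = {rid: i for i, rid in enumerate(req_ids)}

    # The device -> host copy happens here; everything after it sees Python lists.
    sampled_token_ids = sampled.tolist()
    assert len(sampled_token_ids) == len(req_ids)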
@@ -2361,7 +2358,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            valid_sampled_token_ids,
            prompt_logprobs_dict,
            req_ids_output_copy,
            req_id_to_index_output_copy,
            invalid_req_indices,
        )

@@ -2631,7 +2627,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            valid_sampled_token_ids,
            prompt_logprobs_dict,
            req_ids_output_copy,
            req_id_to_index_output_copy,
            invalid_req_indices,
        ) = self._bookkeeping_sync(
            scheduler_output,
@@ -2655,7 +2650,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

        output = ModelRunnerOutput(
            req_ids=req_ids_output_copy,
            req_id_to_index=req_id_to_index_output_copy,
            sampled_token_ids=valid_sampled_token_ids,
            logprobs=logprobs_lists,
            prompt_logprobs_dict=prompt_logprobs_dict,
@@ -1266,7 +1266,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

        model_runner_output = ModelRunnerOutput(
            req_ids=req_ids,
            req_id_to_index=self.input_batch.req_id_to_index,
            sampled_token_ids=valid_sampled_token_ids,
            logprobs=logprobs_lists,
            prompt_logprobs_dict=prompt_logprobs_dict,