mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[TPU][V1][Bugfix] Fix chunked prefill with padding (#15037)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -410,6 +410,9 @@ class TPUModelRunner:

        # Do the padding and copy the tensors to the TPU.
        padded_total_num_scheduled_tokens = _get_padded_token_len(
            total_num_scheduled_tokens)
        # Zero out to avoid spurious values from prev iteration (last cp chunk)
        self.input_ids_cpu[
            total_num_scheduled_tokens:padded_total_num_scheduled_tokens] = 0
        self.input_ids = self.input_ids_cpu[:
                                            padded_total_num_scheduled_tokens].to(
                                                self.device)
Reference in New Issue
Block a user