Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)
[RL][BugFix] Fix missing tokenizer error for token-in-token-out (#23904)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
tests/entrypoints/openai/test_token_in_token_out.py (new file, 73 lines)
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import tempfile

import pytest

from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf)
from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")


@pytest.fixture(scope="module")
def server():
    global MODEL_PATH
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Test token-in-token-out and token_ids align with prompt_logprobs
    & logprobs when return_tokens_as_token_ids is enabled.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )

        # Verify all fields are present
        assert (completion.choices[0].token_ids is not None
                and 0 < len(completion.choices[0].token_ids) <= 20)
        assert completion.choices[0].prompt_token_ids is not None

        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(
                completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            assert prompt_text == text
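The test above drives this path through the pytest harness; the following standalone sketch shows the same token-in-token-out round trip from a plain client. It assumes a server already launched along the lines of `vllm serve Qwen/Qwen3-0.6B --skip-tokenizer-init --enforce-eager` on the default port 8000, and it borrows `transformers.AutoTokenizer` for client-side encoding since the server holds no tokenizer; the endpoint, API key, and variable names are illustrative rather than part of this commit.

# Minimal client-side sketch (not part of the commit): send token IDs to a
# vLLM OpenAI-compatible server started with --skip-tokenizer-init and read
# token IDs back via the return_token_ids extension.
from openai import OpenAI
from transformers import AutoTokenizer  # client-side tokenizer; the server has none

MODEL = "Qwen/Qwen3-0.6B"
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
tokenizer = AutoTokenizer.from_pretrained(MODEL)

prompt_ids = tokenizer.encode("Hello, world! How are you today?")
completion = client.completions.create(
    model=MODEL,
    prompt=prompt_ids,          # token-in: the prompt is a list of token IDs
    max_tokens=20,
    temperature=0,
    echo=True,                  # echo the prompt, as in the test above
    extra_body={"return_token_ids": True},  # token-out: ask for raw IDs
)
choice = completion.choices[0]
print(choice.token_ids)         # generated token IDs (extra field added by vLLM)
print(choice.prompt_token_ids)  # echo of the prompt token IDs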
@@ -127,7 +127,11 @@ class OpenAIServingCompletion(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer(lora_request
+                                                                   )
 
             request_prompts, engine_prompts = await self._preprocess_completion(
                 request,
@@ -526,8 +526,8 @@ class OpenAIServing:
     async def _normalize_prompt_text_to_input(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
         prompt: str,
+        tokenizer: AnyTokenizer,
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]],
         add_special_tokens: bool,
     ) -> TextTokensPrompt:
@@ -563,12 +563,10 @@ class OpenAIServing:
     async def _normalize_prompt_tokens_to_input(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
         prompt_ids: list[int],
+        tokenizer: Optional[AnyTokenizer],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
     ) -> TextTokensPrompt:
-        async_tokenizer = self._get_async_tokenizer(tokenizer)
-
         if truncate_prompt_tokens is None:
             input_ids = prompt_ids
         elif truncate_prompt_tokens < 0:
@@ -576,7 +574,11 @@ class OpenAIServing:
         else:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
 
-        input_text = await async_tokenizer.decode(input_ids)
+        if tokenizer is None:
+            input_text = ""
+        else:
+            async_tokenizer = self._get_async_tokenizer(tokenizer)
+            input_text = await async_tokenizer.decode(input_ids)
 
         return self._validate_input(request, input_ids, input_text)
 
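The two hunks above make token-prompt normalization tolerate a missing tokenizer: `_normalize_prompt_tokens_to_input` now takes an optional tokenizer and only detokenizes when one is available. The condensed sketch below restates that control flow as a free synchronous function for readability; the name `normalize_prompt_tokens`, its simplified signature, and the omission of the negative-truncation branch are illustrative only, the actual method is the async `OpenAIServing._normalize_prompt_tokens_to_input` shown in the diff.

from typing import Any, Optional

# Condensed restatement of the new behavior (illustrative, not vLLM source):
# when the server runs with --skip-tokenizer-init there is no tokenizer, so
# the prompt text is left empty instead of being decoded from the token IDs.
def normalize_prompt_tokens(prompt_ids: list[int],
                            tokenizer: Optional[Any],
                            truncate_prompt_tokens: Optional[int] = None):
    if truncate_prompt_tokens is None:
        input_ids = prompt_ids
    else:
        input_ids = prompt_ids[-truncate_prompt_tokens:]

    if tokenizer is None:
        input_text = ""          # token-in-token-out: skip detokenization
    else:
        # any object with a .decode(list[int]) -> str method works here
        input_text = tokenizer.decode(input_ids)

    return input_ids, input_text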
@@ -681,27 +683,27 @@ class OpenAIServing:
         [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
         that assumes multiple inputs.
         """
-        for text in prompt_inputs:
-            if isinstance(text, str):
+        for prompt in prompt_inputs:
+            if isinstance(prompt, str):
                 yield await self._normalize_prompt_text_to_input(
                     request,
-                    tokenizer,
-                    prompt=text,
+                    prompt=prompt,
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                     add_special_tokens=add_special_tokens,
                 )
             else:
                 yield await self._normalize_prompt_tokens_to_input(
                     request,
-                    tokenizer,
-                    prompt_ids=text,
+                    prompt_ids=prompt,
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                 )
 
     async def _tokenize_prompt_input_or_inputs_async(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None,
@@ -740,17 +742,19 @@ class OpenAIServing:
         tasks = []
         for prompt_input in batch_inputs:
             if prompt_input["is_tokens"] is False:
+                assert tokenizer is not None, \
+                    "Tokenizer is required for text prompts"
                 task = self._normalize_prompt_text_to_input(
                     request,
-                    tokenizer,
                     prompt_input["content"],
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                     add_special_tokens=add_special_tokens)
             else:
                 task = self._normalize_prompt_tokens_to_input(
                     request,
-                    tokenizer,
                     prompt_input["content"],
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens)
             tasks.append(task)
 
@@ -766,7 +770,7 @@ class OpenAIServing:
         request: Union[DetokenizeRequest, EmbeddingCompletionRequest,
                        RerankRequest, ClassificationRequest, ScoreRequest,
                        TokenizeCompletionRequest],
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ...,
         add_special_tokens: bool = ...,
@@ -777,7 +781,7 @@ class OpenAIServing:
     async def _preprocess_completion(
         self,
         request: CompletionRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ...,
@@ -789,7 +793,7 @@ class OpenAIServing:
     async def _preprocess_completion(
         self,
         request: CompletionLikeRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None,