diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py
new file mode 100644
index 0000000000..ed003939c4
--- /dev/null
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import tempfile
+
+import pytest
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf)
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
+
+
+@pytest.fixture(scope="module")
+def server():
+    global MODEL_PATH
+    MODEL_PATH = download_weights_from_hf(
+        MODEL_NAME,
+        allow_patterns=["*"],
+        cache_dir=MODEL_PATH,
+        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
+    args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--skip-tokenizer-init",
+        "--load-format",
+        "dummy",
+    ]
+    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+async def test_token_in_token_out_and_logprobs(server):
+    """
+    Test token-in-token-out: prompt_token_ids and token_ids are
+    returned when return_token_ids is enabled.
+    """
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    text = "Hello, world! How are you today?"
+    token_ids = tokenizer.encode(text)
+    async with server.get_async_client() as client:
+        # Send token IDs as the prompt with return_token_ids enabled
+        completion = await client.completions.create(
+            model=MODEL_PATH,
+            prompt=token_ids,
+            max_tokens=20,
+            temperature=0,
+            echo=True,
+            extra_body={
+                "return_token_ids": True,
+            },
+        )
+
+        # Verify all fields are present
+        assert (completion.choices[0].token_ids is not None
+                and 0 < len(completion.choices[0].token_ids) <= 20)
+        assert completion.choices[0].prompt_token_ids is not None
+
+        # Decode prompt tokens
+        if completion.choices[0].prompt_token_ids:
+            prompt_text = tokenizer.decode(
+                completion.choices[0].prompt_token_ids)
+            # The decoded prompt should match the original prompt
+            assert prompt_text == text
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index b81fd63ece..f461d7609b 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -127,7 +127,11 @@ class OpenAIServingCompletion(OpenAIServing):
         try:
             lora_request = self._maybe_get_adapters(request)
 
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer(lora_request
+                                                                   )
 
             request_prompts, engine_prompts = await self._preprocess_completion(
                 request,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index a97935e109..ca6f398793 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -526,8 +526,8 @@ class OpenAIServing:
     async def _normalize_prompt_text_to_input(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
         prompt: str,
+        tokenizer: AnyTokenizer,
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]],
         add_special_tokens: bool,
     ) -> TextTokensPrompt:
@@ -563,12 +563,10 @@ class OpenAIServing:
     async def _normalize_prompt_tokens_to_input(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
         prompt_ids: list[int],
+        tokenizer: Optional[AnyTokenizer],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
     ) -> TextTokensPrompt:
-        async_tokenizer = self._get_async_tokenizer(tokenizer)
-
         if truncate_prompt_tokens is None:
             input_ids = prompt_ids
         elif truncate_prompt_tokens < 0:
@@ -576,7 +574,11 @@ class OpenAIServing:
         else:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
 
-        input_text = await async_tokenizer.decode(input_ids)
+        if tokenizer is None:
+            input_text = ""
+        else:
+            async_tokenizer = self._get_async_tokenizer(tokenizer)
+            input_text = await async_tokenizer.decode(input_ids)
 
         return self._validate_input(request, input_ids, input_text)
 
@@ -681,27 +683,27 @@ class OpenAIServing:
         [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
         that assumes multiple inputs.
         """
-        for text in prompt_inputs:
-            if isinstance(text, str):
+        for prompt in prompt_inputs:
+            if isinstance(prompt, str):
                 yield await self._normalize_prompt_text_to_input(
                     request,
-                    tokenizer,
-                    prompt=text,
+                    prompt=prompt,
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                     add_special_tokens=add_special_tokens,
                 )
             else:
                 yield await self._normalize_prompt_tokens_to_input(
                     request,
-                    tokenizer,
-                    prompt_ids=text,
+                    prompt_ids=prompt,
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                 )
 
     async def _tokenize_prompt_input_or_inputs_async(
         self,
         request: AnyRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None,
@@ -740,17 +742,19 @@ class OpenAIServing:
         tasks = []
         for prompt_input in batch_inputs:
             if prompt_input["is_tokens"] is False:
+                assert tokenizer is not None, \
+                    "Tokenizer is required for text prompts"
                 task = self._normalize_prompt_text_to_input(
                     request,
-                    tokenizer,
                     prompt_input["content"],
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                     add_special_tokens=add_special_tokens)
             else:
                 task = self._normalize_prompt_tokens_to_input(
                     request,
-                    tokenizer,
                     prompt_input["content"],
+                    tokenizer=tokenizer,
                     truncate_prompt_tokens=truncate_prompt_tokens)
             tasks.append(task)
 
@@ -766,7 +770,7 @@ class OpenAIServing:
         request: Union[DetokenizeRequest, EmbeddingCompletionRequest,
                        RerankRequest, ClassificationRequest, ScoreRequest,
                        TokenizeCompletionRequest],
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ...,
         add_special_tokens: bool = ...,
@@ -777,7 +781,7 @@ class OpenAIServing:
     async def _preprocess_completion(
         self,
         request: CompletionRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
        input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = ...,
@@ -789,7 +793,7 @@ class OpenAIServing:
    async def _preprocess_completion(
         self,
         request: CompletionLikeRequest,
-        tokenizer: AnyTokenizer,
+        tokenizer: Optional[AnyTokenizer],
         input_or_inputs: Optional[Union[str, list[str], list[int],
                                         list[list[int]]]],
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None,
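
For reviewers who want to try the change outside the test harness, here is a minimal client-side sketch of the token-in-token-out flow this PR enables. It is not part of the diff: the server address, API key, and served model name are placeholders, and it assumes a vLLM server already launched with --skip-tokenizer-init. The client tokenizes locally, sends the prompt as token IDs, and asks for raw IDs back via the return_token_ids extra field introduced here, then detokenizes the result itself.

# Hypothetical usage sketch (not part of this PR): token-in-token-out against
# a vLLM OpenAI-compatible server started with --skip-tokenizer-init.
import asyncio

import openai
from transformers import AutoTokenizer

BASE_URL = "http://localhost:8000/v1"  # assumed server address
MODEL = "Qwen/Qwen3-0.6B"              # assumed served model name


async def main() -> None:
    # The server skips tokenizer initialization, so tokenize on the client.
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    prompt_ids = tokenizer.encode("Hello, world! How are you today?")

    client = openai.AsyncOpenAI(base_url=BASE_URL, api_key="EMPTY")
    completion = await client.completions.create(
        model=MODEL,
        prompt=prompt_ids,  # token-in: the prompt is a list of token IDs
        max_tokens=20,
        temperature=0,
        echo=True,
        extra_body={"return_token_ids": True},  # token-out: return raw IDs
    )

    choice = completion.choices[0]
    # With return_token_ids enabled, the response carries prompt_token_ids and
    # token_ids (as exercised by the new test), so the client can detokenize.
    print(tokenizer.decode(choice.prompt_token_ids))
    print(tokenizer.decode(choice.token_ids))


if __name__ == "__main__":
    asyncio.run(main())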