Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 23:03:52 +08:00
[Doc] Unify structured outputs examples (#18196)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
@@ -142,51 +142,6 @@ for chunk in stream:

Remember to check whether the `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).

## Structured output

The reasoning content is also available in structured output. A structured output engine such as `xgrammar` will use the reasoning content to generate the structured output. This is currently only supported in the v0 engine.

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
```

The following is an example client:

```python
from openai import OpenAI
from pydantic import BaseModel

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

class People(BaseModel):
    name: str
    age: int

json_schema = People.model_json_schema()

prompt = "Generate a JSON with the name and age of one random person."
completion = client.chat.completions.create(
    model=model,
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_json": json_schema},
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```

## Tool Calling

The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
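For illustration, here is a minimal client-side sketch of that combination. It assumes the server was launched with both a reasoning parser and tool calling enabled (for example `--reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes`; the right parser names depend on the model), and the `get_weather` tool definition is purely hypothetical:

```python
# Sketch only: assumes a vLLM server started with a reasoning parser and
# tool calling enabled; the get_weather tool below is a made-up example.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = client.models.list().data[0].id

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "What is the weather in Boston?"}],
    tools=tools,
    tool_choice="auto",
)

message = completion.choices[0].message
# The reasoning trace and the parsed tool calls arrive in separate fields:
# functions are only parsed out of `content`, never out of `reasoning_content`.
print("reasoning_content:", message.reasoning_content)
print("tool_calls:", message.tool_calls)
```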
@@ -39,9 +39,10 @@ client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="-",
)
+model = client.models.list().data[0].id

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
@@ -54,7 +55,7 @@ The next example shows how to use the `guided_regex`. The idea is to generate an

```python
completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
@@ -92,26 +93,32 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        }
    ],
-    extra_body={"guided_json": json_schema},
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "car-description",
+            "schema": CarDescription.model_json_schema()
+        },
+    },
)
print(completion.choices[0].message.content)
```

!!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the
    JSON schema and how the fields should be populated. This can improve the
    results notably in most cases.

Finally we have the `guided_grammar` option, which is probably the most
difficult to use, but it's really powerful. It allows us to define complete
languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:

```python
@@ -130,7 +137,7 @@ simplified_sql_grammar = """
"""

completion = client.chat.completions.create(
-    model="Qwen/Qwen2.5-3B-Instruct",
+    model=model,
    messages=[
        {
            "role": "user",
@@ -142,7 +149,48 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```

-Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
+See also: [full example](../../examples/online_serving/structured_outputs)

## Reasoning Outputs

You can also use structured outputs with <project:#reasoning-outputs> for reasoning models.

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
```

Note that you can use reasoning with any provided structured outputs feature. The following example uses JSON schema:

```python
from pydantic import BaseModel


class People(BaseModel):
    name: str
    age: int


completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the name and age of one random person.",
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "people",
            "schema": People.model_json_schema()
        }
    },
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```

See also: [full example](../../examples/online_serving/structured_outputs)

## Experimental Automatic Parsing (OpenAI API)
@@ -163,14 +211,14 @@ class Info(BaseModel):
    age: int

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
    ],
    response_format=Info,
-    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -203,15 +251,13 @@ class MathResponse(BaseModel):
    steps: list[Step]
    final_answer: str

-client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
completion = client.beta.chat.completions.parse(
-    model="meta-llama/Llama-3.1-8B-Instruct",
+    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful expert math tutor."},
        {"role": "user", "content": "Solve 8x + 31 = 2."},
    ],
    response_format=MathResponse,
-    extra_body=dict(guided_decoding_backend="outlines"),
)

message = completion.choices[0].message
@@ -232,11 +278,11 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8
```

-An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
+An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs>

## Offline Inference

-Offline inference allows for the same types of guided decoding.
+Offline inference allows for the same types of structured outputs.
To use it, we'll need to configure guided decoding using the `GuidedDecodingParams` class inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are:
@@ -247,7 +293,7 @@ The main available options inside `GuidedDecodingParams` are:
- `structural_tag`

These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:

```python
@@ -265,4 +311,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

-Full example: <gh-file:examples/offline_inference/structured_outputs.py>
+See also: [full example](../../examples/online_serving/structured_outputs)
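The offline `choice` example is mostly elided by the hunk above; for reference, a minimal self-contained sketch of that pattern, based on the `GuidedDecodingParams` options listed above, looks roughly like this (the model name is only a placeholder):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Placeholder model; any model served by vLLM works here.
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)

# Constrain the generated text to one of the listed choices.
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)

outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```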
@@ -1,175 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
To run this example, you need to start the vLLM server:

```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```
"""

from enum import Enum

from openai import BadRequestError, OpenAI
from pydantic import BaseModel

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


# Guided decoding by Choice (list of possible options)
def guided_choice_completion(client: OpenAI, model: str):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
        ],
        extra_body={"guided_choice": ["positive", "negative"]},
    )
    return completion.choices[0].message.content


# Guided decoding by Regex
def guided_regex_completion(client: OpenAI, model: str):
    prompt = (
        "Generate an email address for Alan Turing, who works in Enigma."
        "End in .com and new line. Example result:"
        "alan.turing@enigma.com\n"
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
    )
    return completion.choices[0].message.content


# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


def guided_json_completion(client: OpenAI, model: str):
    json_schema = CarDescription.model_json_schema()

    prompt = (
        "Generate a JSON with the brand, model and car_type of"
        "the most iconic car from the 90's"
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_json": json_schema},
    )
    return completion.choices[0].message.content


# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    prompt = (
        "Generate an SQL query to show the 'username' and 'email'"
        "from the 'users' table."
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    return completion.choices[0].message.content


# Extra backend options
def extra_backend_options_completion(client: OpenAI, model: str):
    prompt = (
        "Generate an email address for Alan Turing, who works in Enigma."
        "End in .com and new line. Example result:"
        "alan.turing@enigma.com\n"
    )

    try:
        # The guided_decoding_disable_fallback option forces vLLM to use
        # xgrammar, so when it fails you get a 400 with the reason why
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            extra_body={
                "guided_regex": r"\w+@\w+\.com\n",
                "stop": ["\n"],
                "guided_decoding_disable_fallback": True,
            },
        )
        return completion.choices[0].message.content
    except BadRequestError as e:
        print("This error is expected:", e)


def main():
    client: OpenAI = OpenAI(
        base_url=openai_api_base,
        api_key=openai_api_key,
    )

    model = client.models.list().data[0].id

    print("Guided Choice Completion:")
    print(guided_choice_completion(client, model))

    print("\nGuided Regex Completion:")
    print(guided_regex_completion(client, model))

    print("\nGuided JSON Completion:")
    print(guided_json_completion(client, model))

    print("\nGuided Grammar Completion:")
    print(guided_grammar_completion(client, model))

    print("\nExtra Backend Options Completion:")
    print(extra_backend_options_completion(client, model))


if __name__ == "__main__":
    main()
@@ -1,87 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import OpenAI

# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        base_url=openai_api_base,
        api_key=openai_api_key,
    )

    messages = [
        {
            "role": "user",
            "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
""",
        }
    ]

    response = client.chat.completions.create(
        model=client.models.list().data[0].id,
        messages=messages,
        response_format={
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    )
    print(response)


if __name__ == "__main__":
    main()
@@ -1,167 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
An example shows how to generate structured outputs from reasoning models
like DeepSeekR1. The thinking process will not be guided by the JSON
schema provided by the user. Only the final output will be structured.

To run this example, you need to start the vLLM server with the reasoning
parser:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --reasoning-parser deepseek_r1
```

This example demonstrates how to generate chat completions from reasoning models
using the OpenAI Python client library.
"""

from enum import Enum

from openai import OpenAI
from pydantic import BaseModel

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def print_completion_details(completion):
    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
    print("content: ", completion.choices[0].message.content)


# Guided decoding by Regex
def guided_regex_completion(client: OpenAI, model: str):
    prompt = "What is the capital of France?"

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={
            "guided_regex": "(Paris|London)",
        },
    )
    print_completion_details(completion)


class People(BaseModel):
    name: str
    age: int


def guided_json_completion(client: OpenAI, model: str):
    json_schema = People.model_json_schema()

    prompt = "Generate a JSON with the name and age of one random person."
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_json": json_schema},
    )
    print_completion_details(completion)


# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


def guided_car_json_completion(client: OpenAI, model: str):
    json_schema = CarDescription.model_json_schema()

    prompt = (
        "Generate a JSON with the brand, model and car_type of"
        "the most iconic car from the 90's"
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_json": json_schema},
    )
    print_completion_details(completion)


# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    # This may be very slow https://github.com/vllm-project/vllm/issues/12122
    prompt = (
        "Generate an SQL query to show the 'username' and 'email'"
        "from the 'users' table."
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    print_completion_details(completion)


def main():
    client: OpenAI = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model: str = models.data[0].id

    print("Guided Regex Completion:")
    guided_regex_completion(client, model)

    print("\nGuided JSON Completion (People):")
    guided_json_completion(client, model)

    print("\nGuided JSON Completion (CarDescription):")
    guided_car_json_completion(client, model)

    print("\nGuided Grammar Completion:")
    guided_grammar_completion(client, model)


if __name__ == "__main__":
    main()
examples/online_serving/structured_outputs/README.md (new file, 54 lines)
@@ -0,0 +1,54 @@
# Structured Outputs

This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server.
It can run an individual constraint type or all of them.
It supports both streaming responses and concurrent non-streaming requests.

To use this example, you must start a vLLM server with any model of your choice.

```bash
vllm serve Qwen/Qwen2.5-3B-Instruct
```

To serve a reasoning model, you can use the following command:

```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1
```

If you want to run this script standalone with `uv`, you can use the following:

```bash
uvx --from git+https://github.com/vllm-project/vllm#subdirectory=examples/online_serving/structured_outputs structured-outputs
```

See [feature docs](../../../features/structured_outputs.md) for more information.

!!! tip
    If vLLM is running remotely, then set `OPENAI_BASE_URL=<remote_url>` before running the script.
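    For example, with a hypothetical remote host (the script reads `OPENAI_BASE_URL` via `os.getenv`):

    ```bash
    OPENAI_BASE_URL=http://remote-host:8000/v1 uv run structured_outputs.py
    ```
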
## Usage

Run all constraints, non-streaming:

```bash
uv run structured_outputs.py
```

Run all constraints, streaming:

```bash
uv run structured_outputs.py --stream
```

Run certain constraints, for example `structural_tag` and `regex`, streaming:

```bash
uv run structured_outputs.py --constraint structural_tag regex --stream
```

Run all constraints, with reasoning models and streaming:

```bash
uv run structured_outputs.py --reasoning --stream
```
@@ -0,0 +1,8 @@
[project]
name = "examples-online-structured-outputs"
requires-python = ">=3.9, <3.13"
dependencies = ["openai==1.78.1", "pydantic==2.11.4"]
version = "0.0.0"

[project.scripts]
structured-outputs = "structured_outputs:main"
examples/online_serving/structured_outputs/structured_outputs.py (new file, 272 lines)

@@ -0,0 +1,272 @@
# ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import argparse
import asyncio
import enum
import os
from typing import TYPE_CHECKING, Any, Literal

import openai
import pydantic

if TYPE_CHECKING:
    from openai.types.chat import ChatCompletionChunk


ConstraintsFormat = Literal[
    "choice",
    "regex",
    "json",
    "grammar",
    "structural_tag",
]


async def print_stream_response(
    stream_response: openai.AsyncStream[ChatCompletionChunk],
    title: str,
    args: argparse.Namespace,
):
    print(f"\n\n{title} (Streaming):")

    local_reasoning_header_printed = False
    local_content_header_printed = False

    async for chunk in stream_response:
        delta = chunk.choices[0].delta

        reasoning_chunk_text: str | None = getattr(delta, "reasoning_content", None)
        content_chunk_text = delta.content

        if args.reasoning:
            if reasoning_chunk_text:
                if not local_reasoning_header_printed:
                    print(" Reasoning: ", end="")
                    local_reasoning_header_printed = True
                print(reasoning_chunk_text, end="", flush=True)

            if content_chunk_text:
                if not local_content_header_printed:
                    if local_reasoning_header_printed:
                        print()
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
        else:
            if content_chunk_text:
                if not local_content_header_printed:
                    print(" Content: ", end="")
                    local_content_header_printed = True
                print(content_chunk_text, end="", flush=True)
    print()


class CarType(str, enum.Enum):
    SEDAN = "SEDAN"
    SUV = "SUV"
    TRUCK = "TRUCK"
    COUPE = "COUPE"


class CarDescription(pydantic.BaseModel):
    brand: str
    model: str
    car_type: CarType


PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
    "choice": {
        "messages": [
            {
                "role": "user",
                "content": "Classify this sentiment: vLLM is wonderful!",
            }
        ],
        "extra_body": {"guided_choice": ["positive", "negative"]},
    },
    "regex": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: 'alan.turing@enigma.com\n'",
            }
        ],
        "extra_body": {
            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
        },
    },
    "json": {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "car-description",
                "schema": CarDescription.model_json_schema(),
            },
        },
    },
    "grammar": {
        "messages": [
            {
                "role": "user",
                "content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
            }
        ],
        "extra_body": {
            "guided_grammar": """
root ::= select_statement

select_statement ::= "SELECT " column " from " table " where " condition

column ::= "col_1 " | "col_2 "

table ::= "table_1 " | "table_2 "

condition ::= column "= " number

number ::= "1 " | "2 "
""",
        },
    },
    "structural_tag": {
        "messages": [
            {
                "role": "user",
                "content": """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If a you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?""",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_weather>",
                    "schema": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    },
}


async def cli():
    parser = argparse.ArgumentParser(
        description="Run OpenAI Chat Completion with various structured outputs capabilities",
    )
    _ = parser.add_argument(
        "--constraint",
        type=str,
        nargs="+",
        choices=[*list(PARAMS), "*"],
        default=["*"],
        help="Specify which constraint(s) to run.",
    )
    _ = parser.add_argument(
        "--stream",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable streaming output",
    )
    _ = parser.add_argument(
        "--reasoning",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Enable printing of reasoning traces if available.",
    )
    args = parser.parse_args()

    base_url = os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v1")
    client = openai.AsyncOpenAI(base_url=base_url, api_key="EMPTY")
    constraints = list(PARAMS) if "*" in args.constraint else list(set(args.constraint))
    model = (await client.models.list()).data[0].id

    if args.stream:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=True,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, stream in zip(constraints, results):
            await print_stream_response(stream, constraint, args)
    else:
        results = await asyncio.gather(
            *[
                client.chat.completions.create(
                    model=model,
                    max_tokens=1024,
                    stream=False,
                    **PARAMS[name],
                )
                for name in constraints
            ]
        )
        for constraint, response in zip(constraints, results):
            print(f"\n\n{constraint}:")
            message = response.choices[0].message
            if args.reasoning and hasattr(message, "reasoning_content"):
                print(f" Reasoning: {message.reasoning_content or ''}")
            print(f" Content: {message.content!r}")


def main():
    asyncio.run(cli())


if __name__ == "__main__":
    main()