# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
    python examples/offline_inference/prompt_embed_inference.py
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
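    """Load the HF tokenizer, the model's input-embedding layer, and a vLLM engine."""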
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
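    # enable_prompt_embeds=True lets llm.generate() accept "prompt_embeds"
    # inputs instead of text or token IDs.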
    llm = LLM(model=model_name, enable_prompt_embeds=True)
    return tokenizer, embedding_layer, llm


def get_prompt_embeds(
    chat: list[dict[str, str]],
    tokenizer: PreTrainedTokenizer,
    embedding_layer: torch.nn.Module,
):
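    """Render the chat with the model's chat template and embed the resulting tokens."""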
    token_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    )
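    # token_ids has shape (1, seq_len); embedding it gives (1, seq_len, hidden_size).
    # squeeze(0) drops the batch dimension, leaving the 2D (seq_len, hidden_size)
    # tensor used as the per-prompt embeddings below.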
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


def single_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
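    """Generate a completion for a single prompt passed as embeddings."""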
    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

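    # Pass the embeddings under the "prompt_embeds" key instead of a text prompt.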
    outputs = llm.generate(
        {
            "prompt_embeds": prompt_embeds,
        }
    )

    print("\n[Single Inference Output]")
    print("-" * 30)
    for o in outputs:
        print(o.outputs[0].text)
    print("-" * 30)


def batch_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
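    """Generate completions for several prompts at once, each passed as embeddings."""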
    chats = [
        [{"role": "user", "content": "Please tell me about the capital of France."}],
        [{"role": "user", "content": "When is the day longest during the year?"}],
        [{"role": "user", "content": "Which is bigger, the moon or the sun?"}],
    ]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

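    # One request dict per prompt; vLLM batches and schedules them internally.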
    outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])

    print("\n[Batch Inference Outputs]")
    print("-" * 30)
    for i, o in enumerate(outputs):
        print(f"Q{i + 1}: {chats[i][0]['content']}")
        print(f"A{i + 1}: {o.outputs[0].text}\n")
    print("-" * 30)


def main():
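    # Build the tokenizer, embedding layer, and engine once, then run both demos.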
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
    single_prompt_inference(llm, tokenizer, embedding_layer)
    batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
    main()