[Misc][V0 Deprecation] Add __main__ guard to all offline examples (#1837)

### What this PR does / why we need it?
Add `__main__` guard to all offline examples.
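
With `VLLM_WORKER_MULTIPROC_METHOD` set to `spawn`, worker processes re-import the example script, so any unguarded module-level code would run again in every child process. The pattern applied throughout these examples looks roughly like the sketch below (a minimal illustration based on the basic example; the prompts and model name are just placeholders from that file):

```python
import os

from vllm import LLM, SamplingParams

# Env vars stay at module level so they are set before vLLM spawns its workers.
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def main():
    prompts = ["Hello, my name is", "The future of AI is"]
    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

    for output in llm.generate(prompts, sampling_params):
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")


# The guard keeps the example body from re-running when "spawn" re-imports this module.
if __name__ == "__main__":
    main()
```

Only the entry point moves behind the guard; environment configuration remains at import time so it takes effect before worker processes are created.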

- vLLM version: v0.9.2
- vLLM main: 76b494444f

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Author: Shanshan Shen
Date: 2025-07-17 14:13:30 +08:00
Committed by: GitHub
Parent: 19e37cd379
Commit: aeb5aa8b88
15 changed files with 157 additions and 87 deletions

View File

@@ -12,6 +12,9 @@ import os
 import time
 from multiprocessing import Event, Process

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 kv_connector_extra_config = {
     "prefill_device_ips": ["1.2.3.1", "1.2.3.2"],
     "decode_device_ips": ["1.2.3.9", "1.2.3.10"],

View File

@@ -13,6 +13,9 @@ import msgpack  # type: ignore
 import zmq
 from quart import Quart, make_response, request

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 DP_PROXY_HTTP_PORT = 10004
 DP_PROXY_ZMQ_REG_PORT = 30006
 DP_PROXY_ZMQ_NOTIFY_PORT = 30005

View File

@@ -8,6 +8,9 @@ import msgpack  # type: ignore
 import zmq
 from quart import Quart, make_response, request

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 prefill_instances: dict[str, str] = {}  # http_address: zmq_address
 decode_instances: dict[str, str] = {}  # http_address: zmq_address

View File

@@ -8,6 +8,9 @@ import matplotlib.pyplot as plt  # type: ignore
 import numpy as np
 import torch

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 logger = logging.getLogger("msit_logger")

View File

@@ -60,6 +60,9 @@ from time import sleep
 from vllm import LLM, SamplingParams
 from vllm.utils import get_open_port

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def parse_args():
     import argparse

View File

@@ -21,6 +21,8 @@ import os
 import time
 from multiprocessing import Event, Process

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 def clean_up():
     import gc

View File

@@ -17,28 +17,37 @@
 # Adapted from vllm-project/vllm/examples/offline_inference/basic.py
 #
+import os
+
 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B-Instruct",
-    tensor_parallel_size=2,
-    distributed_executor_backend="mp",
-    trust_remote_code=True,
-)
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B-Instruct",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        trust_remote_code=True,
+    )
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -3,6 +3,8 @@ import time

 from vllm import LLM, SamplingParams

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 # enable dual-batch overlap for vllm ascend
 os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

View File

@@ -19,35 +19,40 @@
 import os

-os.environ["VLLM_USE_MODELSCOPE"] = "True"
-
 import torch
 from vllm import LLM

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def get_detailed_instruct(task_description: str, query: str) -> str:
     return f'Instruct: {task_description}\nQuery:{query}'

-# Each query must come with a one-sentence instruction that describes the task
-task = 'Given a web search query, retrieve relevant passages that answer the query'
-
-queries = [
-    get_detailed_instruct(task, 'What is the capital of China?'),
-    get_detailed_instruct(task, 'Explain gravity')
-]
-# No need to add instruction for retrieval documents
-documents = [
-    "The capital of China is Beijing.",
-    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
-]
-input_texts = queries + documents
-
-model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
-outputs = model.embed(input_texts)
-embeddings = torch.tensor([o.outputs.embedding for o in outputs])
-
-# Calculate the similarity scores between the first two queries and the last two documents
-scores = (embeddings[:2] @ embeddings[2:].T)
-print(scores.tolist())
-# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+def main():
+    # Each query must come with a one-sentence instruction that describes the task
+    task = 'Given a web search query, retrieve relevant passages that answer the query'
+
+    queries = [
+        get_detailed_instruct(task, 'What is the capital of China?'),
+        get_detailed_instruct(task, 'Explain gravity')
+    ]
+    # No need to add instruction for retrieval documents
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
+    ]
+    input_texts = queries + documents
+
+    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
+    outputs = model.embed(input_texts)
+    embeddings = torch.tensor([o.outputs.embedding for o in outputs])
+
+    # Calculate the similarity scores between the first two queries and the last two documents
+    scores = (embeddings[:2] @ embeddings[2:].T)
+    print(scores.tolist())
+    # [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -24,9 +24,14 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+import os
+
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
 question_per_audio_count = {
     1: "What is recited in the audio?",

View File

@@ -21,24 +21,31 @@
 import os

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -25,7 +25,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 from vllm import LLM, SamplingParams

-if __name__ == "__main__":
+
+def main():
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -48,3 +49,7 @@ if __name__ == "__main__":
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -25,7 +25,7 @@ from vllm.utils import GiB_bytes
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

-if __name__ == "__main__":
+def main():
     prompt = "How are you?"

     free, total = torch.npu.mem_get_info()
@@ -51,3 +51,7 @@ if __name__ == "__main__":
     output2 = llm.generate(prompt, sampling_params)
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -17,34 +17,45 @@
 # limitations under the License.
 #
+import os
+
 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-    "China is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B",
-    block_size=128,
-    max_model_len=1024,  # max length of prompt
-    tensor_parallel_size=1,  # number of NPUs to be used
-    max_num_seqs=26,  # max batch number
-    enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
-    trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
-    num_scheduler_steps=8,
-    gpu_memory_utilization=0.5)
-
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "China is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B",
+        block_size=128,
+        max_model_len=1024,  # max length of prompt
+        tensor_parallel_size=1,  # number of NPUs to be used
+        max_num_seqs=26,  # max batch number
+        enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
+        trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
+        num_scheduler_steps=8,
+        gpu_memory_utilization=0.5)
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -1,8 +1,13 @@
+import os
+
 import torch
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizer)
 from vllm import LLM

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def init_tokenizer_and_llm(model_name: str):
     tokenizer = AutoTokenizer.from_pretrained(model_name)