Mirror of https://github.com/vllm-project/vllm-ascend.git, synced 2025-10-20 21:53:54 +08:00
[Misc][V0 Deprecation] Add __main__ guard to all offline examples (#1837)
### What this PR does / why we need it?
Add `__main__` guard to all offline examples.
- vLLM version: v0.9.2
- vLLM main: 76b494444f
---------
Signed-off-by: shen-shanshan <467638484@qq.com>
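Below is a minimal sketch of the pattern this change applies to each offline example; the model name, prompt, and sampling settings here are illustrative placeholders rather than copies of any one file in the PR.

```python
import os

from vllm import LLM, SamplingParams

# ModelScope download source and the "spawn" multiprocessing start method,
# set at module level as in the updated examples.
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def main():
    # Engine construction and generation live inside main(), not at module level.
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # illustrative model choice
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, "
              f"Generated text: {output.outputs[0].text!r}")


if __name__ == "__main__":
    # With "spawn", worker processes re-import this module; the guard keeps
    # that re-import from re-running the engine setup in every worker.
    main()
```

Under the spawn start method, Python re-imports the entry module in each worker process, so any unguarded module-level engine setup would run again in every worker; keeping it inside `main()` behind the `__main__` guard avoids that.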
@@ -12,6 +12,9 @@ import os
 import time
 from multiprocessing import Event, Process

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 kv_connector_extra_config = {
     "prefill_device_ips": ["1.2.3.1", "1.2.3.2"],
     "decode_device_ips": ["1.2.3.9", "1.2.3.10"],
@@ -13,6 +13,9 @@ import msgpack  # type: ignore
 import zmq
 from quart import Quart, make_response, request

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 DP_PROXY_HTTP_PORT = 10004
 DP_PROXY_ZMQ_REG_PORT = 30006
 DP_PROXY_ZMQ_NOTIFY_PORT = 30005
@@ -8,6 +8,9 @@ import msgpack  # type: ignore
 import zmq
 from quart import Quart, make_response, request

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 prefill_instances: dict[str, str] = {}  # http_address: zmq_address
 decode_instances: dict[str, str] = {}  # http_address: zmq_address

@@ -8,6 +8,9 @@ import matplotlib.pyplot as plt  # type: ignore
 import numpy as np
 import torch

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 logger = logging.getLogger("msit_logger")


@@ -60,6 +60,9 @@ from time import sleep
 from vllm import LLM, SamplingParams
 from vllm.utils import get_open_port

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def parse_args():
     import argparse
@@ -21,6 +21,8 @@ import os
 import time
 from multiprocessing import Event, Process

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 def clean_up():
     import gc
@@ -17,28 +17,37 @@
 # Adapted from vllm-project/vllm/examples/offline_inference/basic.py
 #

+import os
 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B-Instruct",
-    tensor_parallel_size=2,
-    distributed_executor_backend="mp",
-    trust_remote_code=True,
-)
-
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B-Instruct",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        trust_remote_code=True,
+    )
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
@@ -3,6 +3,8 @@ import time

 from vllm import LLM, SamplingParams

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 # enable dual-batch overlap for vllm ascend
 os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

@@ -19,35 +19,40 @@

 import os

-os.environ["VLLM_USE_MODELSCOPE"] = "True"
-
 import torch
 from vllm import LLM

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def get_detailed_instruct(task_description: str, query: str) -> str:
     return f'Instruct: {task_description}\nQuery:{query}'


-# Each query must come with a one-sentence instruction that describes the task
-task = 'Given a web search query, retrieve relevant passages that answer the query'
-
-queries = [
-    get_detailed_instruct(task, 'What is the capital of China?'),
-    get_detailed_instruct(task, 'Explain gravity')
-]
-# No need to add instruction for retrieval documents
-documents = [
-    "The capital of China is Beijing.",
-    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
-]
-input_texts = queries + documents
-
-model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
-
-outputs = model.embed(input_texts)
-embeddings = torch.tensor([o.outputs.embedding for o in outputs])
-# Calculate the similarity scores between the first two queries and the last two documents
-scores = (embeddings[:2] @ embeddings[2:].T)
-print(scores.tolist())
-# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+def main():
+    # Each query must come with a one-sentence instruction that describes the task
+    task = 'Given a web search query, retrieve relevant passages that answer the query'
+
+    queries = [
+        get_detailed_instruct(task, 'What is the capital of China?'),
+        get_detailed_instruct(task, 'Explain gravity')
+    ]
+    # No need to add instruction for retrieval documents
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
+    ]
+    input_texts = queries + documents
+
+    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
+
+    outputs = model.embed(input_texts)
+    embeddings = torch.tensor([o.outputs.embedding for o in outputs])
+    # Calculate the similarity scores between the first two queries and the last two documents
+    scores = (embeddings[:2] @ embeddings[2:].T)
+    print(scores.tolist())
+    # [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+
+
+if __name__ == "__main__":
+    main()
@@ -24,9 +24,14 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """

+import os
+
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
 question_per_audio_count = {
     1: "What is recited in the audio?",
@@ -21,24 +21,31 @@
 import os

 os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
-
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
@@ -25,7 +25,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 from vllm import LLM, SamplingParams

-if __name__ == "__main__":
+
+def main():
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
@@ -48,3 +49,7 @@ if __name__ == "__main__":
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
|
@ -25,7 +25,7 @@ from vllm.utils import GiB_bytes
|
|||||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def main():
|
||||||
prompt = "How are you?"
|
prompt = "How are you?"
|
||||||
|
|
||||||
free, total = torch.npu.mem_get_info()
|
free, total = torch.npu.mem_get_info()
|
||||||
@ -51,3 +51,7 @@ if __name__ == "__main__":
|
|||||||
output2 = llm.generate(prompt, sampling_params)
|
output2 = llm.generate(prompt, sampling_params)
|
||||||
# cmp output
|
# cmp output
|
||||||
assert output[0].outputs[0].text == output2[0].outputs[0].text
|
assert output[0].outputs[0].text == output2[0].outputs[0].text
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
@@ -17,34 +17,45 @@
 # limitations under the License.
 #

+import os
+
 from vllm import LLM, SamplingParams

-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-    "China is",
-]
-
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B",
-    block_size=128,
-    max_model_len=1024,  # max length of prompt
-    tensor_parallel_size=1,  # number of NPUs to be used
-    max_num_seqs=26,  # max batch number
-    enforce_eager=
-    True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
-    trust_remote_code=
-    True,  # If the model is a custom model not yet available in the HuggingFace transformers library
-    num_scheduler_steps=8,
-    gpu_memory_utilization=0.5)
-
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "China is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B",
+        block_size=128,
+        max_model_len=1024,  # max length of prompt
+        tensor_parallel_size=1,  # number of NPUs to be used
+        max_num_seqs=26,  # max batch number
+        enforce_eager=
+        True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
+        trust_remote_code=
+        True,  # If the model is a custom model not yet available in the HuggingFace transformers library
+        num_scheduler_steps=8,
+        gpu_memory_utilization=0.5)
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,8 +1,13 @@
+import os
+
 import torch
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizer)
 from vllm import LLM

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+

 def init_tokenizer_and_llm(model_name: str):
     tokenizer = AutoTokenizer.from_pretrained(model_name)