[Misc][V0 Deprecation] Add __main__ guard to all offline examples (#1837)

### What this PR does / why we need it?
Add `__main__` guard to all offline examples.

- vLLM version: v0.9.2
- vLLM main: 76b494444f
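
For reference, every example is reshaped along the same lines, sketched below (the prompt and model name are simply taken from the existing basic example, not from any one file in this PR). Because the examples set `VLLM_WORKER_MULTIPROC_METHOD=spawn`, spawned worker processes re-import the launching script, so engine construction has to sit inside a guarded `main()` rather than at module level:

```python
import os

from vllm import LLM, SamplingParams

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


def main():
    # Engine construction and generation live inside main(), not at module level.
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")


if __name__ == "__main__":
    # The guard keeps spawned workers from re-running the script body on import.
    main()
```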

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Author: Shanshan Shen
Date: 2025-07-17 14:13:30 +08:00
Committed by: GitHub
Parent commit: 19e37cd379
Commit: aeb5aa8b88
15 changed files with 157 additions and 87 deletions

View File

@@ -12,6 +12,9 @@ import os
import time
from multiprocessing import Event, Process
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
kv_connector_extra_config = {
    "prefill_device_ips": ["1.2.3.1", "1.2.3.2"],
    "decode_device_ips": ["1.2.3.9", "1.2.3.10"],

View File

@@ -13,6 +13,9 @@ import msgpack # type: ignore
import zmq
from quart import Quart, make_response, request
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
DP_PROXY_HTTP_PORT = 10004
DP_PROXY_ZMQ_REG_PORT = 30006
DP_PROXY_ZMQ_NOTIFY_PORT = 30005

View File

@@ -8,6 +8,9 @@ import msgpack # type: ignore
import zmq
from quart import Quart, make_response, request
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
prefill_instances: dict[str, str] = {}  # http_address: zmq_address
decode_instances: dict[str, str] = {}  # http_address: zmq_address

View File

@@ -8,6 +8,9 @@ import matplotlib.pyplot as plt # type: ignore
import numpy as np
import torch
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
logger = logging.getLogger("msit_logger")

View File

@@ -60,6 +60,9 @@ from time import sleep
from vllm import LLM, SamplingParams
from vllm.utils import get_open_port
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def parse_args():
    import argparse

View File

@@ -21,6 +21,8 @@ import os
import time
from multiprocessing import Event, Process
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def clean_up():
    import gc

View File

@@ -17,28 +17,37 @@
# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
#
import os
from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B-Instruct",
-    tensor_parallel_size=2,
-    distributed_executor_backend="mp",
-    trust_remote_code=True,
-)
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B-Instruct",
+        tensor_parallel_size=2,
+        distributed_executor_backend="mp",
+        trust_remote_code=True,
+    )
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+if __name__ == "__main__":
+    main()

View File

@@ -3,6 +3,8 @@ import time
from vllm import LLM, SamplingParams
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
# enable dual-batch overlap for vllm ascend
os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

View File

@@ -19,35 +19,40 @@
import os
-os.environ["VLLM_USE_MODELSCOPE"] = "True"
import torch
from vllm import LLM
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'
-# Each query must come with a one-sentence instruction that describes the task
-task = 'Given a web search query, retrieve relevant passages that answer the query'
+def main():
+    # Each query must come with a one-sentence instruction that describes the task
+    task = 'Given a web search query, retrieve relevant passages that answer the query'
-queries = [
-    get_detailed_instruct(task, 'What is the capital of China?'),
-    get_detailed_instruct(task, 'Explain gravity')
-]
-# No need to add instruction for retrieval documents
-documents = [
-    "The capital of China is Beijing.",
-    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
-]
-input_texts = queries + documents
+    queries = [
+        get_detailed_instruct(task, 'What is the capital of China?'),
+        get_detailed_instruct(task, 'Explain gravity')
+    ]
+    # No need to add instruction for retrieval documents
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
+    ]
+    input_texts = queries + documents
-model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
+    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
-outputs = model.embed(input_texts)
-embeddings = torch.tensor([o.outputs.embedding for o in outputs])
-# Calculate the similarity scores between the first two queries and the last two documents
-scores = (embeddings[:2] @ embeddings[2:].T)
-print(scores.tolist())
-# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+    outputs = model.embed(input_texts)
+    embeddings = torch.tensor([o.outputs.embedding for o in outputs])
+    # Calculate the similarity scores between the first two queries and the last two documents
+    scores = (embeddings[:2] @ embeddings[2:].T)
+    print(scores.tolist())
+    # [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
+if __name__ == "__main__":
+    main()

View File

@@ -24,9 +24,14 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import os
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
    1: "What is recited in the audio?",

View File

@@ -21,24 +21,31 @@
import os
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
-# Generate texts from the prompts.
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+if __name__ == "__main__":
+    main()

View File

@@ -25,7 +25,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
from vllm import LLM, SamplingParams
-if __name__ == "__main__":
+def main():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",

@@ -48,3 +49,7 @@ if __name__ == "__main__":
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -25,7 +25,7 @@ from vllm.utils import GiB_bytes
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-if __name__ == "__main__":
+def main():
    prompt = "How are you?"
    free, total = torch.npu.mem_get_info()

@@ -51,3 +51,7 @@ if __name__ == "__main__":
    output2 = llm.generate(prompt, sampling_params)
    # cmp output
    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+
+if __name__ == "__main__":
+    main()

View File

@@ -17,34 +17,45 @@
# limitations under the License.
#
import os
from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-    "China is",
-]
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-# Create a sampling params object.
-sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-# Create an LLM.
-llm = LLM(
-    model="Qwen/Qwen2.5-0.5B",
-    block_size=128,
-    max_model_len=1024,  # max length of prompt
-    tensor_parallel_size=1,  # number of NPUs to be used
-    max_num_seqs=26,  # max batch number
-    enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
-    trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
-    num_scheduler_steps=8,
-    gpu_memory_utilization=0.5)
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "China is",
+    ]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    # Create an LLM.
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B",
+        block_size=128,
+        max_model_len=1024,  # max length of prompt
+        tensor_parallel_size=1,  # number of NPUs to be used
+        max_num_seqs=26,  # max batch number
+        enforce_eager=True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
+        trust_remote_code=True,  # If the model is a custom model not yet available in the HuggingFace transformers library
+        num_scheduler_steps=8,
+        gpu_memory_utilization=0.5)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+if __name__ == "__main__":
+    main()

View File

@@ -1,8 +1,13 @@
import os
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizer)
from vllm import LLM
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def init_tokenizer_and_llm(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)