diff --git a/examples/disaggregated_prefill/disaggregated_prefill_offline.py b/examples/disaggregated_prefill/disaggregated_prefill_offline.py index d7dd4b88b..ea131034b 100644 --- a/examples/disaggregated_prefill/disaggregated_prefill_offline.py +++ b/examples/disaggregated_prefill/disaggregated_prefill_offline.py @@ -12,6 +12,9 @@ import os import time from multiprocessing import Event, Process +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + kv_connector_extra_config = { "prefill_device_ips": ["1.2.3.1", "1.2.3.2"], "decode_device_ips": ["1.2.3.9", "1.2.3.10"], diff --git a/examples/disaggregated_prefill/dp_proxy.py b/examples/disaggregated_prefill/dp_proxy.py index 42bf12039..415e98134 100644 --- a/examples/disaggregated_prefill/dp_proxy.py +++ b/examples/disaggregated_prefill/dp_proxy.py @@ -13,6 +13,9 @@ import msgpack # type: ignore import zmq from quart import Quart, make_response, request +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + DP_PROXY_HTTP_PORT = 10004 DP_PROXY_ZMQ_REG_PORT = 30006 DP_PROXY_ZMQ_NOTIFY_PORT = 30005 diff --git a/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py b/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py index 03192561e..5baa355a0 100644 --- a/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py +++ b/examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py @@ -8,6 +8,9 @@ import msgpack # type: ignore import zmq from quart import Quart, make_response, request +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + prefill_instances: dict[str, str] = {} # http_address: zmq_address decode_instances: dict[str, str] = {} # http_address: zmq_address diff --git a/examples/eplb/eplb_strategy.py b/examples/eplb/eplb_strategy.py index 9470b952b..bcccbf23c 100644 --- a/examples/eplb/eplb_strategy.py +++ b/examples/eplb/eplb_strategy.py @@ -8,6 +8,9 @@ import matplotlib.pyplot as plt # type: ignore import numpy as np import torch +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + logger = logging.getLogger("msit_logger") diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py index 754dfbc7c..024ef9805 100644 --- a/examples/offline_data_parallel.py +++ b/examples/offline_data_parallel.py @@ -60,6 +60,9 @@ from time import sleep from vllm import LLM, SamplingParams from vllm.utils import get_open_port +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + def parse_args(): import argparse diff --git a/examples/offline_disaggregated_prefill_npu.py b/examples/offline_disaggregated_prefill_npu.py index 9cea63ace..84fa3fe65 100644 --- a/examples/offline_disaggregated_prefill_npu.py +++ b/examples/offline_disaggregated_prefill_npu.py @@ -21,6 +21,8 @@ import os import time from multiprocessing import Event, Process +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" def clean_up(): import gc diff --git a/examples/offline_distributed_inference_npu.py b/examples/offline_distributed_inference_npu.py index 1af962b3b..4e2e7ed37 100644 --- a/examples/offline_distributed_inference_npu.py +++ b/examples/offline_distributed_inference_npu.py @@ -17,28 +17,37 @@ # Adapted from vllm-project/vllm/examples/offline_inference/basic.py # +import os from vllm import LLM, SamplingParams -prompts = [ - "Hello, my name is", 
- "The president of the United States is", - "The capital of France is", - "The future of AI is", -] +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -# Create a sampling params object. -sampling_params = SamplingParams(max_tokens=100, temperature=0.0) -# Create an LLM. -llm = LLM( - model="Qwen/Qwen2.5-0.5B-Instruct", - tensor_parallel_size=2, - distributed_executor_backend="mp", - trust_remote_code=True, -) +def main(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] -# Generate texts from the prompts. -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + # Create a sampling params object. + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + # Create an LLM. + llm = LLM( + model="Qwen/Qwen2.5-0.5B-Instruct", + tensor_parallel_size=2, + distributed_executor_backend="mp", + trust_remote_code=True, + ) + + # Generate texts from the prompts. + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py index e721ab2aa..2cc52137c 100644 --- a/examples/offline_dualbatch_overlap_npu.py +++ b/examples/offline_dualbatch_overlap_npu.py @@ -3,6 +3,8 @@ import time from vllm import LLM, SamplingParams +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" # enable dual-batch overlap for vllm ascend os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1" diff --git a/examples/offline_embed.py b/examples/offline_embed.py index 91fba3870..7707e5fb2 100644 --- a/examples/offline_embed.py +++ b/examples/offline_embed.py @@ -19,35 +19,40 @@ import os -os.environ["VLLM_USE_MODELSCOPE"] = "True" - import torch from vllm import LLM +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" def get_detailed_instruct(task_description: str, query: str) -> str: return f'Instruct: {task_description}\nQuery:{query}' -# Each query must come with a one-sentence instruction that describes the task -task = 'Given a web search query, retrieve relevant passages that answer the query' +def main(): + # Each query must come with a one-sentence instruction that describes the task + task = 'Given a web search query, retrieve relevant passages that answer the query' -queries = [ - get_detailed_instruct(task, 'What is the capital of China?'), - get_detailed_instruct(task, 'Explain gravity') -] -# No need to add instruction for retrieval documents -documents = [ - "The capital of China is Beijing.", - "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun." -] -input_texts = queries + documents + queries = [ + get_detailed_instruct(task, 'What is the capital of China?'), + get_detailed_instruct(task, 'Explain gravity') + ] + # No need to add instruction for retrieval documents + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun." + ] + input_texts = queries + documents -model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed") + model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed") -outputs = model.embed(input_texts) -embeddings = torch.tensor([o.outputs.embedding for o in outputs]) -# Calculate the similarity scores between the first two queries and the last two documents -scores = (embeddings[:2] @ embeddings[2:].T) -print(scores.tolist()) -# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]] + outputs = model.embed(input_texts) + embeddings = torch.tensor([o.outputs.embedding for o in outputs]) + # Calculate the similarity scores between the first two queries and the last two documents + scores = (embeddings[:2] @ embeddings[2:].T) + print(scores.tolist()) + # [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]] + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 7392283a1..03bb1cb52 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -24,9 +24,14 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ +import os + from vllm import LLM, SamplingParams from vllm.assets.audio import AudioAsset +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] question_per_audio_count = { 1: "What is recited in the audio?", diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py index 3e88c0017..4630bd1dd 100644 --- a/examples/offline_inference_npu.py +++ b/examples/offline_inference_npu.py @@ -21,24 +21,31 @@ import os os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" from vllm import LLM, SamplingParams -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(max_tokens=100, temperature=0.0) -# Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") +def main(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] -# Generate texts from the prompts. -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + # Create a sampling params object. + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + # Create an LLM. + llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") + + # Generate texts from the prompts. 
+ outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference_npu_tp2.py b/examples/offline_inference_npu_tp2.py index 9f01c3af5..05082b004 100644 --- a/examples/offline_inference_npu_tp2.py +++ b/examples/offline_inference_npu_tp2.py @@ -25,7 +25,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" from vllm import LLM, SamplingParams -if __name__ == "__main__": + +def main(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -48,3 +49,7 @@ if __name__ == "__main__": prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py index 7b7d42268..5ffcff6fb 100644 --- a/examples/offline_inference_sleep_mode_npu.py +++ b/examples/offline_inference_sleep_mode_npu.py @@ -25,7 +25,7 @@ from vllm.utils import GiB_bytes os.environ["VLLM_USE_MODELSCOPE"] = "True" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -if __name__ == "__main__": +def main(): prompt = "How are you?" free, total = torch.npu.mem_get_info() @@ -51,3 +51,7 @@ if __name__ == "__main__": output2 = llm.generate(prompt, sampling_params) # cmp output assert output[0].outputs[0].text == output2[0].outputs[0].text + + +if __name__ == "__main__": + main() diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py index 59c7fafcc..8aa6af4bf 100644 --- a/examples/offline_multi_step_custom_ops.py +++ b/examples/offline_multi_step_custom_ops.py @@ -17,34 +17,45 @@ # limitations under the License. # +import os + from vllm import LLM, SamplingParams -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - "China is", -] +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -# Create a sampling params object. -sampling_params = SamplingParams(max_tokens=100, temperature=0.0) -# Create an LLM. -llm = LLM( - model="Qwen/Qwen2.5-0.5B", - block_size=128, - max_model_len=1024, # max length of prompt - tensor_parallel_size=1, # number of NPUs to be used - max_num_seqs=26, # max batch number - enforce_eager= - True, # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations) - trust_remote_code= - True, # If the model is a cuscd tom model not yet available in the HuggingFace transformers library - num_scheduler_steps=8, - gpu_memory_utilization=0.5) -outputs = llm.generate(prompts, sampling_params) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +def main(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + "China is", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + # Create an LLM. 
+    llm = LLM(
+        model="Qwen/Qwen2.5-0.5B",
+        block_size=128,
+        max_model_len=1024,  # max length of prompt
+        tensor_parallel_size=1,  # number of NPUs to be used
+        max_num_seqs=26,  # max batch number
+        enforce_eager=
+        True,  # Force PyTorch eager execution to debug intermediate tensors (disables graph optimizations)
+        trust_remote_code=
+        True,  # If the model is a custom model not yet available in the HuggingFace transformers library
+        num_scheduler_steps=8,
+        gpu_memory_utilization=0.5)
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/prompt_embedding_inference.py b/examples/prompt_embedding_inference.py
index e375a8b4f..c95323874 100644
--- a/examples/prompt_embedding_inference.py
+++ b/examples/prompt_embedding_inference.py
@@ -1,8 +1,13 @@
+import os
+
 import torch
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizer)
 
 from vllm import LLM
 
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
 def init_tokenizer_and_llm(model_name: str):
     tokenizer = AutoTokenizer.from_pretrained(model_name)