add distributed examples (#2672)

* add distributed examples * typo * uncomment * require multigpu * add stable diffusion example * style * add copyright * style * remove tqdm * Apply suggestions from code review Co-authored-by: Zach Mueller <muellerzr@gmail.com> * add comments * remove print * More comments --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>
2025-10-20 18:13:46 +08:00 · 2024-04-25 11:13:56 +02:00
parent e831bcb3b1
commit 83317b3081
11 changed files with 157 additions and 4 deletions
--- a/docs/source/usage_guides/distributed_inference.md
+++ b/docs/source/usage_guides/distributed_inference.md
@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
 On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
 Make sure to drop the final sample, as it will be a duplicate of the previous one.

+You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed), such as how to use this approach with LLMs.
+
 ## Memory-efficient pipeline parallelism (experimental)

 This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution. 
@ -232,4 +234,4 @@ if PartialState().is_last_process:
    
 </Tip>

-And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work to improving this integration. 
+And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work to improve this integration. 
--- a/examples/inference/distributed/README.md
+++ b/examples/inference/distributed/README.md
@ -0,0 +1,25 @@
+# Distributed inference examples
+
+This folder contains a variety of tutorials for running distributed inference with the following strategy: 
+
+Load an entire model onto each GPU and send chunks of a batch through each GPU’s model copy at a time.
+
+## Installation
+
+```bash
+pip install accelerate torch
+```
+
+## Running code
+
+You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:
+
+```bash
+accelerate launch --num_processes {NUM_GPUS} phi2.py
+```
+
+Or:
+
+```bash
+torchrun --nproc-per-node {NUM_GPUS} phi2.py
+```
--- a/examples/inference/distributed/phi2.py
+++ b/examples/inference/distributed/phi2.py
@ -0,0 +1,86 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from accelerate import PartialState
+from accelerate.utils import gather_object
+
+
+# Start up the distributed environment without needing the Accelerator.
+distributed_state = PartialState()
+
+# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
+model_name = "microsoft/phi-2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Need to set the padding token to the eos token for generation
+tokenizer.pad_token = tokenizer.eos_token
+
+prompts = [
+    "I would like to",
+    "hello how are you",
+    "what is going on",
+    "roses are red and",
+    "welcome to the hotel",
+]
+
+# You can change the batch size depending on your GPU RAM
+batch_size = 2
+# We set it to 8 since it is better for some hardware. More information here https://github.com/huggingface/tokenizers/issues/991
+pad_to_multiple_of = 8
+
+# Split into batches
+# We will get the following results:
+# [ ["I would like to", "hello how are you"], [ "what is going on", "roses are red and"], [ "welcome to the hotel"] ]
+formatted_prompts = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]
+
+# Apply padding on the left since we are doing generation
+padding_side_default = tokenizer.padding_side
+tokenizer.padding_side = "left"
+# Tokenize each batch
+tokenized_prompts = [
+    tokenizer(formatted_prompt, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
+    for formatted_prompt in formatted_prompts
+]
+# Put back the original padding behavior
+tokenizer.padding_side = padding_side_default
+
+completions_per_process = []
+# We automatically split the batched data we passed to it across all the processes. We also set apply_padding=True
+# so that the GPUs will have the same number of prompts, and you can then gather the results.
+# For example, if we have 2 gpus, the distribution will be:
+# GPU 0: ["I would like to", "hello how are you"], ["what is going on", "roses are red and"]
+# GPU 1: ["welcome to the hotel"], ["welcome to the hotel"] -> this prompt is duplicated to ensure that all gpus have the same number of prompts
+with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
+    for batch in batched_prompts:
+        # Move the batch to the device
+        batch = batch.to(distributed_state.device)
+        # We generate the text, decode it and add it to the list completions_per_process
+        outputs = model.generate(**batch, max_new_tokens=20)
+        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        completions_per_process.extend(generated_text)
+
+# We are gathering strings, so we need to use gather_object.
+# If you need to gather tensors, you can use gather from accelerate.utils
+completions_gather = gather_object(completions_per_process)
+
+# Drop duplicates produced by apply_padding in split_between_processes
+completions = completions_gather[: len(prompts)]
+
+distributed_state.print(completions)
--- a/examples/inference/distributed/stable_diffusion.py
+++ b/examples/inference/distributed/stable_diffusion.py
@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from diffusers import DiffusionPipeline
+
+from accelerate import PartialState  # Can also be Accelerator or AcceleratorState
+
+
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+distributed_state = PartialState()
+pipe.to(distributed_state.device)
+
+# Assume two processes
+# On the first GPU, the prompts will be ["a dog", "a cat"],
+# and on the second GPU it will be ["a chicken", "a chicken"].
+# Make sure to drop the final sample, as it will be a duplicate of the previous one.
+with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
+    result = pipe(prompt).images
--- a/examples/inference/pippy/README.md
+++ b/examples/inference/pippy/README.md
--- a/examples/inference/pippy/bert.py
+++ b/examples/inference/pippy/bert.py
--- a/examples/inference/pippy/gpt2.py
+++ b/examples/inference/pippy/gpt2.py
--- a/examples/inference/pippy/llama.py
+++ b/examples/inference/pippy/llama.py
--- a/examples/inference/pippy/requirements.txt
+++ b/examples/inference/pippy/requirements.txt
--- a/examples/inference/pippy/t5.py
+++ b/examples/inference/pippy/t5.py
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@ -247,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
        testargs = ["examples/by_feature/early_stopping.py"]
        run_command(self.launch_args + testargs)

+    @require_multi_gpu
+    def test_distributed_inference_examples_stable_diffusion(self):
+        testargs = ["examples/inference/distributed/stable_diffusion.py"]
+        run_command(self.launch_args + testargs)
+
+    @require_multi_gpu
+    def test_distributed_inference_examples_phi2(self):
+        testargs = ["examples/inference/distributed/phi2.py"]
+        run_command(self.launch_args + testargs)
+
    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_bert(self):
-        testargs = ["examples/inference/bert.py"]
+        testargs = ["examples/inference/pippy/bert.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_gpt2(self):
-        testargs = ["examples/inference/gpt2.py"]
+        testargs = ["examples/inference/pippy/gpt2.py"]
        run_command(self.launch_args + testargs)

    @require_pippy
    @require_multi_gpu
    def test_pippy_examples_t5(self):
-        testargs = ["examples/inference/t5.py"]
+        testargs = ["examples/inference/pippy/t5.py"]
        run_command(self.launch_args + testargs)