add distributed examples (#2672)

* add distributed examples

* typo

* uncomment

* require multigpu

* add stable diffusion example

* style

* add copyright

* style

* remove tqdm

* Apply suggestions from code review

Co-authored-by: Zach Mueller <muellerzr@gmail.com>

* add comments

* remove print

* More comments

---------

Co-authored-by: Zach Mueller <muellerzr@gmail.com>
This commit is contained in:
Marc Sun
2024-04-25 11:13:56 +02:00
committed by GitHub
parent e831bcb3b1
commit 83317b3081
11 changed files with 157 additions and 4 deletions

View File

@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
Make sure to drop the final sample, as it will be a duplicate of the previous one.
You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.
## Memory-efficient pipeline parallelism (experimental)
This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution.
@ -232,4 +234,4 @@ if PartialState().is_last_process:
</Tip>
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work on improving this integration.
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration.

View File

@ -0,0 +1,25 @@
# Distributed inference examples
This folder contains a variety of tutorials for running distributed inference with the following strategy:
Loading an entire model onto each GPU and sending chunks of a batch through each GPU's model copy at a time.
## Installation
```bash
pip install accelerate torch
```
## Running code
You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:
```bash
accelerate launch --num_processes {NUM_GPUS} phi2.py
```
Or:
```bash
torchrun --nproc-per-node {NUM_GPUS} phi2.py
```

View File

@ -0,0 +1,86 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import PartialState
from accelerate.utils import gather_object
# Bring up the distributed environment directly; a full Accelerator is not required here.
distributed_state = PartialState()

# Any causal LLM can be used instead, e.g. mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=distributed_state.device,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Generation needs a padding token, so reuse the end-of-sequence token for it
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "I would like to",
    "hello how are you",
    "what is going on",
    "roses are red and",
    "welcome to the hotel",
]

# Tune the batch size to the amount of GPU RAM available
batch_size = 2
# Padding to a multiple of 8 is better for some hardware.
# More information here https://github.com/huggingface/tokenizers/issues/991
pad_to_multiple_of = 8

# Chunk the prompts into batches of `batch_size`. With the values above this yields:
# [["I would like to", "hello how are you"], ["what is going on", "roses are red and"], ["welcome to the hotel"]]
batch_starts = range(0, len(prompts), batch_size)
formatted_prompts = [prompts[start : start + batch_size] for start in batch_starts]

# Generation requires left padding; remember the tokenizer's current setting so it can be restored
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"
# Tokenize every batch separately so that each one is padded independently
tokenized_prompts = [
    tokenizer(
        prompt_batch,
        padding=True,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors="pt",
    )
    for prompt_batch in formatted_prompts
]
# Restore the original padding behavior
tokenizer.padding_side = padding_side_default

completions_per_process = []
# The tokenized batches are split across all processes. apply_padding=True makes every process
# receive the same number of batches, which is what allows the results to be gathered afterwards.
# For example, with 2 GPUs the distribution will be:
# GPU 0: [["I would like to", "hello how are you"], ["what is going on", "roses are red and"]]
# GPU 1: [["welcome to the hotel"], ["welcome to the hotel"]] -> the last batch is duplicated so
#        that both GPUs hold the same number of batches
with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
    for batch in batched_prompts:
        # Inputs must live on the same device as this process' model copy
        batch = batch.to(distributed_state.device)
        # Generate, decode back to text and accumulate the completions of this process
        outputs = model.generate(**batch, max_new_tokens=20)
        completions_per_process += tokenizer.batch_decode(outputs, skip_special_tokens=True)

# The completions are strings, hence gather_object.
# For tensors, use gather from accelerate.utils instead.
completions_gather = gather_object(completions_per_process)

# Keep only one completion per prompt, dropping the duplicates introduced by apply_padding
completions = completions_gather[: len(prompts)]

distributed_state.print(completions)

View File

@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from diffusers import DiffusionPipeline
from accelerate import PartialState # Can also be Accelerator or AcceleratorState
# Load the pipeline in half precision, then place it on the device owned by this process
model_id = "runwayml/stable-diffusion-v1-5"
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

# Assuming two processes, the prompts are distributed as follows:
#   first GPU:  ["a dog", "a cat"]
#   second GPU: ["a chicken", "a chicken"]
# The final sample duplicates the previous one (a result of apply_padding=True),
# so make sure to drop it.
prompts = ["a dog", "a cat", "a chicken"]
with distributed_state.split_between_processes(prompts, apply_padding=True) as prompt:
    result = pipe(prompt).images

View File

@ -247,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
testargs = ["examples/by_feature/early_stopping.py"]
run_command(self.launch_args + testargs)
@require_multi_gpu
def test_distributed_inference_examples_stable_diffusion(self):
    """Run the distributed Stable Diffusion inference example through `run_command`."""
    script = "examples/inference/distributed/stable_diffusion.py"
    # The script path is appended to the launcher arguments held by the test case
    run_command(self.launch_args + [script])
@require_multi_gpu
def test_distributed_inference_examples_phi2(self):
    """Run the distributed phi-2 inference example through `run_command`."""
    script = "examples/inference/distributed/phi2.py"
    # The script path is appended to the launcher arguments held by the test case
    run_command(self.launch_args + [script])
@require_pippy
@require_multi_gpu
def test_pippy_examples_bert(self):
    """Run the PiPPy BERT inference example through `run_command`.

    The example lives under `examples/inference/pippy/`; the earlier
    `examples/inference/bert.py` path was a dead assignment and is removed.
    """
    testargs = ["examples/inference/pippy/bert.py"]
    run_command(self.launch_args + testargs)
@require_pippy
@require_multi_gpu
def test_pippy_examples_gpt2(self):
    """Run the PiPPy GPT-2 inference example through `run_command`.

    The example lives under `examples/inference/pippy/`; the earlier
    `examples/inference/gpt2.py` path was a dead assignment and is removed.
    """
    testargs = ["examples/inference/pippy/gpt2.py"]
    run_command(self.launch_args + testargs)
@require_pippy
@require_multi_gpu
def test_pippy_examples_t5(self):
    """Run the PiPPy T5 inference example through `run_command`.

    The example lives under `examples/inference/pippy/`; the earlier
    `examples/inference/t5.py` path was a dead assignment and is removed.
    """
    testargs = ["examples/inference/pippy/t5.py"]
    run_command(self.launch_args + testargs)