mirror of
https://github.com/huggingface/accelerate.git
synced 2025-10-20 18:13:46 +08:00
add distributed examples (#2672)
* add distributed examples * typo * uncomment * require multigpu * add stable diffusion example * style * add copyright * style * remove tqdm * Apply suggestions from code review Co-authored-by: Zach Mueller <muellerzr@gmail.com> * add comments * remove print * More comments --------- Co-authored-by: Zach Mueller <muellerzr@gmail.com>
This commit is contained in:
@ -140,6 +140,8 @@ with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"],
|
||||
On the first GPU, the prompts will be `["a dog", "a cat"]`, and on the second GPU it will be `["a chicken", "a chicken"]`.
|
||||
Make sure to drop the final sample, as it will be a duplicate of the previous one.
|
||||
|
||||
You can find more complex examples [here](https://github.com/huggingface/accelerate/tree/main/examples/inference/distributed) such as how to use it with LLMs.
|
||||
|
||||
## Memory-efficient pipeline parallelism (experimental)
|
||||
|
||||
This next part will discuss using *pipeline parallelism*. This is an **experimental** API utilizing the [PiPPy library by PyTorch](https://github.com/pytorch/PiPPy/) as a native solution.
|
||||
@ -232,4 +234,4 @@ if PartialState().is_last_process:
|
||||
|
||||
</Tip>
|
||||
|
||||
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference) and our [documentation](../package_reference/inference) as we work on improving this integration.
|
||||
And that's it! To explore more, please check out the inference examples in the [Accelerate repo](https://github.com/huggingface/accelerate/tree/main/examples/inference/pippy) and our [documentation](../package_reference/inference) as we work on improving this integration.
|
||||
|
25
examples/inference/distributed/README.md
Normal file
25
examples/inference/distributed/README.md
Normal file
@ -0,0 +1,25 @@
|
||||
# Distributed inference examples
|
||||
|
||||
This folder contains a variety of tutorials for running distributed inference with the following strategy:
|
||||
|
||||
Load an entire model onto each GPU and send chunks of a batch through each GPU’s model copy at a time
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install accelerate torch
|
||||
```
|
||||
|
||||
## Running code
|
||||
|
||||
You can either use `torchrun` or the recommended way of `accelerate launch` (without needing to run `accelerate config`) on each script:
|
||||
|
||||
```bash
|
||||
accelerate launch --num_processes {NUM_GPUS} phi2.py
|
||||
```
|
||||
|
||||
Or:
|
||||
|
||||
```bash
|
||||
torchrun --nproc-per-node {NUM_GPUS} phi2.py
|
||||
```
|
86
examples/inference/distributed/phi2.py
Normal file
86
examples/inference/distributed/phi2.py
Normal file
@ -0,0 +1,86 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from accelerate import PartialState
from accelerate.utils import gather_object


# Initialize the distributed state directly; a full Accelerator object is not
# needed for plain batched inference.
distributed_state = PartialState()

# You can change the model to any LLM such as mistralai/Mistral-7B-v0.1 or meta-llama/Llama-2-7b-chat-hf
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map=distributed_state.device, torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Generation requires a padding token; reuse the EOS token for that purpose.
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "I would like to",
    "hello how are you",
    "what is going on",
    "roses are red and",
    "welcome to the hotel",
]

# You can change the batch size depending on your GPU RAM
batch_size = 2
# We set it to 8 since it is better for some hardware. More information here https://github.com/huggingface/tokenizers/issues/991
pad_to_multiple_of = 8

# Group the prompts into batches of `batch_size`, producing:
# [["I would like to", "hello how are you"], ["what is going on", "roses are red and"], ["welcome to the hotel"]]
formatted_prompts = []
for start in range(0, len(prompts), batch_size):
    formatted_prompts.append(prompts[start : start + batch_size])

# Pad on the left, since generated tokens are appended on the right.
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"
# Tokenize each batch of prompts.
tokenized_prompts = []
for prompt_batch in formatted_prompts:
    tokenized_prompts.append(
        tokenizer(prompt_batch, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")
    )
# Restore the tokenizer's original padding behavior.
tokenizer.padding_side = padding_side_default

completions_per_process = []
# split_between_processes shards the batched data across all processes. With
# apply_padding=True the final batch is duplicated where needed so every GPU
# receives the same number of batches, which keeps the later gather balanced.
# For example, with 2 GPUs the distribution is:
# GPU 0: [["I would like to", "hello how are you"], ["what is going on", "roses are red and"]]
# GPU 1: [["welcome to the hotel"], ["welcome to the hotel"]]  <- duplicated to even things out
with distributed_state.split_between_processes(tokenized_prompts, apply_padding=True) as batched_prompts:
    for batch in batched_prompts:
        # Move the batch to this process's device, generate, and decode.
        device_batch = batch.to(distributed_state.device)
        outputs = model.generate(**device_batch, max_new_tokens=20)
        completions_per_process.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Completions are strings (Python objects), so gather_object is the right
# collective; use gather from accelerate.utils for tensors instead.
completions_gather = gather_object(completions_per_process)

# Drop the duplicates introduced by apply_padding in split_between_processes.
completions = completions_gather[: len(prompts)]

distributed_state.print(completions)
|
30
examples/inference/distributed/stable_diffusion.py
Normal file
30
examples/inference/distributed/stable_diffusion.py
Normal file
@ -0,0 +1,30 @@
|
||||
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
from diffusers import DiffusionPipeline

from accelerate import PartialState  # Can also be Accelerator or AcceleratorState


distributed_state = PartialState()
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
# Each process keeps a full copy of the pipeline on its own device.
pipe.to(distributed_state.device)

# Assuming two processes, split_between_processes with apply_padding=True yields:
#   first GPU:  ["a dog", "a cat"]
#   second GPU: ["a chicken", "a chicken"]  <- last prompt duplicated so counts match
# Make sure to drop the final sample, as it will be a duplicate of the previous one.
with distributed_state.split_between_processes(["a dog", "a cat", "a chicken"], apply_padding=True) as prompt:
    result = pipe(prompt).images
|
@ -247,20 +247,30 @@ class FeatureExamplesTests(TempDirTestCase):
|
||||
testargs = ["examples/by_feature/early_stopping.py"]
|
||||
run_command(self.launch_args + testargs)
|
||||
|
||||
    @require_multi_gpu
    def test_distributed_inference_examples_stable_diffusion(self):
        """Smoke-test the distributed stable diffusion example end to end via the launcher."""
        # Launched through `self.launch_args` so the script runs under multiple processes.
        testargs = ["examples/inference/distributed/stable_diffusion.py"]
        run_command(self.launch_args + testargs)
|
||||
|
||||
    @require_multi_gpu
    def test_distributed_inference_examples_phi2(self):
        """Smoke-test the distributed phi-2 LLM example end to end via the launcher."""
        # Launched through `self.launch_args` so the script runs under multiple processes.
        testargs = ["examples/inference/distributed/phi2.py"]
        run_command(self.launch_args + testargs)
|
||||
|
||||
@require_pippy
|
||||
@require_multi_gpu
|
||||
def test_pippy_examples_bert(self):
|
||||
testargs = ["examples/inference/bert.py"]
|
||||
testargs = ["examples/inference/pippy/bert.py"]
|
||||
run_command(self.launch_args + testargs)
|
||||
|
||||
@require_pippy
|
||||
@require_multi_gpu
|
||||
def test_pippy_examples_gpt2(self):
|
||||
testargs = ["examples/inference/gpt2.py"]
|
||||
testargs = ["examples/inference/pippy/gpt2.py"]
|
||||
run_command(self.launch_args + testargs)
|
||||
|
||||
@require_pippy
|
||||
@require_multi_gpu
|
||||
def test_pippy_examples_t5(self):
|
||||
testargs = ["examples/inference/t5.py"]
|
||||
testargs = ["examples/inference/pippy/t5.py"]
|
||||
run_command(self.launch_args + testargs)
|
||||
|
Reference in New Issue
Block a user