Compare commits


1 Commits

Author SHA1 Message Date
7c8eb789b5 Release: v0.12.0 2022-08-04 09:00:35 -04:00
71 changed files with 348 additions and 2315 deletions

View File

@ -1,5 +1,6 @@
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve Accelerate
labels: [ "bug" ]
body:
- type: textarea
id: system-info

View File

@ -1,64 +0,0 @@
name: Build Docker images (releases)
on:
workflow_dispatch:
release:
types: [published]
concurrency:
group: docker-image-builds
cancel-in-progress: false
jobs:
get-version:
runs-on: ubuntu-latest
outputs:
version: ${{ steps.step1.outputs.version }}
steps:
- uses: actions/checkout@v3
- id: step1
run: echo "::set-output name=version::$(python setup.py --version)"
version-cpu:
name: "Latest Accelerate CPU [version]"
runs-on: ubuntu-latest
needs: get-version
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push CPU
uses: docker/build-push-action@v2
with:
context: ./docker/accelerate-cpu
push: true
tags: huggingface/accelerate-cpu:${{needs.get-version.outputs.version}}
version-cuda:
name: "Latest Accelerate GPU [version]"
runs-on: ubuntu-latest
needs: get-version
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v2
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
uses: docker/build-push-action@v2
with:
context: ./docker/accelerate-gpu
push: true
tags: huggingface/accelerate-gpu:${{needs.get-version.outputs.version}}

View File

@ -10,7 +10,7 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
check-for-source:
check-for-setup:
runs-on: ubuntu-latest
name: Check if setup was changed
outputs:
@ -28,18 +28,18 @@ jobs:
id: was_changed
run: |
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
if [ `basename "${file}"` == "setup.py" ]; then
if [ `basename "${file}"` = "setup.py" ]; then
echo ::set-output name=changed::"1"
fi
done
build-docker-containers:
needs: check-for-source
if: (github.event_name == 'push') && (needs.check-for-source.outputs.changed == '1')
uses: ./.github/workflows/build_docker_images.yml
needs: check-for-setup
if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
uses: ./.github/workflows/build-docker-images.yml
secrets: inherit
run-merge-tests:
run-tests:
needs: build-docker-containers
if: always()
uses: ./.github/workflows/run_merge_tests.yml
uses: ./.github/workflows/on-merge.yml

View File

@ -7,7 +7,6 @@ on:
env:
RUN_SLOW: "yes"
IS_GITHUB_CI: "1"
jobs:
run_all_tests_single_gpu:
@ -28,7 +27,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -39,11 +37,6 @@ jobs:
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
@ -63,26 +56,14 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e . --no-deps
pip install pytest-reportlog
- name: Run core and big modeling tests on GPUs
- name: Run test on GPUs
run: |
source activate accelerate
make test_big_modeling
make test_core
- name: Run Integration tests on GPUs
run: |
source activate accelerate
make test_integrations
make test
- name: Run examples on GPUs
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make test_examples

View File

@ -6,7 +6,6 @@ on:
env:
TESTING_MOCKED_DATALOADERS: "1"
IS_GITHUB_CI: "1"
jobs:
run_all_tests_single_gpu:
@ -27,7 +26,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e .[testing,test_trackers]
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -39,11 +37,6 @@ jobs:
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
@ -60,7 +53,6 @@ jobs:
git config --global --add safe.directory '*'
git fetch && git checkout ${{ github.sha }}
pip install -e .[testing,test_trackers]
pip install pytest-reportlog
- name: Run test on GPUs
run: |
@ -71,9 +63,4 @@ jobs:
run: |
source activate accelerate
pip uninstall comet_ml -y
make test_examples
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make test_examples

View File

@ -1,30 +1,16 @@
name: Run Tests
on:
pull_request:
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "examples/**"
- "setup.py"
types: [opened, synchronize, reopened]
on: [pull_request]
env:
HF_HOME: ~/hf_cache
TESTING_MOCKED_DATALOADERS: "1"
IS_GITHUB_CI: "1"
jobs:
run-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
pytorch-version: [
latest,
minimum
]
test-kind: [
test_prod,
test_core,
@ -57,14 +43,7 @@ jobs:
if [[ ${{ matrix.test-kind }} = test_prod ]]; then pip install -e .[test_prod]; fi
if [[ ${{ matrix.test-kind }} != test_prod ]]; then pip install -e .[testing,test_trackers]; fi
if [[ ${{ matrix.test-kind }} = test_rest ]]; then pip uninstall comet_ml -y; fi
if [[ ${{ matrix.pytorch-version }} = minimum ]]; then pip install torch==1.6.0; fi
pip install pytest-reportlog
- name: Run Tests
run: |
make ${{ matrix.test-kind }}
- name: Generate Report
if: always()
run: |
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
make ${{ matrix.test-kind }}

View File

@ -25,40 +25,37 @@ style:
# Run tests for the library
test:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log 'all.log',)
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py
test_big_modeling:
python -m pytest -s -v ./tests/test_big_modeling.py $(if $(IS_GITHUB_CI),--report-log 'big_modeling.log',)
python -m pytest -s -v ./tests/test_big_modeling.py
test_core:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
--ignore=./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'core.log',)
--ignore=./tests/fsdp
test_deepspeed:
python -m pytest -s -v ./tests/deepspeed $(if $(IS_GITHUB_CI),--report-log 'deepspeed.log',)
python -m pytest -s -v ./tests/deepspeed
test_fsdp:
python -m pytest -s -v ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'fsdp.log',)
python -m pytest -s -v ./tests/fsdp
test_examples:
python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log 'examples.log',)
python -m pytest -s -v ./tests/test_examples.py
# Broken down example tests for the CI runners
test_integrations:
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log 'integrations.log',)
test_example_differences:
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests $(if $(IS_GITHUB_CI),--report-log 'example_diff.log',)
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests
test_checkpoint_epoch:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch" $(if $(IS_GITHUB_CI),--report-log 'checkpoint_epoch.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch"
test_checkpoint_step:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step" $(if $(IS_GITHUB_CI),--report-log 'checkpoint_step.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step"
# Same as test but used to install only the base dependencies
test_prod:
$(MAKE) test_core
test_rest:
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch" $(if $(IS_GITHUB_CI),--report-log 'rest.log',)
python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch"

View File

@ -196,7 +196,7 @@ from accelerate import notebook_launcher
notebook_launcher(training_function)
```
An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/accelerate_examples/simple_nlp_example.ipynb)
An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb)
## Why should I use 🤗 Accelerate?
@ -243,16 +243,3 @@ pip install accelerate
- FP16 with native AMP (apex on the roadmap)
- DeepSpeed support (Experimental)
- PyTorch Fully Sharded Data Parallel (FSDP) support (Experimental)
## Citing 🤗 Accelerate
If you use 🤗 Accelerate in your publication, please cite it by using the following BibTeX entry.
```bibtex
@Misc{accelerate,
title = {Accelerate: Training and inference at scale made simple, efficient and adaptable.},
author = {Sylvain Gugger, Lysandre Debut, Thomas Wolf, Philipp Schmid, Zachary Mueller, Sourab Mangrulkar},
howpublished = {\url{https://github.com/huggingface/accelerate}},
year = {2022}
}
```

View File

@ -32,15 +32,11 @@
- local: usage_guides/memory
title: How to avoid CUDA Out-of-Memory
- local: usage_guides/sagemaker
title: Using 🤗 Accelerate on SageMaker
title: Using Accelerate on SageMaker
- local: usage_guides/mps
title: How to use Apple Silicon M1 GPUs
- local: usage_guides/training_zoo
title: 🤗 Accelerate Example Zoo
title: How-To Guides
- sections:
- local: concept_guides/performance
title: Comparing performance across distributed setups
- local: concept_guides/gradient_synchronization
title: Gradient synchronization
- local: concept_guides/deferring_execution

View File

@ -1,15 +1,3 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Deferring Executions
When you run your usual script, instructions are executed in order. Using 🤗 Accelerate to deploy your script on several

View File

@ -114,6 +114,4 @@ for batch in dataloader:
outputs = model(inputs)
loss = loss_function(outputs, targets)
accelerator.backward(loss)
```
As a result, you should either use *`accelerator.accumulate` or `accelerator.no_sync`* when it comes to API choice.
```
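To make that API choice concrete, here is a minimal sketch (not part of the diff above) of the `accelerator.no_sync` pattern, assuming `accelerator`, `model`, `optimizer`, `dataloader`, and `loss_function` have already been prepared as in the guide, with an illustrative accumulation window of 2 steps:
```python
import contextlib

accumulation_steps = 2  # illustrative value

for index, batch in enumerate(dataloader):
    inputs, targets = batch
    # Only synchronize gradients on the last step of each accumulation window.
    is_sync_step = (index + 1) % accumulation_steps == 0
    context = contextlib.nullcontext() if is_sync_step else accelerator.no_sync(model)
    with context:
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
    if is_sync_step:
        optimizer.step()
        optimizer.zero_grad()
```
In practice, `accelerator.accumulate(model)` handles this bookkeeping internally, while `no_sync` gives you the lower-level control shown here.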

View File

@ -1,91 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Comparing performance between different device setups
Evaluating and comparing the performance from different setups can be quite tricky if you don't know what to look for.
For example, you cannot run the same script with the same batch size across TPU, multi-GPU, and single-GPU with Accelerate
and expect your results to line up.
But why?
There are three reasons for this, which this tutorial will cover:
1. **Setting the right seeds**
2. **Observed Batch Sizes**
3. **Learning Rates**
## Setting the Seed
While this issue has not come up as much, make sure to use [`utils.set_seed`] to fully set the seed in all distributed cases so training will be reproducible:
```python
from accelerate import set_seed
set_seed(42)
```
Why is this important? Under the hood this will set **5** different seed settings:
```python
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
if is_tpu_available():
xm.set_rng_state(seed)
```
These are the `random` module's state, NumPy's state, torch's state, torch's CUDA state, and, if TPUs are available, torch_xla's CUDA state.
## Observed Batch Sizes
When training with Accelerate, the batch size passed to the dataloader is the **batch size per GPU**. This means that
a batch size of 64 on two GPUs is effectively a batch size of 128. As a result, this needs to be accounted for when testing
on a single GPU, and similarly for TPUs.
The below table can be used as a quick reference to try out different batch sizes:
<Tip>
In this example there are two GPUs for "Multi-GPU" and a TPU pod with 8 workers
</Tip>
| Single GPU Batch Size | Multi-GPU Equivalent Batch Size | TPU Equivalent Batch Size |
|-----------------------|---------------------------------|---------------------------|
| 256 | 128 | 32 |
| 128 | 64 | 16 |
| 64 | 32 | 8 |
| 32 | 16 | 4 |
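As a rough sanity check, the table above simply divides the single-GPU batch size by the number of devices. A small illustrative calculation (assuming the two GPUs and the 8-worker TPU pod from the tip):
```python
# Illustrative only: reproduce the table by dividing the observed (single-GPU)
# batch size by the number of devices in each setup.
num_gpus = 2          # "Multi-GPU" column
num_tpu_workers = 8   # TPU pod size assumed in the tip

for observed_batch_size in (256, 128, 64, 32):
    per_gpu = observed_batch_size // num_gpus
    per_tpu_worker = observed_batch_size // num_tpu_workers
    print(f"{observed_batch_size=} -> {per_gpu=}, {per_tpu_worker=}")
```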
## Learning Rates
As noted in multiple sources[[1](https://aws.amazon.com/blogs/machine-learning/scalable-multi-node-deep-learning-training-using-gpus-in-the-aws-cloud/)][[2](https://docs.nvidia.com/clara/tlt-mi_archive/clara-train-sdk-v2.0/nvmidl/appendix/training_with_multiple_gpus.html)], the learning rate should be scaled *linearly* based on the number of devices present. The below
snippet shows doing so with Accelerate:
<Tip>
Since users can have their own learning rate schedulers defined, we leave this up to the user to decide if they wish to scale their
learning rate or not.
</Tip>
```python
learning_rate = 1e-3
accelerator = Accelerator()
learning_rate *= accelerator.num_processes
optimizer = AdamW(params=model.parameters(), lr=learning_rate)
```

View File

@ -55,7 +55,7 @@ accelerate launch {my_script.py}
><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
<p class="text-gray-700">Learn the basics and become familiar with using 🤗 Accelerate. Start here if you are using 🤗 Accelerate for the first time!</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="/docs/accelerate/usage_guides/gradient_accumulation"
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="/docs/accelerate/utility_guides/gradient_accumulation"
><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
<p class="text-gray-700">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Accelerate to solve real-world problems.</p>
</a>
@ -68,4 +68,4 @@ accelerate launch {my_script.py}
<p class="text-gray-700">Technical descriptions of how 🤗 Accelerate classes and methods work.</p>
</a>
</div>
</div>
</div>

View File

@ -94,7 +94,6 @@ The following arguments are useful for customization of worker machines
* `--machine_rank MACHINE_RANK` (`int`) -- The rank of the machine on which this script is launched.
* `--num_machines NUM_MACHINES` (`int`) -- The total number of machines used in this training.
* `--num_processes NUM_PROCESSES` (`int`) -- The total number of processes to be launched in parallel.
* `--gpu_ids` (`str`) -- What GPUs (by id) should be used for training on this machine as a comma-separated list
* `--main_process_ip MAIN_PROCESS_IP` (`str`) -- The IP address of the machine of rank 0.
* `--main_process_port MAIN_PROCESS_PORT` (`int`) -- The port to use to communicate with the machine of rank 0.
* `--num_cpu_threads_per_process NUM_CPU_THREADS_PER_PROCESS` (`int`) -- The number of CPU threads per process. Can be tuned for optimal performance.

View File

@ -35,13 +35,7 @@ While this works very well for regularly sized models, this workflow has some cl
</Tip>
## How the Process Works: A Quick Overview
<Youtube id="MWCSGj9jEAo" />
## How the Process Works: Working with Code
### Instantiating an empty model
## Instantiating an empty model
The first tool 🤗 Accelerate introduces to help with big models is a context manager [`init_empty_weights`] that helps you initialize a model without using any RAM, so that step 1 can be done on models of any size. Here is how it works:
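The code example that "Here is how it works:" introduces falls outside the hunks shown here; a minimal sketch of the context manager, with layer sizes chosen to match the "bit more than 100B parameters" figure quoted in the next hunk, would look like:
```python
import torch.nn as nn

from accelerate import init_empty_weights

# Inside the context manager, parameters are created on the "meta" device,
# so no real memory is allocated for the weights.
with init_empty_weights():
    # 1000 linear layers of 10,000 x 10,000 weights is a bit more than
    # 100B parameters (the sizes are illustrative).
    model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
```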
@ -67,7 +61,7 @@ initializes an empty model with a bit more than 100B parameters. Behind the scen
</Tip>
### Sharded checkpoints
## Sharded checkpoints
It's possible your model is so big that even a single copy won't fit in RAM. That doesn't mean it can't be loaded: if you have one or several GPUs, that is more memory available to store your model. In this case, it's better if your checkpoint is split into several smaller files that we call checkpoint shards.
@ -92,7 +86,7 @@ with index.json being the following file:
and `first_state_dict.bin` containing the weights for `"linear1.weight"` and `"linear1.bias"`, `second_state_dict.bin` the ones for `"linear2.weight"` and `"linear2.bias"`
### Loading weights
## Loading weights
The second tool 🤗 Accelerate introduces is a function [`load_checkpoint_and_dispatch`], that will allow you to load a checkpoint inside your empty model. This supports full checkpoints (a single file containing the whole state dict) as well as sharded checkpoints. It will also automatically dispatch those weights across the devices you have available (GPUs, CPU RAM), so if you are loading a sharded checkpoint, the maximum RAM usage will be the size of the biggest shard.
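A hedged usage sketch (the toy model mirrors the `linear1`/`linear2` shard example above; the checkpoint folder name is a placeholder):
```python
import torch.nn as nn

from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# A toy two-layer model standing in for the sharded-checkpoint example above.
class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(1000, 1000)
        self.linear2 = nn.Linear(1000, 1000)

with init_empty_weights():
    model = ToyModel()

# Load a full or sharded checkpoint into the empty model and spread the weights
# across the available GPUs, CPU RAM and, if needed, disk.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="sharded-checkpoint-folder",  # placeholder: folder with index.json + shards
    device_map="auto",
)
```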
@ -182,7 +176,7 @@ You can also design your `device_map` yourself, if you prefer to explicitly deci
model = load_checkpoint_and_dispatch(model, "sharded-gpt-j-6B", device_map=my_device_map)
```
### Run the model
## Run the model
Now that we have done this, our model lies across several devices, and maybe the hard drive. But it can still be used as a regular PyTorch model:
@ -209,7 +203,7 @@ This way, your model can run for inference even if it doesn't fit on one of the G
</Tip>
### Designing a device map
## Designing a device map
You can let 🤗 Accelerate handle the device map computation by setting `device_map` to one of the supported options (`"auto"`, `"balanced"`, `"balanced_low_0"`, `"sequential"`) or create one yourself, if you want more control over where each layer should go.
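For instance, for the two-layer model whose shards were described earlier (`linear1` and `linear2`), a hand-written map could look like this sketch (the device assignments are illustrative):
```python
from accelerate import load_checkpoint_and_dispatch

# Keys are module names; values are a GPU index, "cpu", or "disk".
my_device_map = {
    "linear1": 0,       # keep the first layer on GPU 0
    "linear2": "cpu",   # offload the second layer to CPU RAM
}

# `model` is the empty-initialized model and the checkpoint folder is a
# placeholder, as in the loading sketch above.
model = load_checkpoint_and_dispatch(
    model, checkpoint="sharded-checkpoint-folder", device_map=my_device_map
)
```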

View File

@ -31,6 +31,7 @@ model.to(device)
gradient_accumulation_steps = 2
for index, batch in enumerate(training_dataloader):
optimizer.zero_grad()
inputs, targets = batch
inputs = inputs.to(device)
targets = targets.to(device)
@ -41,7 +42,6 @@ for index, batch in enumerate(training_dataloader):
if (index + 1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
## Converting it to 🤗 Accelerate
@ -57,6 +57,7 @@ First the code shown earlier will be converted to utilize 🤗 Accelerate withou
+ )
for index, batch in enumerate(training_dataloader):
optimizer.zero_grad()
inputs, targets = batch
- inputs = inputs.to(device)
- targets = targets.to(device)
@ -67,7 +68,6 @@ First the code shown earlier will be converted to utilize 🤗 Accelerate withou
if (index+1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
<Tip warning={true}>
@ -94,6 +94,7 @@ You just wrap it around the entire training part of our code:
- for index, batch in enumerate(training_dataloader):
+ for batch in training_dataloader:
+ with accelerator.accumulate(model):
optimizer.zero_grad()
inputs, targets = batch
outputs = model(inputs)
```
@ -106,7 +107,6 @@ You can remove all the special checks for the step number and the loss adjustmen
- if (index+1) % gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss.
@ -118,13 +118,11 @@ Below is the finished implementation for performing gradient accumulation with
```python
for batch in training_dataloader:
with accelerator.accumulate(model):
optimizer.zero_grad()
inputs, targets = batch
outputs = model(inputs)
loss = loss_function(outputs, targets)
accelerator.backward(loss)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
```
To learn more about what magic this wraps around, read the [Gradient Synchronization concept guide](/concept_guides/gradient_synchronization)

View File

@ -25,20 +25,16 @@ training script. To use it, restructure your training function to include an inn
and build your dataloaders inside it. At a minimum, this could look like 4 new lines of code.
> Note: The inner function *must* take in the batch size as the first parameter, but we do not pass one to it when called. The wrapper handles this for us
It should also be noted that anything which will consume CUDA memory and is passed to the `accelerator` **must** be declared inside the inner function,
such as models and optimizers.
```diff
def training_function(args):
accelerator = Accelerator()
model = get_model()
model.to(accelerator.device)
optimizer = get_optimizer()
+ @find_executable_batch_size(starting_batch_size=args.batch_size)
+ def inner_training_loop(batch_size):
+ nonlocal accelerator # Ensure they can be used in our context
+ accelerator.free_memory() # Free all lingering references
model = get_model()
model.to(accelerator.device)
optimizer = get_optimizer()
+ nonlocal model, optimizer # Ensure they can be used in our context
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
lr_scheduler = get_scheduler(
optimizer,
@ -52,4 +48,4 @@ def training_function(args):
+ inner_training_loop()
```
To find out more, check the documentation [here](../package_reference/utilities#accelerate.find_executable_batch_size).
To find out more, check the documentation [here](package_reference/utilities#accelerate.find_executable_batch_size)
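Since the diff only shows fragments of the restructured training function, here is a compact sketch of the decorator pattern it describes (the `get_model`, `get_optimizer`, and `get_dataloaders` helpers are the guide's stand-ins, not real functions):
```python
from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size

def training_function(args):
    accelerator = Accelerator()

    # The inner function must take the batch size as its first parameter;
    # the decorator supplies it and halves it after each CUDA OOM.
    @find_executable_batch_size(starting_batch_size=args.batch_size)
    def inner_training_loop(batch_size):
        nonlocal accelerator          # reuse the outer Accelerator
        accelerator.free_memory()     # drop references left over from a failed attempt

        # Everything that consumes CUDA memory is declared inside the inner
        # function, as the note above requires.
        model = get_model()
        model.to(accelerator.device)
        optimizer = get_optimizer()
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
        model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader
        )
        # ... the usual training loop goes here ...

    # Called with no arguments: the decorator passes the batch size in.
    inner_training_loop()
```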

View File

@ -19,7 +19,7 @@ This will map computational graphs and primitives on the MPS Graph framework and
For more information, please refer to the official documents [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)
and [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html).
### Benefits of Training and Inference using Apple Silicon Chips
### Benefits of Training and Inference using Apple M1 Chips
1. Enables users to train larger networks or batch sizes locally
2. Reduces data retrieval latency and provides the GPU with direct access to the full memory store due to unified memory architecture.
@ -72,9 +72,8 @@ accelerate launch /examples/cv_example.py --data_dir images
## A few caveats to be aware of
1. We strongly recommend installing PyTorch >= 1.13 (a nightly version at the time of writing) on your macOS machine.
It has major fixes related to model correctness and performance improvements for transformer based models.
Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details.
1. For `nlp_example.py` the metrics are much worse than with CPU-only training.
This means certain operations in the BERT model are going wrong on the `mps` device and need to be fixed by PyTorch.
2. Distributed setups `gloo` and `nccl` are not working with `mps` device.
This means that currently only single GPU of `mps` device type can be used.
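Independently of Accelerate, a quick sanity check that the `mps` backend is actually usable (assuming PyTorch >= 1.12) might look like:
```python
import torch

# Check that MPS is both compiled into this PyTorch build and usable on this
# machine before relying on it for training.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(2, 2, device=device)
    print("MPS is available:", x.device)
elif torch.backends.mps.is_built():
    print("MPS is built, but no MPS-capable device/OS version was found.")
else:
    print("This PyTorch build was compiled without MPS support.")
```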

View File

@ -129,26 +129,7 @@ You can find your model data at: s3://your-bucket/accelerate-sagemaker-1-2021-04
### Distributed Training: Data Parallelism
Set up the Accelerate config by running `accelerate config` and answering the SageMaker questions.
To use SageMaker DDP, select it when asked
`What is the distributed mode? ([0] No distributed training, [1] data parallelism):`.
Example config below:
```yaml
base_job_name: accelerate-sagemaker-1
compute_environment: AMAZON_SAGEMAKER
distributed_type: DATA_PARALLEL
ec2_instance_type: ml.p3.16xlarge
iam_role_name: xxxxx
image_uri: null
mixed_precision: fp16
num_machines: 1
profile: xxxxx
py_version: py38
pytorch_version: 1.10.2
region: us-east-1
transformers_version: 4.17.0
use_cpu: false
```
*currently in development, will be supported soon.*
### Distributed Training: Model Parallelism

View File

@ -1,107 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Example Zoo
Below is a non-exhaustive list of tutorials and scripts showcasing Accelerate
## Official Accelerate Examples:
### Basic Examples
These examples showcase the base features of Accelerate and are a great starting point
- [Barebones NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py)
- [Barebones computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/cv_example.py)
### Feature Specific Examples
These examples showcase specific features that the Accelerate framework offers
- [Automatic memory-aware gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py)
- [Checkpointing states](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py)
- [Cross validation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/cross_validation.py)
- [DeepSpeed](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py)
- [Fully Sharded Data Parallelism](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/fsdp_with_peak_mem_tracking.py)
- [Gradient accumulation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py)
- [Memory-aware batch size finder](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/memory.py)
- [Metric Computation](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py)
- [Using Trackers](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/tracking.py)
### Full Examples
These examples showcase, all at once, every feature in Accelerate that was shown in "Feature Specific Examples"
- [Complete NLP example](https://github.com/huggingface/accelerate/blob/main/examples/complete_nlp_example.py)
- [Complete computer vision example](https://github.com/huggingface/accelerate/blob/main/examples/complete_cv_example.py)
- [Causal language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py)
- [Masked language model fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_no_trainer.py)
- [Speech pretraining example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py)
- [Translation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation_no_trainer.py)
- [Text classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue_no_trainer.py)
- [Semantic segmentation fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py)
- [Question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_no_trainer.py)
- [Beam search question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py)
- [Multiple choice question answering fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/multiple-choice/run_swag_no_trainer.py)
- [Named entity recognition fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py)
- [Image classification fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-classification/run_image_classification_no_trainer.py)
- [Summarization fine-tuning example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization_no_trainer.py)
## Integration Examples
These are tutorials from libraries that integrate with 🤗 Accelerate:
### Catalyst
- [Distributed training tutorial with Catalyst](https://catalyst-team.github.io/catalyst/tutorials/ddp.html)
### DALLE2-pytorch
- [Fine-tuning DALLE2](https://github.com/lucidrains/DALLE2-pytorch#usage)
### 🤗 diffusers
- [Performing textual inversion with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion)
- [Training DreamBooth with diffusers](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)
### fastai
- [Distributed training from Jupyter Notebooks with fastai](https://docs.fast.ai/tutorial.distributed.html)
- [Basic distributed training examples with fastai](https://docs.fast.ai/examples/distributed_app_examples.html)
### GradsFlow
- [Auto Image Classification with GradsFlow](https://docs.gradsflow.com/en/latest/examples/nbs/01-ImageClassification/)
### imagen-pytorch
- [Fine-tuning Imagen](https://github.com/lucidrains/imagen-pytorch#usage)
### Kornia
- [Fine-tuning vision models with Kornia's Trainer](https://kornia.readthedocs.io/en/latest/get-started/training.html)
### PyTorch Accelerated
- [Quickstart distributed training tutorial with PyTorch Accelerated](https://pytorch-accelerated.readthedocs.io/en/latest/quickstart.html)
### PyTorch3D
- [Perform Deep Learning with 3D data](https://pytorch3d.org/tutorials/)
### Tez
- [Leaf disease detection with Tez and Accelerate](https://www.kaggle.com/code/abhishek/tez-faster-and-easier-training-for-leaf-detection/notebook)
### trlx
- [How to implement a sentiment learning task with trlx](https://github.com/CarperAI/trlx#example-how-to-add-a-task)

View File

@ -136,7 +136,7 @@ To run it in each of these various modes, use the following commands:
```
- single GPU:
```bash
python ./cv_example.py # from a server with a GPU
python ./nlp_example.py # from a server with a GPU
```
- with fp16 (mixed-precision)
* from any server by passing `fp16=True` to the `Accelerator`.
@ -184,13 +184,6 @@ To run it in each of these various modes, use the following commands:
* In PyTorch:
Add an `xmp.spawn` line in your script as you usually do.
### Simple vision example (GANs)
- [huggan project](https://github.com/huggingface/community-events/tree/main/huggan)
### Using AWS SageMaker integration
- [Examples showcasing AWS SageMaker integration of 🤗 Accelerate.](https://github.com/pacman100/accelerate-aws-sagemaker)
## Finer Examples
While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations.

View File

@ -1,232 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
# New Code #
import evaluate
from accelerate import Accelerator, DistributedType
from accelerate.utils import find_executable_batch_size
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to combine both the gradient accumulation
# and automatic batch size finder utilities of Accelerate to perform
# automatic gradient accumulation
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
# - single CPU or single GPU
# - multi GPUS (using PyTorch distributed mode)
# - (multi) TPUs
# - fp16 (mixed-precision) or fp32 (normal precision)
#
# New additions from the base script can be found quickly by
# looking for the # New Code # tags
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################
EVAL_BATCH_SIZE = 32
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
"""
Creates a set of `DataLoader`s for the `glue` dataset,
using "bert-base-cased" as the tokenizer.
Args:
accelerator (`Accelerator`):
An `Accelerator` object
batch_size (`int`, *optional*):
The batch size for the train and validation DataLoaders.
"""
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
datasets = load_dataset("glue", "mrpc")
def tokenize_function(examples):
# max_length=None => use the model max length (it's actually the default)
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
return outputs
# Apply the method we just defined to all the examples in all the splits of the dataset
# starting with the main process first:
with accelerator.main_process_first():
tokenized_datasets = datasets.map(
tokenize_function,
batched=True,
remove_columns=["idx", "sentence1", "sentence2"],
)
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
def collate_fn(examples):
# On TPU it's best to pad everything to the same length or training will be very slow.
if accelerator.distributed_type == DistributedType.TPU:
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
# Instantiate dataloaders.
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
)
return train_dataloader, eval_dataloader
# For testing only
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
from accelerate.test_utils.training import mocked_dataloaders
get_dataloaders = mocked_dataloaders # noqa: F811
def training_function(config, args):
# For testing only
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
config["num_epochs"] = 2
# Initialize accelerator
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
lr = config["lr"]
num_epochs = int(config["num_epochs"])
seed = int(config["seed"])
observed_batch_size = int(config["batch_size"])
metric = evaluate.load("glue", "mrpc")
# New Code #
# We use the `find_executable_batch_size` decorator, passing in the desired observed batch size
# to train on. If a CUDA OOM error occurs, it will retry this loop cutting the batch size in
# half each time. From this, we can calculate the number of gradient accumulation steps needed
# and modify the Accelerator object as a result
@find_executable_batch_size(starting_batch_size=int(observed_batch_size))
def inner_training_loop(batch_size):
# Since we need to modify the outside accelerator object, we need to bring it
# to the local scope
nonlocal accelerator
# We can calculate the number of gradient accumulation steps based on the current
# batch size vs the starting batch size
num_gradient_accumulation_steps = observed_batch_size // batch_size
# And then set it in the Accelerator directly:
accelerator.gradient_accumulation_steps = num_gradient_accumulation_steps
# Next we need to free all of the stored model references in the Accelerator each time
accelerator.free_memory()
# And set the seed so our results are reproducible each reset
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=100,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# Prepare everything
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
# prepare method.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
# Now we train the model
for epoch in range(num_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
# And perform gradient accumulation
with accelerator.accumulate(model):
# We could avoid this line since we set the accelerator with `device_placement=True`.
batch.to(accelerator.device)
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
for step, batch in enumerate(eval_dataloader):
# We could avoid this line since we set the accelerator with `device_placement=True`.
batch.to(accelerator.device)
with torch.no_grad():
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
metric.add_batch(
predictions=predictions,
references=references,
)
eval_metric = metric.compute()
# Use accelerator.print to print only on the main process.
accelerator.print(f"epoch {epoch}:", eval_metric)
# New Code #
# And call it at the end with no arguments
# Note: You could also refactor this outside of your training loop function
inner_training_loop()
def main():
parser = argparse.ArgumentParser(description="Simple example of training script.")
parser.add_argument(
"--mixed_precision",
type=str,
default="no",
choices=["no", "fp16", "bf16"],
help="Whether to use mixed precision. Choose"
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
"and an Nvidia Ampere GPU.",
)
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
args = parser.parse_args()
# New Code #
# We modify the starting batch size to be an observed batch size of 256, to guarantee an initial CUDA OOM
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 256}
training_function(config, args)
if __name__ == "__main__":
main()

View File

@ -58,7 +58,7 @@ MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
# New Code #
# We need a different `get_dataloaders` function that will build dataloaders by index
# We need a different `get_dataloaders` function that will build dataloaders by indexs
def get_fold_dataloaders(
@ -71,9 +71,9 @@ def get_fold_dataloaders(
accelerator (`Accelerator`):
The main `Accelerator` object
train_idxs (list of `int`):
The split indices for the training dataset
The split indicies for the training dataset
valid_idxs (list of `int`):
The split indices for the validation dataset
The split indicies for the validation dataset
batch_size (`int`):
The size of the minibatch. Default is 16
"""

View File

@ -525,7 +525,7 @@ def main():
},
]
# New Code #
# Creates Dummy Optimizer if `optimizer` was specified in the config file else creates Adam Optimizer
# Creates Dummy Optimizer if `optimizer` was spcified in the config file else creates Adam Optimizer
optimizer_cls = (
torch.optim.AdamW
if accelerator.state.deepspeed_plugin is None
@ -554,7 +554,7 @@ def main():
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
# New Code #
# Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
# Creates Dummy Scheduler if `scheduler` was spcified in the config file else creates `args.lr_scheduler_type` Scheduler
if (
accelerator.state.deepspeed_plugin is None
or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
@ -588,12 +588,14 @@ def main():
checkpointing_steps = None
# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
# We initialize the trackers only on main process because `accelerator.log`
# only logs on main process and we don't want empty logs/runs on other processes.
if args.with_tracking:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers("clm_no_trainer", experiment_config)
if accelerator.is_main_process:
experiment_config = vars(args)
# TensorBoard cannot log Enums, need the raw value
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
accelerator.init_trackers("clm_no_trainer", experiment_config)
# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

View File

@ -113,7 +113,7 @@ def training_function(config, args):
batch_size = int(config["batch_size"])
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
experiment_config = vars(args)
accelerator.init_trackers("fsdp_glue_no_trainer", experiment_config)

View File

@ -29,7 +29,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_
########################################################################
# This is a fully working simple example to use Accelerate,
# specifically showcasing how to ensure out-of-memory errors never
# interrupt training, and builds off the `nlp_example.py` script.
# iterrupt training, and builds off the `nlp_example.py` script.
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
@ -122,6 +122,24 @@ def training_function(config, args):
metric = evaluate.load("glue", "mrpc")
# If the batch size is too big we use gradient accumulation
gradient_accumulation_steps = 1
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
batch_size = MAX_GPU_BATCH_SIZE
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
# New Code #
# We now can define an inner training loop function. It should take a batch size as the only parameter,
# and build the dataloaders in there.
@ -129,31 +147,16 @@ def training_function(config, args):
@find_executable_batch_size(starting_batch_size=batch_size)
def inner_training_loop(batch_size):
# And now just move everything below under this function
# We need to bring in the Accelerator object from earlier
nonlocal accelerator
# And reset all of its attributes that could hold onto any memory:
accelerator.free_memory()
# Then we can declare the model, optimizer, and everything else:
set_seed(seed)
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
model = model.to(accelerator.device)
# Instantiate optimizer
optimizer = AdamW(params=model.parameters(), lr=lr)
# Ensure that anything declared outside this function is set as `nonlocal`
# so it is in scope
nonlocal model, optimizer
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=100,
num_training_steps=(len(train_dataloader) * num_epochs),
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
)
# Prepare everything
@ -171,10 +174,12 @@ def training_function(config, args):
batch.to(accelerator.device)
outputs = model(**batch)
loss = outputs.loss
loss = loss / gradient_accumulation_steps
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
if step % gradient_accumulation_steps == 0:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
model.eval()
for step, batch in enumerate(eval_dataloader):

View File

@ -166,8 +166,8 @@ def training_function(config, args):
)
# New Code #
# We need to initialize the trackers we use. Overall configurations can also be stored
if args.with_tracking:
# We need to initalize the trackers we use. Overall configurations can also be stored
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
accelerator.init_trackers(run, config)

View File

@ -103,8 +103,10 @@ def training_function(config, args):
checkpointing_steps = None
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
if args.logging_dir:
run = os.path.join(args.logging_dir, run)
accelerator.init_trackers(run, config)
# Grab all the image filenames

View File

@ -75,8 +75,10 @@ def training_function(config, args):
batch_size = int(config["batch_size"])
# We need to initialize the trackers we use, and also store our configuration
if args.with_tracking:
if args.with_tracking and accelerator.is_main_process:
run = os.path.split(__file__)[-1].split(".")[0]
if args.logging_dir:
run = os.path.join(args.logging_dir, run)
accelerator.init_trackers(run, config)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

View File

@ -1,108 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage1(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(1)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.align_to(cpu, DOWN)
gpu.set_x(gpu.get_x() - 1)
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.play(
Create(cpu_left_col, run_time=1),
Create(cpu_right_col, run_time=1),
Create(gpu_rect, run_time=1),
)
step_1 = MarkupText(
f"First, an empty model skeleton is loaded\ninto <span fgcolor='{YELLOW}'>memory</span> without using much RAM.",
font_size=24
)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
step_1.move_to([2, 2, 0])
self.play(
Write(step_1, run_time=2.5),
Write(key_text),
Write(key)
)
self.add(model)
cpu_targs = []
first_animations = []
second_animations = []
for i,rect in enumerate(model_base):
cpu_target = Rectangle(height=0.46,width=0.46).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
cpu_target.move_to(rect)
cpu_target.generate_target()
cpu_target.target.height = 0.46/4
cpu_target.target.width = 0.46/3
if i == 0:
cpu_target.target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.target.set_x(cpu_target.target.get_x()+0.1)
elif i == 3:
cpu_target.target.next_to(cpu_targs[0].target, direction=UP, buff=0.)
else:
cpu_target.target.next_to(cpu_targs[i-1].target, direction=RIGHT, buff=0.)
cpu_targs.append(cpu_target)
first_animations.append(rect.animate(run_time=0.5).set_stroke(YELLOW))
second_animations.append(MoveToTarget(cpu_target, run_time=1.5))
self.play(*first_animations)
self.play(*second_animations)
self.wait()

View File

@ -1,126 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage2(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
cpu_targs = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
# target = fill.copy().set_fill(YELLOW, opacity=0.7)
# target.move_to(rect)
# self.add(target)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(cpu_targs[0], direction=UP, buff=0.)
else:
cpu_target.next_to(cpu_targs[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
cpu_targs.append(cpu_target)
checkpoint_base = [mem.copy() for i in range(6)]
checkpoint_rect = VGroup(*checkpoint_base).arrange(RIGHT,buff=0)
checkpoint_text = Text("Loaded Checkpoint", font_size=24)
checkpoint = Group(checkpoint_rect,checkpoint_text).arrange(DOWN, aligned_edge=DOWN, buff=0.4)
checkpoint.move_to([3, .5, 0])
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
step_2 = MarkupText(
f'Next, a <i><span fgcolor="{BLUE}">second</span></i> model is loaded into memory,\nwith the weights of a <span fgcolor="{BLUE}">single shard</span>.',
font_size=24
)
step_2.move_to([2, 2, 0])
self.play(
Write(step_2),
Write(blue_text)
)
self.play(
Write(checkpoint_text, run_time=1),
Create(checkpoint_rect, run_time=1)
)
first_animations = []
second_animations = []
for i,rect in enumerate(checkpoint_base):
target = fill.copy().set_fill(BLUE, opacity=0.7)
target.move_to(rect)
first_animations.append(GrowFromCenter(target, run_time=1))
cpu_target = target.copy()
cpu_target.generate_target()
if i < 5:
cpu_target.target.move_to(cpu_left_col_base[i+1])
else:
cpu_target.target.move_to(cpu_right_col_base[i-5])
second_animations.append(MoveToTarget(cpu_target, run_time=1.5))
self.play(*first_animations)
self.play(*second_animations)
self.wait()

View File

@ -1,158 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage3(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
meta_mem = Rectangle(height=0.25,width=0.25)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_arr = []
model_cpu_arr = []
model_meta_arr = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.)
else:
cpu_target.next_to(model_cpu_arr[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
model_cpu_arr.append(cpu_target)
self.add(*model_arr, *model_cpu_arr, *model_meta_arr)
checkpoint_base = [mem.copy() for i in range(6)]
checkpoint_rect = VGroup(*checkpoint_base).arrange(RIGHT,buff=0)
checkpoint_text = Text("Loaded Checkpoint", font_size=24)
checkpoint = Group(checkpoint_rect,checkpoint_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
checkpoint.move_to([3, .5, 0])
self.add(checkpoint)
ckpt_arr = []
ckpt_cpu_arr = []
for i,rect in enumerate(checkpoint_base):
target = fill.copy().set_fill(BLUE, opacity=0.7)
target.move_to(rect)
ckpt_arr.append(target)
cpu_target = target.copy()
if i < 5:
cpu_target.move_to(cpu_left_col_base[i+1])
else:
cpu_target.move_to(cpu_right_col_base[i-5])
ckpt_cpu_arr.append(cpu_target)
self.add(*ckpt_arr, *ckpt_cpu_arr)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_3 = MarkupText(
f'Based on the passed in configuration, weights are stored in\na variety of np.memmaps on disk or to a particular device.',
font_size=24
)
step_3.move_to([2, 2, 0])
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4.,-1.25,0])
self.play(
Write(step_3, run_time=3),
Write(disk_text, run_time=1),
Create(disk_rects, run_time=1)
)
animations = []
for i,rect in enumerate(ckpt_cpu_arr):
target = rect.copy()
target.generate_target()
target.target.move_to(disk_left_col_base[i]).scale(0.5)
animations.append(MoveToTarget(target, run_time=1.5))
self.play(*animations)
self.play(FadeOut(step_3))
step_4 = MarkupText(
f'Then, the checkpoint is removed from memory\nthrough garbage collection.',
font_size=24
)
step_4.move_to([2, 2, 0])
self.play(
Write(step_4, run_time=3)
)
self.play(
FadeOut(checkpoint_rect, checkpoint_text, *ckpt_arr, *ckpt_cpu_arr),
)
self.wait()
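
Stage 3's narration covers the offload step: depending on the computed device map, shard weights are written to disk as `np.memmap`s (or placed on a device), after which the in-memory checkpoint can be garbage-collected. A minimal sketch of the disk part, assuming `accelerate` is installed; the folder name and tensors are placeholders:

```python
import gc

import torch

from accelerate.utils import offload_state_dict

state_dict = {"weight": torch.randn(1024, 1024), "bias": torch.randn(1024)}

# Writes one .dat file per tensor (read back later through np.memmap) plus an
# index.json describing shapes and dtypes.
offload_state_dict("offload_dir", state_dict)

# Once on disk, the CPU copy is no longer needed and can be collected.
del state_dict
gc.collect()
```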

View File

@ -1,156 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage4(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
meta_mem = Rectangle(height=0.25,width=0.25)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_cpu_arr = []
model_meta_arr = []
for i,rect in enumerate(model_base):
rect.set_stroke(YELLOW)
cpu_target = Rectangle(height=0.46/4,width=0.46/3).set_stroke(width=0.).set_fill(YELLOW, opacity=0.7)
if i == 0:
cpu_target.next_to(cpu_left_col_base[0].get_corner(DOWN+LEFT), buff=0.02, direction=UP)
cpu_target.set_x(cpu_target.get_x()+0.1)
elif i == 3:
cpu_target.next_to(model_cpu_arr[0], direction=UP, buff=0.)
else:
cpu_target.next_to(model_cpu_arr[i-1], direction=RIGHT, buff=0.)
self.add(cpu_target)
model_cpu_arr.append(cpu_target)
self.add(*model_cpu_arr, *model_meta_arr)
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4.,-1.25,0])
self.add(disk_text, disk_rects)
cpu_disk_arr = []
for i in range(6):
target = fill.copy().set_fill(BLUE, opacity=0.8)
target.move_to(disk_left_col_base[i]).scale(0.5)
cpu_disk_arr.append(target)
self.add(*cpu_disk_arr)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_5 = MarkupText(
f'The offloaded weights are all sent to the CPU.',
font_size=24
)
step_5.move_to([2, 2, 0])
self.play(Write(step_5, run_time=3))
for i in range(6):
rect = cpu_disk_arr[i]
cp2 = rect.copy().set_fill(BLUE, opacity=0.8).scale(2.0)
cp2.generate_target()
cp2.target.move_to(model_base[i])
if i == 0:
rect.set_fill(BLUE, opacity=0.8)
rect.generate_target()
rect.target.move_to(cpu_left_col_base[0]).scale(2.0)
self.remove(*model_meta_arr,
*model_cpu_arr,
)
else:
rect.generate_target()
rect.target.move_to(cpu_left_col_base[i]).scale(2.0)
self.play(
MoveToTarget(rect),
MoveToTarget(cp2),
model_base[i].animate.set_stroke(WHITE)
)
self.play(FadeOut(step_5))
step_5 = MarkupText(
f'Finally, hooks are added to each weight in the model\nto transfer the weights from CPU to GPU\n\t\tand back when needed.',
font_size=24
)
step_5.move_to([2, 2, 0])
self.play(Write(step_5, run_time=3))
arrows = []
animations = []
for i in range(6):
a = Arrow(start=UP, end=DOWN, color=RED, buff=.5)
a.next_to(model_base[i].get_left(), UP, buff=0.2)
arrows.append(a)
animations.append(Write(a))
self.play(*animations)
self.wait()
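
Stage 4 describes the pure CPU-offload variant: all weights stay in CPU RAM and hooks stream each layer's weights to the GPU for its forward pass and back again. A minimal sketch using `cpu_offload`, assuming `accelerate` is installed and a CUDA device is available; the toy model is a placeholder:

```python
import torch
import torch.nn as nn

from accelerate import cpu_offload

model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))

# Attaches hooks so each submodule's weights are copied to the execution device
# right before its forward pass and dropped back to CPU afterwards.
cpu_offload(model, execution_device=torch.device("cuda:0"))

with torch.no_grad():
    out = model(torch.randn(1, 512))  # input can stay on CPU; hooks handle placement
```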

View File

@ -1,221 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from manim import *
class Stage5(Scene):
def construct(self):
mem = Rectangle(height=0.5,width=0.5)
fill = Rectangle(height=0.46,width=0.46).set_stroke(width=0)
meta_mem = Rectangle(height=0.25,width=0.25)
cpu_left_col_base = [mem.copy() for i in range(6)]
cpu_right_col_base = [mem.copy() for i in range(6)]
cpu_left_col = VGroup(*cpu_left_col_base).arrange(UP, buff=0)
cpu_right_col = VGroup(*cpu_right_col_base).arrange(UP, buff=0)
cpu_rects = VGroup(cpu_left_col,cpu_right_col).arrange(RIGHT, buff=0)
cpu_text = Text("CPU", font_size=24)
cpu = Group(cpu_rects,cpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
cpu.move_to([-2.5,-.5,0])
self.add(cpu)
gpu_base = [mem.copy() for i in range(4)]
gpu_rect = VGroup(*gpu_base).arrange(UP,buff=0)
gpu_text = Text("GPU", font_size=24)
gpu = Group(gpu_rect,gpu_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
gpu.move_to([-1,-1,0])
self.add(gpu)
model_base = [mem.copy() for i in range(6)]
model_rect = VGroup(*model_base).arrange(RIGHT,buff=0)
model_text = Text("Model", font_size=24)
model = Group(model_rect,model_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
model.move_to([3, -1., 0])
self.add(model)
model_arr = []
model_cpu_arr = []
for i,rect in enumerate(model_base):
target = fill.copy().set_fill(BLUE, opacity=0.8)
target.move_to(rect)
model_arr.append(target)
cpu_target = Rectangle(height=0.46,width=0.46).set_stroke(width=0.).set_fill(BLUE, opacity=0.8)
cpu_target.move_to(cpu_left_col_base[i])
model_cpu_arr.append(cpu_target)
self.add(*model_arr, *model_cpu_arr)
disk_left_col_base = [meta_mem.copy() for i in range(6)]
disk_right_col_base = [meta_mem.copy() for i in range(6)]
disk_left_col = VGroup(*disk_left_col_base).arrange(UP, buff=0)
disk_right_col = VGroup(*disk_right_col_base).arrange(UP, buff=0)
disk_rects = VGroup(disk_left_col,disk_right_col).arrange(RIGHT, buff=0)
disk_text = Text("Disk", font_size=24)
disk = Group(disk_rects,disk_text).arrange(DOWN, buff=0.5, aligned_edge=DOWN)
disk.move_to([-4,-1.25,0])
self.add(disk_text, disk_rects)
key = Square(side_length=2.2)
key.move_to([-5, 2, 0])
key_text = MarkupText(
f"<b>Key:</b>\n\n<span fgcolor='{YELLOW}'>●</span> Empty Model",
font_size=18,
)
key_text.move_to([-5, 2.4, 0])
self.add(key_text, key)
blue_text = MarkupText(
f"<span fgcolor='{BLUE}'>●</span> Checkpoint",
font_size=18,
)
blue_text.next_to(key_text, DOWN*2.4, aligned_edge=key_text.get_left())
self.add(blue_text)
step_6 = MarkupText(
f'Now watch as an input is passed through the model\nand how the memory is utilized and handled.',
font_size=24
)
step_6.move_to([2, 2, 0])
self.play(Write(step_6))
input = Square(0.3)
input.set_fill(RED, opacity=1.)
input.set_stroke(width=0.)
input.next_to(model_base[0], LEFT, buff=.5)
self.play(Write(input))
input.generate_target()
input.target.next_to(model_arr[0], direction=LEFT, buff=0.02)
self.play(MoveToTarget(input))
self.play(FadeOut(step_6))
a = Arrow(start=UP, end=DOWN, color=RED, buff=.5)
a.next_to(model_arr[0].get_left(), UP, buff=0.2)
model_cpu_arr[0].generate_target()
model_cpu_arr[0].target.move_to(gpu_rect[0])
step_7 = MarkupText(
f'As the input reaches a layer, the hook triggers\nand weights are moved from the CPU\nto the GPU and back.',
font_size=24
)
step_7.move_to([2, 2, 0])
self.play(Write(step_7, run_time=3))
circ_kwargs = {"run_time":1, "fade_in":True, "fade_out":True, "buff":0.02}
self.play(
Write(a),
Circumscribe(model_arr[0], color=ORANGE, **circ_kwargs),
Circumscribe(model_cpu_arr[0], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
)
self.play(
MoveToTarget(model_cpu_arr[0])
)
a_c = a.copy()
for i in range(6):
a_c.next_to(model_arr[i].get_right()+0.02, UP, buff=0.2)
input.generate_target()
input.target.move_to(model_arr[i].get_right()+0.02)
grp = AnimationGroup(
FadeOut(a, run_time=.5),
MoveToTarget(input, run_time=.5),
FadeIn(a_c, run_time=.5),
lag_ratio=0.2
)
self.play(grp)
model_cpu_arr[i].generate_target()
model_cpu_arr[i].target.move_to(cpu_left_col_base[i])
if i < 5:
model_cpu_arr[i+1].generate_target()
model_cpu_arr[i+1].target.move_to(gpu_rect[0])
if i >= 1:
circ_kwargs["run_time"] = .7
self.play(
Circumscribe(model_arr[i], **circ_kwargs),
Circumscribe(cpu_left_col_base[i], **circ_kwargs),
Circumscribe(cpu_left_col_base[i+1], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
Circumscribe(model_arr[i+1], color=ORANGE, **circ_kwargs),
)
if i < 1:
self.play(
MoveToTarget(model_cpu_arr[i]),
MoveToTarget(model_cpu_arr[i+1]),
)
else:
self.play(
MoveToTarget(model_cpu_arr[i], run_time=.7),
MoveToTarget(model_cpu_arr[i+1], run_time=.7),
)
else:
model_cpu_arr[i].generate_target()
model_cpu_arr[i].target.move_to(cpu_left_col_base[-1])
input.generate_target()
input.target.next_to(model_arr[-1].get_right(), RIGHT+0.02, buff=0.2)
self.play(
Circumscribe(model_arr[-1], color=ORANGE, **circ_kwargs),
Circumscribe(cpu_left_col_base[-1], color=ORANGE, **circ_kwargs),
Circumscribe(gpu_rect[0], color=ORANGE, **circ_kwargs),
)
self.play(
MoveToTarget(model_cpu_arr[i])
)
a = a_c
a_c = a_c.copy()
input.generate_target()
input.target.next_to(model_base[-1], RIGHT+0.02, buff=.5)
self.play(
FadeOut(step_7),
FadeOut(a, run_time=.5),
)
step_8 = MarkupText(
f'Inference on a model too large for GPU memory\nis successfully completed.', font_size=24
)
step_8.move_to([2, 2, 0])
self.play(
Write(step_8, run_time=3),
MoveToTarget(input)
)
self.wait()
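
Stage 5 ties it all together: inference runs on a model too large for GPU memory because the hooks shuttle weights on and off the device as the input flows through. An end-to-end sketch with `load_checkpoint_and_dispatch`, assuming `accelerate` is installed and a GPU is available; the model and checkpoint path are placeholders:

```python
import torch
import torch.nn as nn

from accelerate import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    model = nn.Sequential(*[nn.Linear(4096, 4096) for _ in range(8)])

# Splits the checkpoint across GPU, CPU and (if needed) disk based on available
# memory, and attaches the hooks shown in the animation.
model = load_checkpoint_and_dispatch(
    model,
    "sharded-checkpoint/",
    device_map="auto",
    offload_folder="offload_dir",
)

with torch.no_grad():
    output = model(torch.randn(1, 4096))
```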

View File

@ -21,10 +21,9 @@ extras["docs"] = []
extras["test_prod"] = ["pytest", "pytest-xdist", "pytest-subtests", "parameterized"]
extras["test_dev"] = ["datasets", "evaluate", "transformers", "scipy", "sklearn", "deepspeed<0.7.0", "tqdm"]
extras["testing"] = extras["test_prod"] + extras["test_dev"]
extras["rich"] = ["rich"]
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard"]
extras["dev"] = extras["quality"] + extras["testing"] + extras["rich"]
extras["dev"] = extras["quality"] + extras["testing"]
extras["sagemaker"] = [
"sagemaker", # boto3 is a required package in sagemaker
@ -32,7 +31,7 @@ extras["sagemaker"] = [
setup(
name="accelerate",
version="0.13.1",
version="0.12.0",
description="Accelerate",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",

View File

@ -2,7 +2,7 @@
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
__version__ = "0.13.1"
__version__ = "0.12.0"
from .accelerator import Accelerator
from .big_modeling import cpu_offload, disk_offload, dispatch_model, init_empty_weights, load_checkpoint_and_dispatch
@ -16,11 +16,6 @@ from .utils import (
InitProcessGroupKwargs,
find_executable_batch_size,
infer_auto_device_map,
is_rich_available,
load_checkpoint_in_model,
synchronize_rng_states,
)
if is_rich_available():
from .utils import rich

View File

@ -125,8 +125,8 @@ class Accelerator:
- `"tensorboard"`
- `"wandb"`
- `"comet_ml"`
If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
If `"all`" is selected, will pick up all available trackers in the environment and intialize them. Can also
accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
logging_dir (`str`, `os.PathLike`, *optional*):
A path to a directory for storing logs of locally-compatible loggers.
dispatch_batches (`bool`, *optional*):
@ -140,7 +140,7 @@ class Accelerator:
A list of `KwargHandler` to customize how the objects related to distributed training or mixed precision
are created. See [kwargs](kwargs) for more information.
**Available attributes:**
**Attributes:**
- **device** (`torch.device`) -- The device to use.
- **distributed_type** ([`~utils.DistributedType`]) -- The distributed training configuration.
@ -279,7 +279,9 @@ class Accelerator:
self.native_amp = False
err = "{mode} mixed precision requires {requirement}"
if self.state.mixed_precision == "fp16":
self.native_amp = True
self.native_amp = is_torch_version(">=", "1.6")
if not self.native_amp:
raise ValueError(err.format(mode="fp16", requirement="PyTorch >= 1.6"))
if not torch.cuda.is_available() and not parse_flag_from_env("USE_MPS_DEVICE"):
raise ValueError(err.format(mode="fp16", requirement="a GPU"))
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
@ -312,7 +314,7 @@ class Accelerator:
# RNG Types
self.rng_types = rng_types
if self.rng_types is None:
self.rng_types = ["generator"]
self.rng_types = ["torch"] if is_torch_version("<=", "1.5.1") else ["generator"]
@property
def use_distributed(self):
@ -461,28 +463,6 @@ class Accelerator:
Args:
model (`torch.nn.Module`):
PyTorch Module that was prepared with `Accelerator.prepare`
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator()
>>> dataloader, model, optimizer = accelerator.prepare(dataloader, model, optimizer)
>>> input_a = next(iter(dataloader))
>>> input_b = next(iter(dataloader))
>>> with accelerator.no_sync():
... outputs = model(input_a)
... loss = loss_func(outputs)
... accelerator.backward(loss)
... # No synchronization across processes, only accumulate gradients
>>> outputs = model(input_b)
>>> accelerator.backward(loss)
>>> # Synchronization across all processes
>>> optimizer.step()
>>> optimizer.zero_grad()
```
"""
context = contextlib.nullcontext
if self.use_distributed:
@ -512,24 +492,6 @@ class Accelerator:
Args:
model (`torch.nn.Module`):
PyTorch Module that was prepared with `Accelerator.prepare`
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> with accelerator.accumulate():
... for input, output in dataloader:
... outputs = model(input)
... loss = loss_func(outputs)
... loss.backward()
... optimizer.step()
... scheduler.step()
... optimizer.zero_grad()
```
"""
self._do_sync()
if self.sync_gradients:
@ -547,15 +509,15 @@ class Accelerator:
if self.is_local_main_process:
print(*args, **kwargs)
def _prepare_one(self, obj, first_pass=False, device_placement=None):
def _prepare_one(self, obj, first_pass=False):
# First pass of preparation: DataLoader, model, optimizer
if first_pass:
if isinstance(obj, torch.utils.data.DataLoader):
return self.prepare_data_loader(obj, device_placement=device_placement)
return self.prepare_data_loader(obj)
elif isinstance(obj, torch.nn.Module):
return self.prepare_model(obj, device_placement=device_placement)
return self.prepare_model(obj)
elif isinstance(obj, torch.optim.Optimizer):
optimizer = self.prepare_optimizer(obj, device_placement=device_placement)
optimizer = self.prepare_optimizer(obj)
return optimizer
# Second pass of preparation: LR scheduler (which need the full list of optimizers)
elif isinstance(obj, torch.optim.lr_scheduler._LRScheduler):
@ -602,39 +564,17 @@ class Accelerator:
self._optimizers = optimizers
return tuple(result)
def prepare(self, *args, device_placement=None):
def prepare(self, *args):
"""
Prepare all objects passed in `args` for distributed training and mixed precision, then return them in the same
order.
Args:
*args (list of objects):
Any of the following type of objects:
Accepts the following type of objects:
- `torch.utils.data.DataLoader`: PyTorch Dataloader
- `torch.nn.Module`: PyTorch Module
- `torch.optim.Optimizer`: PyTorch Optimizer
- `torch.optim.lr_scheduler._LRScheduler`: PyTorch LR Scheduler
device_placement (`List[bool]`, *optional*):
Used to customize whether automatic device placement should be performed for each object passed. Needs
to be a list of the same length as `args`.
<Tip>
You don't need to prepare a model if you only use it for inference without any kind of mixed precision
</Tip>
- `torch.utils.data.DataLoader`: PyTorch Dataloader
- `torch.nn.Module`: PyTorch Module
- `torch.optim.Optimizer`: PyTorch Optimizer
"""
if device_placement is None:
device_placement = [None for _ in args]
elif self.distributed_type == DistributedType.DEEPSPEED:
raise ValueError("You can't customize device placements with DeepSpeed.")
elif len(device_placement) != len(args):
raise ValueError(
f"`device_placement` should be a list with {len(args)} elements (the number of objects passed)."
)
if self.distributed_type == DistributedType.FSDP:
model_count = 0
optimizer_present = False
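
For reference, a minimal sketch of the `prepare` call the docstring above describes; the toy model, optimizer and data are placeholders:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator

accelerator = Accelerator()

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
dataloader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 2)), batch_size=8)

# Objects come back in the same order, wrapped for the current distributed setup
# and, by default, moved to the right device.
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
```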
@ -665,7 +605,7 @@ class Accelerator:
"The model and the optimizer parameters are not on the same device, which probably means you "
"created an optimizer around your model **before** putting on the device. Make sure the line "
"model.to(device) is before the optimizer creation in your script or remove it entirely and use "
"the flag default value for `device_placement` in your `Accelerator` to let it handle that "
"the flag default value for `devicement_placement` in your `Accelerator` to let it handle that "
"part for you."
)
@ -678,10 +618,8 @@ class Accelerator:
if self.distributed_type == DistributedType.DEEPSPEED:
result = self._prepare_deepspeed(*args)
else:
result = tuple(
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
)
result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
result = tuple(self._prepare_one(obj, first_pass=True) for obj in args)
result = tuple(self._prepare_one(obj) for obj in result)
if tpu_should_fix_optimizer:
# 2. grabbing new model parameters
@ -698,22 +636,9 @@ class Accelerator:
return result if len(result) > 1 else result[0]
def prepare_model(self, model: torch.nn.Module, device_placement=None):
"""
Prepares a PyTorch model for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
model (`torch.nn.Module`):
A PyTorch model to prepare. You don't need to prepare a model if it is used only for inference without
any kind of mixed precision
device_placement (`bool`, *optional*):
Whether or not to place the model on the proper device. Will default to `self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP
def prepare_model(self, model):
self._models.append(model)
if device_placement:
if self.device_placement and self.distributed_type != DistributedType.FSDP:
model = model.to(self.device)
if self.distributed_type == DistributedType.MULTI_GPU:
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
@ -760,36 +685,26 @@ class Accelerator:
deepspeed_plugin = self.state.deepspeed_plugin
if deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] == "auto":
result = [
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
for obj in args
]
result = [
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
for obj in args
]
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
if self.split_batches:
batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]
if len(batch_sizes) == 0:
raise ValueError(
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
)
batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
if len(batch_sizes) > 1:
logger.info(
"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
)
else:
batch_size_per_device = deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"]
result = [obj for obj in args]
if self.gradient_accumulation_steps != deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]:
logger.info(
f"Updating DeepSpeed's gradient accumulation steps to {self.gradient_accumulation_steps} from "
f"{deepspeed_plugin.deepspeed_config['gradient_accumulation_steps']}."
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
if self.split_batches:
batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]
if len(batch_sizes) == 0:
raise ValueError(
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
)
deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"] = self.gradient_accumulation_steps
batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
if len(batch_sizes) > 1:
logger.info(
"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
)
config_kwargs = {
"train_micro_batch_size_per_gpu": batch_size_per_device,
"train_batch_size": batch_size_per_device
@ -923,57 +838,24 @@ class Accelerator:
)
return tuple(result)
def prepare_data_loader(self, data_loader: torch.utils.data.DataLoader, device_placement=None):
"""
Prepares a PyTorch DataLoader for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
data_loader (`torch.utils.data.DataLoader`):
A vanilla PyTorch DataLoader to prepare
device_placement (`bool`, *optional*):
Whether or not to place the batches on the proper device in the prepared dataloader. Will default to
`self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement if self.distributed_type != DistributedType.TPU else False
def prepare_data_loader(self, data_loader):
return prepare_data_loader(
data_loader,
self.device,
num_processes=self.num_processes,
process_index=self.process_index,
split_batches=self.split_batches,
put_on_device=device_placement,
put_on_device=self.device_placement if self.distributed_type != DistributedType.TPU else False,
rng_types=self.rng_types.copy(),
dispatch_batches=self.dispatch_batches,
)
def prepare_optimizer(self, optimizer: torch.optim.Optimizer, device_placement=None):
"""
Prepares a PyTorch Optimizer for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
optimizer (`torch.optim.Optimizer`):
A vanilla PyTorch optimizer to prepare
device_placement (`bool`, *optional*):
Whether or not to place the optimizer on the proper device. Will default to `self.device_placement`.
"""
if device_placement is None:
device_placement = self.device_placement
optimizer = AcceleratedOptimizer(optimizer, device_placement=device_placement, scaler=self.scaler)
def prepare_optimizer(self, optimizer):
optimizer = AcceleratedOptimizer(optimizer, device_placement=self.device_placement, scaler=self.scaler)
self._optimizers.append(optimizer)
return optimizer
def prepare_scheduler(self, scheduler: torch.optim.lr_scheduler._LRScheduler):
"""
Prepares a PyTorch Scheduler for training in any distributed setup. It is recommended to use
[`Accelerator.prepare`] instead.
Args:
scheduler (`torch.optim.lr_scheduler._LRScheduler`):
A vanilla PyTorch scheduler to prepare
"""
def prepare_scheduler(self, scheduler):
# We try to find the optimizer associated with `scheduler`, the default is the full list.
optimizer = self._optimizers
for opt in self._optimizers:
@ -991,14 +873,9 @@ class Accelerator:
def backward(self, loss, **kwargs):
"""
Scales the gradients in accordance to `Accelerator.gradient_accumulation_steps` and calls the correct
`backward()` based on the configuration.
Should be used in lieu of `loss.backward()`.
Use `accelerator.backward(loss)` in lieu of `loss.backward()`.
"""
if self.distributed_type != DistributedType.DEEPSPEED:
# deepspeed handles loss scaling by gradient_accumulation_steps in its `backward`
loss = loss / self.gradient_accumulation_steps
loss /= self.gradient_accumulation_steps
if self.distributed_type == DistributedType.DEEPSPEED:
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
elif self.scaler is not None:
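
A hedged sketch of the training step the `backward` docstring refers to; all objects below are placeholders and the loss function is assumed to be any standard loss module:

```python
import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()
model, optimizer = accelerator.prepare(model, optimizer)

batch = torch.randn(8, 10).to(accelerator.device)
target = torch.randn(8, 2).to(accelerator.device)

optimizer.zero_grad()
loss = loss_fn(model(batch), target)
# Scales the loss for gradient accumulation / mixed precision and calls the
# right backward() for the configured backend, instead of loss.backward().
accelerator.backward(loss)
optimizer.step()
```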
@ -1029,24 +906,6 @@ class Accelerator:
def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
"""
Should be used in place of `torch.nn.utils.clip_grad_norm_`.
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> for (input, target) in dataloader:
... optimizer.zero_grad()
... output = model(input)
... loss = loss_func(output, target)
... accelerator.backward(loss)
... if accelerator.sync_gradients:
... accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
... optimizer.step()
```
"""
if self.distributed_type == DistributedType.FSDP:
self.unscale_gradients()
@ -1064,24 +923,6 @@ class Accelerator:
def clip_grad_value_(self, parameters, clip_value):
"""
Should be used in place of `torch.nn.utils.clip_grad_value_`.
Example:
```python
>>> from accelerate import Accelerator
>>> accelerator = Accelerator(gradient_accumulation_steps=2)
>>> dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)
>>> for (input, target) in dataloader:
... optimizer.zero_grad()
... output = model(input)
... loss = loss_func(output, target)
... accelerator.backward(loss)
... if accelerator.sync_gradients:
... accelerator.clip_grad_value_(model.parameters(), clip_value)
... optimizer.step()
```
"""
if self.distributed_type in [DistributedType.DEEPSPEED, DistributedType.FSDP]:
raise Exception("DeepSpeed and FSDP do not support `clip_grad_value_`. Use `clip_grad_norm_` instead.")
@ -1191,7 +1032,6 @@ class Accelerator:
"""
wait_for_everyone()
@on_main_process
def init_trackers(self, project_name: str, config: Optional[dict] = None, init_kwargs: Optional[dict] = {}):
"""
Initializes a run for all trackers stored in `self.log_with`, potentially with starting configurations
@ -1203,7 +1043,7 @@ class Accelerator:
Optional starting configuration to be logged.
init_kwargs (`dict`, *optional*):
A nested dictionary of kwargs to be passed to a specific tracker's `__init__` function. Should be
formatted like so:
formatted like this:
```python
{"wandb": {"tags": ["tag_a", "tag_b"]}}
```
@ -1252,7 +1092,7 @@ class Accelerator:
The run step. If included, the log will be affiliated with this step.
log_kwargs (`dict`, *optional*):
A nested dictionary of kwargs to be passed to a specific tracker's `log` function. Should be formatted
like so:
like this:
```python
{"wandb": {"tags": ["tag_a", "tag_b"]}}
```
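
A hedged sketch of the tracking API these docstrings describe; the project name, config and tags are placeholders, and the relevant tracker packages (e.g. wandb) are assumed to be installed:

```python
from accelerate import Accelerator

accelerator = Accelerator(log_with="all")

# `init_kwargs` is forwarded to each tracker's constructor, keyed by tracker name.
accelerator.init_trackers(
    "my_project",
    config={"learning_rate": 1e-4},
    init_kwargs={"wandb": {"tags": ["tag_a", "tag_b"]}},
)
accelerator.log({"train_loss": 0.42}, step=1)
accelerator.end_training()
```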
@ -1263,8 +1103,7 @@ class Accelerator:
@on_main_process
def end_training(self):
"""
Runs any special end training behaviors, such as stopping trackers on the main process only. Should always be
called at the end of your script if using experiment tracking.
Runs any special end training behaviors, such as stopping trackers on the main process only.
"""
for tracker in self.trackers:
tracker.finish()
@ -1284,13 +1123,6 @@ class Accelerator:
"""
Saves the current states of the model, optimizer, scaler, RNG generators, and registered objects.
<Tip>
Should only be used when wanting to save a checkpoint during training and restoring the state in the same
environment.
</Tip>
Args:
output_dir (`str` or `os.PathLike`):
The name of the folder to save all relevant weights and states.
@ -1346,12 +1178,6 @@ class Accelerator:
"""
Loads the current states of the model, optimizer, scaler, RNG generators, and registered objects.
<Tip>
Should only be used in conjunction with [`Accelerator.save_state`].
</Tip>
Args:
input_dir (`str` or `os.PathLike`):
The name of the folder all relevant weights and states were saved in.
@ -1454,15 +1280,6 @@ class Accelerator:
return (model_device, optimizer_device)
def get_state_dict(self, model, unwrap=True):
"""
Returns the state dictionary of a model sent through [`Accelerator.prepare`] in full precision
Args:
model (`torch.nn.Module`):
A PyTorch model sent through [`Accelerator.prepare`]
unwrap (`bool`, *optional*, defaults to True):
Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict
"""
is_zero_3 = False
if self.distributed_type == DistributedType.DEEPSPEED:
is_zero_3 = self.deepspeed_config["zero_optimization"]["stage"] == 3

View File

@ -29,7 +29,6 @@ from .utils import (
load_checkpoint_in_model,
offload_state_dict,
)
from .utils.versions import is_torch_version
@contextmanager
@ -44,7 +43,7 @@ def init_empty_weights(include_buffers: bool = False):
Example:
```python
```pyton
import torch.nn as nn
from accelerate import init_empty_weights
@ -60,8 +59,6 @@ def init_empty_weights(include_buffers: bool = False):
</Tip>
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Initializing empty weights to a meta device requires torch >= 1.9.0")
old_register_parameter = nn.Module.register_parameter
if include_buffers:
old_register_buffer = nn.Module.register_buffer
@ -78,35 +75,15 @@ def init_empty_weights(include_buffers: bool = False):
if buffer is not None:
module._buffers[name] = module._buffers[name].to(torch.device("meta"))
# Patch tensor creation
if include_buffers:
tensor_constructors_to_patch = {
torch_function_name: getattr(torch, torch_function_name)
for torch_function_name in ["empty", "zeros", "ones", "full"]
}
else:
tensor_constructors_to_patch = {}
def patch_tensor_constructor(fn):
def wrapper(*args, **kwargs):
kwargs["device"] = torch.device("meta")
return fn(*args, **kwargs)
return wrapper
try:
nn.Module.register_parameter = register_empty_parameter
if include_buffers:
nn.Module.register_buffer = register_empty_buffer
for torch_function_name in tensor_constructors_to_patch.keys():
setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
yield
finally:
nn.Module.register_parameter = old_register_parameter
if include_buffers:
nn.Module.register_buffer = old_register_buffer
for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
setattr(torch, torch_function_name, old_torch_function)
def cpu_offload(
@ -137,8 +114,6 @@ def cpu_offload(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("CPU offloading requires torch >= 1.9.0")
if execution_device is None:
execution_device = next(iter(model.parameters())).device
if state_dict is None:
@ -182,8 +157,6 @@ def disk_offload(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Disk offloading requires torch >= 1.9.0")
if not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")):
offload_state_dict(offload_dir, model.state_dict())
if execution_device is None:
@ -235,8 +208,6 @@ def dispatch_model(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Model dispatching requires torch >= 1.9.0")
# Error early if the device map is incomplete.
check_device_map(model, device_map)
@ -324,7 +295,7 @@ def load_checkpoint_and_dispatch(
dtype (`str` or `torch.dtype`, *optional*):
If provided, the weights will be converted to that type when loaded.
offload_state_dict (`bool`, *optional*):
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
If `True`, will temporarily offload the CPU state dict on the hard drive to avoig getting out of CPU RAM if
the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
picked contains `"disk"` values.
preload_module_classes (`List[str]`, *optional*):
@ -333,8 +304,6 @@ def load_checkpoint_and_dispatch(
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
"""
if not is_torch_version(">=", "1.9.0"):
raise NotImplementedError("Loading and dispatching requires torch >= 1.9.0")
if isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
raise ValueError(
"If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "

View File

@ -35,12 +35,8 @@ def get_cluster_input():
machine_rank = 0
num_machines = 1
num_processes = 1
gpu_ids = None
main_process_ip = None
main_process_port = None
rdzv_backend = "static"
same_network = True
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.MULTI_CPU]:
num_machines = _ask_field(
"How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
@ -60,16 +56,6 @@ def get_cluster_input():
"What is the port you will use to communicate with the main process? ",
lambda x: int(x),
)
same_network = _ask_field(
"Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
_convert_yes_no_to_bool,
default=True,
error_message="Please enter yes or no.",
)
if not same_network:
rdzv_backend = _ask_field(
"What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
)
if distributed_type == DistributedType.NO:
use_cpu = _ask_field(
@ -305,12 +291,6 @@ def get_cluster_input():
else:
num_processes = 1
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.NO] and not use_cpu:
gpu_ids = _ask_field(
"What GPU(s) (by id) should be used for training on this machine as a comma-seperated list? [all]:",
default="all",
)
if distributed_type != DistributedType.TPU:
if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
mixed_precision = "no"
@ -333,7 +313,6 @@ def get_cluster_input():
compute_environment=ComputeEnvironment.LOCAL_MACHINE,
distributed_type=distributed_type,
num_processes=num_processes,
gpu_ids=gpu_ids,
mixed_precision=mixed_precision,
downcast_bf16=downcast_bf16,
machine_rank=machine_rank,
@ -344,6 +323,4 @@ def get_cluster_input():
deepspeed_config=deepspeed_config,
fsdp_config=fsdp_config,
use_cpu=use_cpu,
rdzv_backend=rdzv_backend,
same_network=same_network,
)

View File

@ -135,11 +135,8 @@ class ClusterConfig(BaseConfig):
num_processes: int
machine_rank: int = 0
num_machines: int = 1
gpu_ids: Optional[str] = None
main_process_ip: Optional[str] = None
main_process_port: Optional[int] = None
rdzv_backend: Optional[str] = "static"
same_network: Optional[bool] = False
main_training_function: str = "main"
# args for deepspeed_plugin

View File

@ -157,7 +157,7 @@ def get_sagemaker_input():
)
distributed_type = _ask_field(
"What is the distributed mode? ([0] No distributed training, [1] data parallelism): ",
"Which type of machine are you using? ([0] No distributed training, [1] data parallelism): ",
_convert_sagemaker_distributed_mode,
error_message="Please enter 0 or 1",
)

View File

@ -36,28 +36,15 @@ from accelerate.utils import (
DistributedType,
PrecisionType,
PrepareForLaunch,
_filter_args,
get_launch_prefix,
is_deepspeed_available,
is_rich_available,
is_sagemaker_available,
is_torch_version,
patch_environment,
)
from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS
from accelerate.utils.dataclasses import SageMakerDistributedType
if is_rich_available():
from rich import get_console
from rich.logging import RichHandler
FORMAT = "%(message)s"
logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()])
if is_torch_version(">=", "1.9.0"):
import torch.distributed.run as distrib_run
logger = logging.getLogger(__name__)
@ -259,11 +246,6 @@ def launch_command_parser(subparsers=None):
parser.add_argument(
"--num_machines", type=int, default=None, help="The total number of machines used in this training."
)
parser.add_argument(
"--gpu_ids",
default=None,
help="What GPUs (by id) should be used for training on this machine as a comma-seperated list",
)
parser.add_argument(
"--machine_rank", type=int, default=None, help="The rank of the machine on which this script is launched."
)
@ -274,25 +256,6 @@ def launch_command_parser(subparsers=None):
default=None,
help="The port to use to communicate with the machine of rank 0.",
)
# Rendezvous related arguments
parser.add_argument(
"--rdzv_conf",
type=str,
default="",
help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
)
parser.add_argument(
"--max_restarts",
type=int,
default=0,
help="Maximum number of worker group restarts before failing.",
)
parser.add_argument(
"--monitor_interval",
type=float,
default=5,
help="Interval, in seconds, to monitor the state of workers.",
)
parser.add_argument(
"--main_training_function",
type=str,
@ -331,12 +294,7 @@ def launch_command_parser(subparsers=None):
"--aws_secret_access_key",
type=str,
default=None,
help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job.",
)
parser.add_argument(
"--debug",
action="store_true",
help="Whether to print out the torch.distributed stack trace when something fails.",
help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job",
)
parser.add_argument(
"training_script",
@ -369,10 +327,6 @@ def simple_launcher(args):
current_env = os.environ.copy()
current_env["USE_CPU"] = str(args.cpu or args.use_cpu)
current_env["USE_MPS_DEVICE"] = str(args.use_mps_device)
if args.use_mps_device:
current_env["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
elif args.gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
if args.num_machines > 1:
current_env["MASTER_ADDR"] = args.main_process_ip
current_env["MASTER_PORT"] = str(args.main_process_port)
@ -401,40 +355,43 @@ def simple_launcher(args):
def multi_gpu_launcher(args):
num_processes = getattr(args, "num_processes")
num_machines = getattr(args, "num_machines")
main_process_ip = getattr(args, "main_process_ip")
main_process_port = getattr(args, "main_process_port")
if num_machines > 1:
setattr(args, "nproc_per_node", str(num_processes // num_machines))
setattr(args, "nnodes", str(num_machines))
setattr(args, "node_rank", int(args.machine_rank))
if getattr(args, "same_network"):
setattr(args, "master_addr", str(main_process_ip))
setattr(args, "master_port", str(main_process_port))
else:
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
cmd = get_launch_prefix()
if args.num_machines > 1:
cmd.extend(
[
"--nproc_per_node",
str(args.num_processes // args.num_machines),
"--nnodes",
str(args.num_machines),
"--node_rank",
str(args.machine_rank),
"--master_addr",
args.main_process_ip,
"--master_port",
str(args.main_process_port),
]
)
else:
setattr(args, "nproc_per_node", str(num_processes))
if main_process_port is not None:
setattr(args, "master_port", str(main_process_port))
cmd.extend(["--nproc_per_node", str(args.num_processes)])
if args.main_process_port is not None:
cmd.extend(["--master_port", str(args.main_process_port)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
setattr(args, "module", True)
cmd.append("--module")
elif args.no_python:
setattr(args, "no_python", True)
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
current_env = os.environ.copy()
gpu_ids = getattr(args, "gpu_ids")
if gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
mixed_precision = args.mixed_precision.lower()
try:
mixed_precision = PrecisionType(mixed_precision)
mixed_precision = PrecisionType(args.mixed_precision.lower())
except ValueError:
raise ValueError(f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}.")
raise ValueError(
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
)
if args.fp16:
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
@ -487,81 +444,66 @@ def multi_gpu_launcher(args):
if args.fsdp_state_dict_type is not None:
current_env["FSDP_STATE_DICT_TYPE"] = str(args.fsdp_state_dict_type)
current_env["OMP_NUM_THREADS"] = str(args.num_cpu_threads_per_process)
if is_torch_version("<", "1.9.0"):
raise NotImplementedError("Multi-node training requires pytorch>=1.9.0")
debug = getattr(args, "debug", False)
args = _filter_args(args)
with patch_environment(**current_env):
try:
distrib_run.run(args)
except:
if debug:
console = get_console()
console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
console.print_exception(suppress=[__file__], show_locals=False)
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def deepspeed_launcher(args):
if not is_deepspeed_available():
raise ImportError("DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source.")
num_processes = getattr(args, "num_processes")
num_machines = getattr(args, "num_machines")
main_process_ip = getattr(args, "main_process_ip")
main_process_port = getattr(args, "main_process_port")
if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
cmd = ["deepspeed", "--no_local_rank"]
cmd.extend(["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)])
if args.deepspeed_exclusion_filter is not None:
cmd = ["deepspeed", "--no_local_rank"]
if args.num_machines > 1:
if args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
cmd = get_launch_prefix()
cmd.extend(
[
"--exclude",
str(args.deepspeed_exclusion_filter),
]
)
elif args.deepspeed_inclusion_filter is not None:
cmd.extend(
[
"--include",
str(args.deepspeed_inclusion_filter),
"--nproc_per_node",
str(args.num_processes // args.num_machines),
"--nnodes",
str(args.num_machines),
"--node_rank",
str(args.machine_rank),
"--master_addr",
args.main_process_ip,
"--master_port",
str(args.main_process_port),
]
)
else:
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
cmd.append("--module")
elif args.no_python:
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
setattr(args, "nproc_per_node", str(num_processes // num_machines))
setattr(args, "nnodes", str(num_machines))
setattr(args, "node_rank", int(args.machine_rank))
if getattr(args, "same_network"):
setattr(args, "master_addr", str(main_process_ip))
setattr(args, "master_port", str(main_process_port))
else:
setattr(args, "rdzv_endpoint", f"{main_process_ip}:{main_process_port}")
cmd.extend(
["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)]
)
if args.deepspeed_exclusion_filter is not None:
cmd.extend(
[
"--exclude",
str(args.deepspeed_exclusion_filter),
]
)
elif args.deepspeed_inclusion_filter is not None:
cmd.extend(
[
"--include",
str(args.deepspeed_inclusion_filter),
]
)
else:
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
else:
setattr(args, "nproc_per_node", str(num_processes))
if main_process_port is not None:
setattr(args, "master_port", str(main_process_port))
cmd.extend(["--num_gpus", str(args.num_processes)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
setattr(args, "module", True)
cmd.append("--module")
elif args.no_python:
setattr(args, "no_python", True)
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
current_env = os.environ.copy()
gpu_ids = getattr(args, "gpu_ids")
if gpu_ids != "all":
current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
try:
mixed_precision = PrecisionType(args.mixed_precision.lower())
except ValueError:
@ -583,8 +525,7 @@ def deepspeed_launcher(args):
current_env["DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
current_env["DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
current_env["DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
if args.deepspeed_config_file is not None:
current_env["DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
current_env["DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file).lower()
if args.num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
with open(".deepspeed_env", "a") as f:
@ -593,24 +534,10 @@ def deepspeed_launcher(args):
continue
f.write(f"{key}={value}\n")
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
else:
if is_torch_version("<", "1.9.0"):
raise NotImplementedError("Multi-node training requires pytorch>=1.9.0")
debug = getattr(args, "debug", False)
args = _filter_args(args)
with patch_environment(**current_env):
try:
distrib_run.run(args)
except:
if debug:
console = get_console()
console.print("\n[bold red]Using --debug, `torch.distributed` Stack Trace:[/bold red]")
console.print_exception(suppress=[__file__], show_locals=False)
process = subprocess.Popen(cmd, env=current_env)
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
def tpu_launcher(args):
@ -830,14 +757,6 @@ def launch_command(args):
args.tpu = defaults.distributed_type == DistributedType.TPU
args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
args.use_mps_device = defaults.distributed_type == DistributedType.MPS
if not args.use_mps_device:
if args.gpu_ids is None:
if defaults.gpu_ids is not None:
args.gpu_ids = defaults.gpu_ids
else:
args.gpu_ids = "all"
if len(args.gpu_ids.split(",")) < 2 and args.multi_gpu and (args.gpu_ids != "all"):
args.multi_gpu = False
if defaults.compute_environment == ComputeEnvironment.LOCAL_MACHINE:
# Update args with the defaults
for name, attr in defaults.__dict__.items():
@ -865,8 +784,8 @@ def launch_command(args):
args.mixed_precision = defaults.mixed_precision
else:
if args.num_processes is None:
args.num_processes = torch.cuda.device_count() if args.multi_gpu else 1
warned.append("\t`--num_processes` was set to a value of `{args.num_processes}`")
warned.append("\t`--num_processes` was set to a value of `1`")
args.num_processes = 1
if args.num_machines is None:
warned.append("\t`--num_machines` was set to a value of `1`")
args.num_machines = 1
@ -875,6 +794,14 @@ def launch_command(args):
args.mixed_precision = "no"
if not hasattr(args, "use_cpu"):
args.use_cpu = args.cpu
if args.multi_gpu and args.num_processes == 1:
args.num_processes = torch.cuda.device_count()
if not any("--num_processes" in warn for warn in warned):
warned.append(f"\t`--num_processes` was set to `{args.num_processes}`")
else:
for i, warn in enumerate(warned):
if "--num_processes" in warn:
warned[i] = warn.replace("`1`", f"`{args.num_processes}`")
if args.num_cpu_threads_per_process is None:
local_size = get_int_from_env(
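
The launcher changes above revolve around two ideas: restricting visible GPUs through the child process environment and delegating process spawning to `torch.distributed`. A stripped-down, hedged sketch of the first idea (script name and GPU ids are placeholders):

```python
import os
import subprocess
import sys

gpu_ids = "0,2"                     # e.g. the value given for --gpu_ids
cmd = [sys.executable, "train.py"]  # placeholder training script

env = os.environ.copy()
if gpu_ids != "all":
    # Only these devices are visible to the launched training process.
    env["CUDA_VISIBLE_DEVICES"] = gpu_ids

process = subprocess.Popen(cmd, env=env)
process.wait()
if process.returncode != 0:
    raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
```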

View File

@ -75,11 +75,11 @@ _PYTORCH_DATALOADER_KWARGS = {
"timeout": 0,
"worker_init_fn": None,
"multiprocessing_context": None,
"generator": None,
}
# kwargs added after by version
_PYTORCH_DATALOADER_ADDITIONAL_KWARGS = {
"1.6.0": {"generator": None},
"1.7.0": {"prefetch_factor": 2, "persistent_workers": False},
}
@ -347,6 +347,7 @@ class DataLoaderShard(DataLoader):
try:
current_batch = next(dataloader_iter)
except StopIteration:
self.gradient_state._iterate_samples_seen(find_batch_size(current_batch))
yield
while True:
try:
@ -363,11 +364,10 @@ class DataLoaderShard(DataLoader):
@property
def total_batch_size(self):
batch_sampler = self.sampler if isinstance(self.sampler, BatchSampler) else self.batch_sampler
return (
batch_sampler.batch_size
if batch_sampler.split_batches
else (batch_sampler.batch_size * batch_sampler.num_processes)
self.batch_sampler.batch_size
if self.batch_sampler.split_batches
else (self.batch_sampler.batch_size * self.batch_sampler.num_processes)
)
@property
@ -412,7 +412,7 @@ class DataLoaderDispatcher(DataLoader):
self.split_batches = split_batches
if is_torch_version("<", "1.8.0"):
raise ImportError(
f"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
)
if shuffle:
torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)
@ -462,7 +462,11 @@ class DataLoaderDispatcher(DataLoader):
else:
batch_info = [None, True]
broadcast_object_list(batch_info)
return batch, batch_info
if batch_info[1]:
return batch, batch_info, True
else:
return batch, batch_info, True
return batch, batch_info, False
def __iter__(self):
self.gradient_state._set_end_of_dataloader(False)
@ -473,10 +477,11 @@ class DataLoaderDispatcher(DataLoader):
stop_iteration = False
self._stop_iteration = False
first_batch = None
next_batch, next_batch_info = self._fetch_batches(main_iterator)
next_batch, next_batch_info, next_skip = self._fetch_batches(main_iterator)
while not stop_iteration:
batch, batch_info = next_batch, next_batch_info
batch, batch_info, skip = next_batch, next_batch_info, next_skip
if skip:
continue
if self.state.process_index != 0:
# Initialize tensors on other processes than process 0.
batch = initialize_tensors(batch_info[0])
@ -495,7 +500,7 @@ class DataLoaderDispatcher(DataLoader):
if not stop_iteration:
# We may still be at the end of the dataloader without knowing it yet: if there is nothing left in
# the dataloader since the number of batches is a round multiple of the number of processes.
next_batch, next_batch_info = self._fetch_batches(main_iterator)
next_batch, next_batch_info, next_skip = self._fetch_batches(main_iterator)
# next_batch_info[0] is None when there are no more batches, otherwise we still need to process them.
if self._stop_iteration and next_batch_info[0] is None:
stop_iteration = True
@ -622,7 +627,6 @@ def prepare_data_loader(
new_dataset = dataloader.dataset
# Iterable dataset doesn't like batch_sampler, but data_loader creates a default one for it
new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
sampler_is_batch_sampler = False
generator = getattr(dataloader, "generator", None)
# No change if no multiprocess
if num_processes != 1 and not dispatch_batches:
@ -639,20 +643,15 @@ def prepare_data_loader(
)
else:
# New batch sampler for the current process.
sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
if sampler_is_batch_sampler:
sampler = dataloader.sampler.sampler
else:
sampler = dataloader.batch_sampler.sampler
if hasattr(sampler, "generator"):
if sampler.generator is None:
sampler.generator = torch.Generator()
generator = sampler.generator
generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
batch_sampler = dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
if hasattr(dataloader.sampler, "generator"):
if dataloader.sampler.generator is None:
dataloader.sampler.generator = torch.Generator()
generator = dataloader.sampler.generator
generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item()))
elif getattr(dataloader.batch_sampler, "generator", None) is not None:
generator = dataloader.batch_sampler.generator
new_batch_sampler = BatchSamplerShard(
batch_sampler,
dataloader.batch_sampler,
num_processes=num_processes,
process_index=process_index,
split_batches=split_batches,
@ -690,16 +689,6 @@ def prepare_data_loader(
_drop_last=dataloader.drop_last,
**kwargs,
)
elif sampler_is_batch_sampler:
dataloader = DataLoaderShard(
new_dataset,
device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
sampler=new_batch_sampler,
batch_size=getattr(dataloader, "batch_size", _PYTORCH_DATALOADER_KWARGS["batch_size"]),
rng_types=rng_types,
generator=generator,
**kwargs,
)
else:
dataloader = DataLoaderShard(
new_dataset,

View File

@ -71,7 +71,7 @@ class ModelHook:
def detach_hook(self, module):
"""
To be executed when the hook is detached from a module.
To be executed when the hook is deached from a module.
Args:
module (`torch.nn.Module`): The module detached from this hook.
@ -182,7 +182,7 @@ class AlignDevicesHook(ModelHook):
Args:
execution_device (`torch.device`, *optional*):
The device on which inputs and model weights should be placed before the forward pass.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass.
io_same_device (`bool`, *optional*, defaults to `False`):
Whether or not the output should be placed on the same device as the input was.
@ -319,7 +319,7 @@ def attach_align_device_hook(
The module where we want to attach the hooks.
execution_device (`torch.device`, *optional*):
The device on which inputs and model weights should be placed before the forward pass.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass.
weights_map (`Mapping[str, torch.Tensor]`, *optional*):
When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
@ -402,7 +402,7 @@ def attach_align_device_hook_on_blocks(
execution_device (`torch.device` or `Dict[str, torch.device]`, *optional*):
The device on which inputs and model weights should be placed before the forward pass. It can be one device
for the whole module, or a dictionary mapping module name to device.
offload (`bool`, *optional*, defaults to `False`):
offload (`bool`, *optional*, defauts to `False`):
Whether or not the weights should be offloaded after the forward pass. It can be one boolean for the whole
module, or a dictionary mapping module name to boolean.
weights_map (`Mapping[str, torch.Tensor]`, *optional*):

View File

@ -20,7 +20,7 @@ import warnings
import torch
from .state import AcceleratorState
from .utils import PrecisionType, PrepareForLaunch, patch_environment
from .utils import PrecisionType, PrepareForLaunch, is_torch_version, patch_environment
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mixed_precision="no", use_port="29500"):
@ -90,6 +90,12 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
if num_processes > 1:
# Multi-GPU launch
if is_torch_version("<", "1.5.0"):
raise ImportError(
"Using `notebook_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
f"{torch.__version__}."
)
from torch.multiprocessing import start_processes
if len(AcceleratorState._shared_state) > 0:
@ -121,17 +127,12 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
else:
# No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
use_mps_device = "false"
if torch.backends.mps.is_available():
print("Launching training on MPS.")
use_mps_device = "true"
elif torch.cuda.is_available():
# No need for a distributed launch otherwise as it's either CPU or one GPU.
if torch.cuda.is_available():
print("Launching training on one GPU.")
else:
print("Launching training on CPU.")
with patch_environment(use_mps_device=use_mps_device):
function(*args)
function(*args)
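Editor's note: the hunk above changes how the single-process path picks a device. A hedged sketch of that selection order (MPS if available, then one GPU, then CPU); the function name is made up and this is not the `notebook_launcher` code itself:

```python
import torch


def pick_single_process_device() -> torch.device:
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        print("Launching training on MPS.")
        return torch.device("mps")
    if torch.cuda.is_available():
        print("Launching training on one GPU.")
        return torch.device("cuda")
    print("Launching training on CPU.")
    return torch.device("cpu")


device = pick_single_process_device()
```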
def debug_launcher(function, args=(), num_processes=2):
@ -153,6 +154,12 @@ def debug_launcher(function, args=(), num_processes=2):
num_processes (`int`, *optional*, defaults to 2):
The number of processes to use for training.
"""
if is_torch_version("<", "1.5.0"):
raise ImportError(
"Using `debug_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
f"{torch.__version__}."
)
from torch.multiprocessing import start_processes
with tempfile.NamedTemporaryFile() as tmp_file:

View File

@ -49,20 +49,15 @@ def get_logger(name: str):
If a log should be called on all processes, pass `main_process_only=False`
E.g.
```python
logger.info("My log", main_process_only=False)
logger.debug("My log", main_process_only=False)
```
Args:
name (`str`):
The name for the logger, such as `__file__`
Example:
```python
>>> from accelerate.logging import get_logger
>>> logger = get_logger(__name__)
>>> logger.info("My log", main_process_only=False)
>>> logger.debug("My log", main_process_only=True)
```
"""
logger = logging.getLogger(name)
return MultiProcessAdapter(logger, {})
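Editor's note: a rough sketch of what `MultiProcessAdapter` adds on top of a standard logger: records are dropped unless the caller is the main process or explicitly passes `main_process_only=False`. The class below is a simplification written for this note, not the Accelerate implementation:

```python
import logging


class MainProcessOnlyAdapter(logging.LoggerAdapter):
    def __init__(self, logger, is_main_process: bool):
        super().__init__(logger, {})
        self.is_main_process = is_main_process

    def log(self, level, msg, *args, main_process_only=True, **kwargs):
        # Only emit on the main process unless the caller opts out.
        if self.is_main_process or not main_process_only:
            self.logger.log(level, msg, *args, **kwargs)


logging.basicConfig(level=logging.DEBUG)
logger = MainProcessOnlyAdapter(logging.getLogger(__name__), is_main_process=False)
logger.debug("My log", main_process_only=False)  # emitted on every process
logger.debug("Main-process-only log")            # dropped on non-main processes
```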

View File

@ -28,7 +28,7 @@ class AcceleratedScheduler:
to avoid making a scheduler step too fast when gradients overflowed and there was no training step (in mixed
precision training)
When performing gradient accumulation scheduler lengths should not be changed accordingly, Accelerate will always
When performing gradient accumulation scheduler lengths should not be changed accordingly, accelerate will always
step the scheduler to account for it.
Args:
@ -69,9 +69,7 @@ class AcceleratedScheduler:
num_processes = AcceleratorState().num_processes
for _ in range(num_processes):
# Special case when using OneCycle and `drop_last` was not used
if hasattr(self.scheduler, "total_steps") and self.scheduler._step_count <= self.scheduler.total_steps:
self.scheduler.step(*args, **kwargs)
else:
if getattr(self.scheduler, "total_steps", 0) <= self.scheduler.last_epoch:
self.scheduler.step(*args, **kwargs)
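Editor's note: the `step` logic above compensates for each process stepping once per optimizer step by stepping `num_processes` times, while the `total_steps` guard keeps `OneCycleLR` from being stepped past its configured end. A self-contained sketch of that guard; the pretend `num_processes` value is an assumption for the example:

```python
import torch

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, total_steps=4)
num_processes = 2  # pretend value for the sketch


def accelerated_scheduler_step(scheduler, num_processes):
    for _ in range(num_processes):
        if hasattr(scheduler, "total_steps"):
            # OneCycleLR raises once it has been stepped total_steps times.
            if scheduler._step_count <= scheduler.total_steps:
                scheduler.step()
        else:
            scheduler.step()


for _ in range(4):
    optimizer.step()
    accelerated_scheduler_step(scheduler, num_processes)
print(scheduler.get_last_lr())
```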
# Passthroughs

View File

@ -13,7 +13,6 @@
# limitations under the License.
import os
import warnings
from distutils.util import strtobool
import torch
@ -50,14 +49,14 @@ class AcceleratorState:
"""
Singleton class that has information about the current training environment.
**Available attributes:**
**Attributes:**
- **device** (`torch.device`) -- The device to use.
- **distributed_type** ([`~accelerate.state.DistributedType`]) -- The type of distributed environment currently
in use.
- **local_process_index** (`int`) -- The index of the current process on the current server.
- **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision, and if so the type
of mixed precision being performed.
- **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision. If you are using
mixed precision, define if you want to use FP16 or BF16 (bfloat16) as the floating point.
- **num_processes** (`int`) -- The number of processes currently launched in parallel.
- **process_index** (`int`) -- The index of the current process.
"""
@ -222,14 +221,6 @@ class AcceleratorState:
"and/or you do not have an MPS-enabled device on this machine."
)
else:
from .utils import is_torch_version
if not is_torch_version(">", "1.12.0"):
warnings.warn(
"We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing) on your MacOS machine. "
"It has major fixes related to model correctness and performance improvements for transformer based models. "
"Please refer to https://github.com/pytorch/pytorch/issues/82707 for more details."
)
self.device = torch.device("mps")
elif cpu or not torch.cuda.is_available():
self.device = torch.device("cpu")
@ -251,7 +242,7 @@ class AcceleratorState:
if self.distributed_type == DistributedType.DEEPSPEED:
repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n"
else:
repr += f"Mixed precision type: {mixed_precision}\n"
f"Mixed precision type: {mixed_precision}\n"
return repr
# For backward compatibility
@ -278,11 +269,10 @@ class GradientState:
"""
Singleton class that has information related to gradient synchronization for gradient accumulation
**Available attributes:**
**Attributes:**
- **end_of_dataloader** (`bool`) -- Whether we have reached the end of the current dataloader
- **remainder** (`int`) -- The number of extra samples that were added from padding the dataloader
- **sync_gradients** (`bool`) -- Whether the gradients should be synced across all devices
"""
_shared_state = {}
@ -311,5 +301,5 @@ class GradientState:
self.end_of_dataloader = end_of_dataloader
def _set_remainder(self, remainder):
"Private function that sets the number of remaining samples at the end of the dataloader. Users should not have to call this."
"Private function that sets the number of remaining samples at the end of the dataloader"
self.remainder = remainder
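Editor's note: `AcceleratorState` and `GradientState` both rely on the shared-state ("Borg") pattern: every instance points its `__dict__` at one class-level dictionary, so state set anywhere is visible everywhere. A minimal sketch with an illustrative class name:

```python
class GradientStateSketch:
    _shared_state = {}

    def __init__(self):
        self.__dict__ = self._shared_state
        if not getattr(self, "initialized", False):
            self.end_of_dataloader = False
            self.remainder = -1
            self.sync_gradients = True
            self.initialized = True


a = GradientStateSketch()
b = GradientStateSketch()
a.sync_gradients = False
assert b.sync_gradients is False  # both instances share the same state
```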

View File

@ -10,7 +10,6 @@ from .testing import (
require_huggingface_suite,
require_multi_gpu,
require_single_gpu,
require_torch_min_version,
require_tpu,
skip,
slow,

View File

@ -46,9 +46,10 @@ def rng_sync_check():
if state.distributed_type == DistributedType.MULTI_GPU:
synchronize_rng_states(["cuda"])
assert are_the_same_tensors(torch.cuda.get_rng_state()), "RNG states improperly synchronized on GPU."
generator = torch.Generator()
synchronize_rng_states(["generator"], generator=generator)
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
if is_torch_version(">=", "1.6.0"):
generator = torch.Generator()
synchronize_rng_states(["generator"], generator=generator)
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
if state.local_process_index == 0:
print("All rng are properly synched.")
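Editor's note: what the generator check above verifies is that every process ends up with an identical generator state. In a real run the state is broadcast from rank 0; the local sketch below just copies it between two generators as a stand-in and confirms they produce the same stream:

```python
import torch

g_rank0 = torch.Generator()
g_rank1 = torch.Generator()
g_rank0.manual_seed(1234)

# Stand-in for the broadcast: give "rank 1" the exact state of "rank 0".
g_rank1.set_state(g_rank0.get_state())

assert torch.equal(torch.rand(3, generator=g_rank0), torch.rand(3, generator=g_rank1))
print("Generator states are in sync.")
```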
@ -338,7 +339,7 @@ def main():
if state.local_process_index == 0:
print("\n**DataLoader integration test**")
dl_preparation_check()
if state.distributed_type != DistributedType.TPU and is_torch_version(">=", "1.8.0"):
if state.distributed_type != DistributedType.TPU:
central_dl_preparation_check()
# Trainings are not exactly the same in DeepSpeed and CPU mode

View File

@ -20,7 +20,6 @@ import sys
import tempfile
import unittest
from distutils.util import strtobool
from functools import partial
from pathlib import Path
from typing import List, Union
from unittest import mock
@ -133,16 +132,6 @@ def require_fsdp(test_case):
return unittest.skipUnless(is_torch_version(">=", "1.12.0"), "test requires torch version >= 1.12.0")(test_case)
def require_torch_min_version(test_case=None, version=None):
"""
Decorator marking that a test requires a particular torch version to be tested. These tests are skipped when an
installed torch version is less than the required one.
"""
if test_case is None:
return partial(require_torch_min_version, version=version)
return unittest.skipUnless(is_torch_version(">=", version), f"test requires torch version >= {version}")(test_case)
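Editor's note: for reference, a version-gated skip decorator along the lines of the removed `require_torch_min_version` can be sketched with the `packaging` library directly; the parameter name below is chosen for the example:

```python
import unittest
from functools import partial

import torch
from packaging import version


def require_torch_min_version(test_case=None, min_version=None):
    # Allows usage as @require_torch_min_version(min_version="1.9.0").
    if test_case is None:
        return partial(require_torch_min_version, min_version=min_version)
    satisfied = version.parse(torch.__version__) >= version.parse(min_version)
    return unittest.skipUnless(satisfied, f"test requires torch version >= {min_version}")(test_case)
```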
def require_tensorboard(test_case):
"""
Decorator marking a test that requires tensorboard installed. These tests are skipped when tensorboard isn't

View File

@ -16,14 +16,11 @@
# Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}
import os
import time
from abc import ABCMeta, abstractmethod, abstractproperty
from typing import List, Optional, Union
import yaml
from .logging import get_logger
from .utils import LoggerType, is_aim_available, is_comet_ml_available, is_tensorboard_available, is_wandb_available
from .utils import LoggerType, is_comet_ml_available, is_tensorboard_available, is_wandb_available
_available_trackers = []
@ -43,11 +40,6 @@ if is_comet_ml_available():
_available_trackers.append(LoggerType.COMETML)
if is_aim_available():
from aim import Run
_available_trackers.append(LoggerType.AIM)
logger = get_logger(__name__)
@ -139,8 +131,8 @@ class TensorBoardTracker(GeneralTracker):
self.run_name = run_name
self.logging_dir = os.path.join(logging_dir, run_name)
self.writer = tensorboard.SummaryWriter(self.logging_dir, **kwargs)
logger.debug(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
logger.debug(
logger.info(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -150,8 +142,7 @@ class TensorBoardTracker(GeneralTracker):
def store_init_configuration(self, values: dict):
"""
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment. Stores the
hyperparameters in a yaml file for future use.
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
Args:
values (Dictionary `str` to `bool`, `str`, `float` or `int`):
@ -160,16 +151,7 @@ class TensorBoardTracker(GeneralTracker):
"""
self.writer.add_hparams(values, metric_dict={})
self.writer.flush()
project_run_name = time.time()
dir_name = os.path.join(self.logging_dir, str(project_run_name))
os.makedirs(dir_name, exist_ok=True)
with open(os.path.join(dir_name, "hparams.yml"), "w") as outfile:
try:
yaml.dump(values, outfile)
except yaml.representer.RepresenterError:
logger.error("Serialization to store hyperparameters failed")
raise
logger.debug("Stored initial configuration hyperparameters to TensorBoard and hparams yaml file")
logger.info("Stored initial configuration hyperparameters to TensorBoard")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -193,14 +175,14 @@ class TensorBoardTracker(GeneralTracker):
elif isinstance(v, dict):
self.writer.add_scalars(k, v, global_step=step, **kwargs)
self.writer.flush()
logger.debug("Successfully logged to TensorBoard")
logger.info("Successfully logged to TensorBoard")
def finish(self):
"""
Closes `TensorBoard` writer
"""
self.writer.close()
logger.debug("TensorBoard writer closed")
logger.info("TensorBoard writer closed")
class WandBTracker(GeneralTracker):
@ -220,8 +202,8 @@ class WandBTracker(GeneralTracker):
def __init__(self, run_name: str, **kwargs):
self.run_name = run_name
self.run = wandb.init(project=self.run_name, **kwargs)
logger.debug(f"Initialized WandB project {self.run_name}")
logger.debug(
logger.info(f"Initialized WandB project {self.run_name}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -239,7 +221,7 @@ class WandBTracker(GeneralTracker):
`str`, `float`, `int`, or `None`.
"""
wandb.config.update(values)
logger.debug("Stored initial configuration hyperparameters to WandB")
logger.info("Stored initial configuration hyperparameters to WandB")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -255,14 +237,14 @@ class WandBTracker(GeneralTracker):
Additional key word arguments passed along to the `wandb.log` method.
"""
self.run.log(values, step=step, **kwargs)
logger.debug("Successfully logged to WandB")
logger.info("Successfully logged to WandB")
def finish(self):
"""
Closes `wandb` writer
"""
self.run.finish()
logger.debug("WandB run closed")
logger.info("WandB run closed")
class CometMLTracker(GeneralTracker):
@ -284,8 +266,8 @@ class CometMLTracker(GeneralTracker):
def __init__(self, run_name: str, **kwargs):
self.run_name = run_name
self.writer = Experiment(project_name=run_name, **kwargs)
logger.debug(f"Initialized CometML project {self.run_name}")
logger.debug(
logger.info(f"Initialized CometML project {self.run_name}")
logger.info(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@ -303,7 +285,7 @@ class CometMLTracker(GeneralTracker):
`str`, `float`, `int`, or `None`.
"""
self.writer.log_parameters(values)
logger.debug("Stored initial configuration hyperparameters to CometML")
logger.info("Stored initial configuration hyperparameters to CometML")
def log(self, values: dict, step: Optional[int] = None, **kwargs):
"""
@ -328,82 +310,17 @@ class CometMLTracker(GeneralTracker):
self.writer.log_other(k, v, **kwargs)
elif isinstance(v, dict):
self.writer.log_metrics(v, step=step, **kwargs)
logger.debug("Successfully logged to CometML")
logger.info("Successfully logged to CometML")
def finish(self):
"""
Closes `comet-ml` writer
"""
self.writer.end()
logger.debug("CometML run closed")
logger.info("CometML run closed")
class AimTracker(GeneralTracker):
"""
A `Tracker` class that supports `aim`. Should be initialized at the start of your script.
Args:
run_name (`str`):
The name of the experiment run.
kwargs:
Additional key word arguments passed along to the `Run.__init__` method.
"""
name = "aim"
requires_logging_directory = True
def __init__(self, run_name: str, logging_dir: Optional[Union[str, os.PathLike]] = ".", **kwargs):
self.run_name = run_name
self.writer = Run(repo=logging_dir, **kwargs)
self.writer.name = self.run_name
logger.debug(f"Initialized Aim project {self.run_name}")
logger.debug(
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
)
@property
def tracker(self):
return self.writer
def store_init_configuration(self, values: dict):
"""
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
Args:
values (`dict`):
Values to be stored as initial hyperparameters as key-value pairs.
"""
self.writer["hparams"] = values
def log(self, values: dict, step: Optional[int], **kwargs):
"""
Logs `values` to the current run.
Args:
values (`dict`):
Values to be logged as key-value pairs.
step (`int`, *optional*):
The run step. If included, the log will be affiliated with this step.
kwargs:
Additional key word arguments passed along to the `Run.track` method.
"""
# Note: replace this with the dictionary support when merged
for key, value in values.items():
self.writer.track(value, name=key, step=step, **kwargs)
def finish(self):
"""
Closes `aim` writer
"""
self.writer.close()
LOGGER_TYPE_TO_CLASS = {
"aim": AimTracker,
"comet_ml": CometMLTracker,
"tensorboard": TensorBoardTracker,
"wandb": WandBTracker,
}
LOGGER_TYPE_TO_CLASS = {"tensorboard": TensorBoardTracker, "wandb": WandBTracker, "comet_ml": CometMLTracker}
def filter_trackers(
@ -424,8 +341,8 @@ def filter_trackers(
- `"tensorboard"`
- `"wandb"`
- `"comet_ml"`
If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
If `"all`" is selected, will pick up all available trackers in the environment and intialize them. Can also
accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
logging_dir (`str`, `os.PathLike`, *optional*):
A path to a directory for storing logs of locally-compatible loggers.
"""
@ -454,6 +371,6 @@ def filter_trackers(
)
loggers.append(log_type)
else:
logger.debug(f"Tried adding logger {log_type}, but package is unavailable in the system.")
logger.info(f"Tried adding logger {log_type}, but package is unavailable in the system.")
return loggers
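Editor's note: the trackers above all expose the same small surface (`store_init_configuration`, `log`, `finish`). An illustrative file-based tracker with that surface, written only for this note; it is not part of Accelerate and its names are assumptions:

```python
import json
import os
from typing import Optional


class JSONLTracker:
    name = "jsonl"
    requires_logging_directory = True

    def __init__(self, run_name: str, logging_dir: str = "."):
        self.run_name = run_name
        os.makedirs(logging_dir, exist_ok=True)
        self._file = open(os.path.join(logging_dir, f"{run_name}.jsonl"), "a")

    def store_init_configuration(self, values: dict):
        self._file.write(json.dumps({"hparams": values}) + "\n")

    def log(self, values: dict, step: Optional[int] = None):
        self._file.write(json.dumps({"step": step, **values}) + "\n")

    def finish(self):
        self._file.close()


tracker = JSONLTracker("demo", logging_dir="logs")
tracker.store_init_configuration({"lr": 3e-4})
tracker.log({"loss": 0.42}, step=1)
tracker.finish()
```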

View File

@ -2,7 +2,7 @@
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all
from .constants import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME, TORCH_LAUNCH_PARAMS
from .constants import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME
from .dataclasses import (
ComputeEnvironment,
DeepSpeedPlugin,
@ -20,7 +20,6 @@ from .dataclasses import (
)
from .imports import (
get_ccl_version,
is_aim_available,
is_apex_available,
is_bf16_available,
is_boto3_available,
@ -28,7 +27,6 @@ from .imports import (
is_comet_ml_available,
is_datasets_available,
is_deepspeed_available,
is_rich_available,
is_sagemaker_available,
is_tensorboard_available,
is_tpu_available,
@ -93,7 +91,7 @@ if is_deepspeed_available():
HfDeepSpeedConfig,
)
from .launch import PrepareForLaunch, _filter_args, get_launch_prefix
from .launch import PrepareForLaunch, get_launch_prefix
from .memory import find_executable_batch_size
from .other import (
extract_model_from_parallel,

View File

@ -31,30 +31,3 @@ FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DI
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich"]
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
# These are the args for `torch.distributed.launch` for pytorch < 1.9
TORCH_LAUNCH_PARAMS = [
"nnodes",
"nproc_per_node",
"rdzv_backend",
"rdzv_endpoint",
"rdzv_id",
"rdzv_conf",
"standalone",
"max_restarts",
"monitor_interval",
"start_method",
"role",
"module",
"m",
"no_python",
"run_path",
"log_dir",
"r",
"redirects",
"t",
"tee",
"node_rank",
"master_addr",
"master_port",
]

View File

@ -60,8 +60,6 @@ class DistributedDataParallelKwargs(KwargsHandler):
`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
`static_graph` is only available in PyTorch 1.11.0 and later versions.
</Tip>"""
dim: int = 0
@ -70,7 +68,6 @@ class DistributedDataParallelKwargs(KwargsHandler):
find_unused_parameters: bool = False
check_reduction: bool = False
gradient_as_bucket_view: bool = False
static_graph: bool = False
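Editor's note: the fields above are forwarded to `torch.nn.parallel.DistributedDataParallel` through a kwargs handler. A typical usage sketch, assuming the documented `kwargs_handlers` argument of `Accelerator`:

```python
from accelerate import Accelerator, DistributedDataParallelKwargs

# Forward DDP-specific options to the wrapped model via a kwargs handler.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
```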
@dataclass
@ -196,7 +193,6 @@ class LoggerType(BaseEnum):
"""
ALL = "all"
AIM = "aim"
TENSORBOARD = "tensorboard"
WANDB = "wandb"
COMETML = "comet_ml"

View File

@ -14,7 +14,6 @@
import importlib
import sys
from functools import lru_cache
import torch
@ -51,7 +50,6 @@ def is_apex_available():
return importlib.util.find_spec("apex") is not None
@lru_cache()
def is_tpu_available(check_device=True):
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
if _tpu_available and check_device:
@ -95,10 +93,6 @@ def is_datasets_available():
return importlib.util.find_spec("datasets") is not None
def is_aim_available():
return importlib.util.find_spec("aim") is not None
def is_tensorboard_available():
return importlib.util.find_spec("tensorboard") is not None or importlib.util.find_spec("tensorboardX") is not None
@ -115,10 +109,6 @@ def is_boto3_available():
return importlib.util.find_spec("boto3") is not None
def is_rich_available():
return importlib.util.find_spec("rich") is not None
def is_sagemaker_available():
return importlib.util.find_spec("sagemaker") is not None
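Editor's note: all of these helpers follow one pattern: a soft dependency counts as available if `importlib` can locate it, without importing it. A generic sketch of that pattern:

```python
import importlib.util
from functools import lru_cache


@lru_cache()
def is_package_available(package_name: str) -> bool:
    # find_spec only looks the package up; it does not import it.
    return importlib.util.find_spec(package_name) is not None


print(is_package_available("torch"))
print(is_package_available("some_package_that_is_not_installed"))
```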

View File

@ -21,10 +21,6 @@ from ..utils import is_torch_version
from .dataclasses import DistributedType
if is_torch_version(">=", "1.9.0"):
import torch.distributed.run as distrib_run
def get_launch_prefix():
"""
Grabs the correct launcher for starting a distributed command, such as either `torchrun`, `python -m
@ -39,19 +35,6 @@ def get_launch_prefix():
return cmd
def _filter_args(args):
"""
Filters out all `accelerate` specific args
"""
distrib_args = distrib_run.get_args_parser()
new_args, _ = distrib_args.parse_known_args()
for key, value in vars(args).items():
if key in vars(new_args).keys():
setattr(new_args, key, value)
return new_args
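Editor's note: `_filter_args` above keeps only the options that `torch.distributed.run`'s own parser understands. The same idea in isolation, against a stand-in parser; the names below are invented for the example:

```python
import argparse


def filter_args_for(parser: argparse.ArgumentParser, source: argparse.Namespace) -> argparse.Namespace:
    # Start from the target parser's defaults, then copy over only the keys it knows.
    known, _ = parser.parse_known_args([])
    for key, value in vars(source).items():
        if key in vars(known):
            setattr(known, key, value)
    return known


torch_like_parser = argparse.ArgumentParser()
torch_like_parser.add_argument("--nproc_per_node", type=int, default=1)
accelerate_args = argparse.Namespace(nproc_per_node=4, mixed_precision="fp16")
print(filter_args_for(torch_like_parser, accelerate_args))  # mixed_precision is dropped
```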
class PrepareForLaunch:
"""
Prepare a function that will be launched in a distributed setup.

View File

@ -617,7 +617,7 @@ def load_checkpoint_in_model(
dtype (`str` or `torch.dtype`, *optional*):
If provided, the weights will be converted to that type when loaded.
offload_state_dict (`bool`, *optional*, defaults to `False`):
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
If `True`, will temporarily offload the CPU state dict on the hard drive to avoig getting out of CPU RAM if
the weight of the CPU state dict + the biggest shard does not fit.
"""
if offload_folder is None and device_map is not None and "disk" in device_map.values():

View File

@ -64,7 +64,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
state = AcceleratorState()
if state.distributed_type == DistributedType.TPU:
rng_state = xm.mesh_reduce("random_seed", rng_state, lambda x: x[0])
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU, DistributedType.FSDP]:
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
rng_state = rng_state.to(state.device)
torch.distributed.broadcast(rng_state, 0)
rng_state = rng_state.cpu()

View File

@ -1,24 +0,0 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .imports import is_rich_available
if is_rich_available():
from rich.traceback import install
install(show_locals=False)
else:
raise ModuleNotFoundError("To use the rich extension, install rich with `pip install rich`")
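Editor's note: the module above is the entire `rich` integration. A hedged variant of the same idea that degrades gracefully when `rich` is missing instead of raising, for illustration only:

```python
import importlib.util

# Enable pretty tracebacks only when rich is installed (sketch; the module
# shown above raises ModuleNotFoundError instead of falling back).
if importlib.util.find_spec("rich") is not None:
    from rich.traceback import install

    install(show_locals=False)
else:
    print("Install rich (`pip install rich`) to get pretty tracebacks.")
```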

View File

@ -35,6 +35,7 @@ from accelerate.test_utils.testing import (
require_cuda,
require_deepspeed,
require_multi_gpu,
skip,
slow,
)
from accelerate.test_utils.training import RegressionDataset
@ -696,6 +697,7 @@ class DeepSpeedIntegrationTest(TempDirTestCase):
with patch_environment(omp_num_threads=1):
execute_subprocess_async(cmd_stage, env=os.environ.copy())
@skip
def test_checkpointing(self):
self.test_file_path = os.path.join(self.test_scripts_folder, "test_checkpointing.py")
cmd = [

View File

@ -27,7 +27,7 @@ from accelerate.big_modeling import (
load_checkpoint_and_dispatch,
)
from accelerate.hooks import remove_hook_from_submodules
from accelerate.test_utils import require_cuda, require_multi_gpu, require_torch_min_version, slow
from accelerate.test_utils import require_cuda, require_multi_gpu, slow
from accelerate.utils import offload_state_dict
from transformers import AutoModelForCausalLM, AutoTokenizer
@ -79,7 +79,6 @@ class ModelWithUnusedSubModulesForTest(nn.Module):
return self.linear4(self.linear3(self.batchnorm(self.linear2(self.linear1(x)))))
@require_torch_min_version(version="1.9.0")
class BigModelingTester(unittest.TestCase):
def test_init_empty_weights(self):
# base use

View File

@ -36,7 +36,6 @@ EXCLUDE_EXAMPLES = [
"gradient_accumulation.py",
"multi_process_metrics.py",
"memory.py",
"automatic_gradient_accumulation.py",
"fsdp_with_peak_mem_tracking.py",
"deepspeed_with_config_support.py",
]

View File

@ -27,7 +27,7 @@ from accelerate.hooks import (
remove_hook_from_module,
remove_hook_from_submodules,
)
from accelerate.test_utils import require_multi_gpu, require_torch_min_version
from accelerate.test_utils import require_multi_gpu
class ModelForTest(nn.Module):
@ -51,7 +51,6 @@ class PostForwardHook(ModelHook):
return output + 1
@require_torch_min_version(version="1.9.0")
class HooksModelTester(unittest.TestCase):
def test_add_and_remove_hooks(self):
test_model = ModelForTest()

View File

@ -26,13 +26,11 @@ from accelerate.test_utils import (
require_huggingface_suite,
require_multi_gpu,
require_single_gpu,
require_torch_min_version,
)
from accelerate.utils import get_launch_prefix, patch_environment
@require_huggingface_suite
@require_torch_min_version(version="1.8.0")
class MetricTester(unittest.TestCase):
def setUp(self):
mod_file = inspect.getfile(accelerate.test_utils)

View File

@ -21,7 +21,6 @@ import torch
import torch.nn as nn
from accelerate.test_utils import require_cuda, require_multi_gpu
from accelerate.test_utils.testing import require_torch_min_version
from accelerate.utils.modeling import (
check_device_map,
clean_device_map,
@ -46,7 +45,6 @@ class ModelForTest(nn.Module):
return self.linear2(self.batchnorm(self.linear1(x)))
@require_torch_min_version(version="1.9.0")
class ModelingUtilsTester(unittest.TestCase):
def check_set_module_tensor_for_device(self, model, device1, device2):
self.assertEqual(model.linear1.weight.device, torch.device(device1))

View File

@ -21,30 +21,12 @@ from accelerate import Accelerator, debug_launcher
from accelerate.test_utils import require_cpu
def one_cycle_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
accelerator = Accelerator(step_scheduler_with_optimizer=step_scheduler_with_optimizer, split_batches=split_batches)
model = torch.nn.Linear(2, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=2, epochs=1)
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
# Optimizer has stepped
scheduler.step()
if step_scheduler_with_optimizer or (num_processes == 1):
assert (
scheduler.scheduler.last_epoch == num_processes
), f"Last Epoch ({scheduler.scheduler.last_epoch}) != Num Processes ({num_processes})"
else:
assert (
scheduler.scheduler.last_epoch != num_processes
), f"Last Epoch ({scheduler.scheduler.last_epoch}) == Num Processes ({num_processes})"
def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
def scheduler_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
accelerator = Accelerator(step_scheduler_with_optimizer=step_scheduler_with_optimizer, split_batches=split_batches)
model = torch.nn.Linear(2, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda n: 1 - n / 10)
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)
# Optimizer has stepped
@ -67,30 +49,16 @@ def lambda_test(num_processes=2, step_scheduler_with_optimizer=True, split_batch
@require_cpu
class SchedulerTester(unittest.TestCase):
def test_lambda_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(lambda_test, num_processes=1), num_processes=1)
debug_launcher(partial(lambda_test, num_processes=1, split_batches=True), num_processes=1)
def test_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(scheduler_test, num_processes=1), num_processes=1)
debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_steps_with_optimizer_single_process(self):
debug_launcher(partial(one_cycle_test, num_processes=1), num_processes=1)
debug_launcher(partial(one_cycle_test, num_processes=1, split_batches=True), num_processes=1)
def test_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(scheduler_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_lambda_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(lambda_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(scheduler_test)
debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_not_step_with_optimizer_single_process(self):
debug_launcher(partial(one_cycle_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)
def test_lambda_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(lambda_test)
debug_launcher(partial(lambda_test, num_processes=1, split_batches=True), num_processes=1)
def test_one_cycle_scheduler_steps_with_optimizer_multiprocess(self):
debug_launcher(one_cycle_test)
debug_launcher(partial(one_cycle_test, num_processes=1, split_batches=True), num_processes=1)
def test_lambda_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(lambda_test, step_scheduler_with_optimizer=False))
def test_one_cycle_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(one_cycle_test, step_scheduler_with_optimizer=False))
def test_scheduler_not_step_with_optimizer_multiprocess(self):
debug_launcher(partial(scheduler_test, step_scheduler_with_optimizer=False))

View File

@ -85,12 +85,12 @@ class WandBTrackingTest(TempDirTestCase, MockingTestCase):
self.add_mocks(mock.patch.dict(os.environ, {"WANDB_DIR": self.tmpdir}))
@staticmethod
def get_value_from_log(key: str, log: str, key_occurrence: int = 0):
def get_value_from_log(key: str, log: str, key_occurance: int = 0):
"""
Parses wandb log for `key` and returns the value.
If parsing through multiple calls to .log, pass in a `key_occurrence`
If parsing through multiple calls to .log, pass in a `key_occurance`
"""
res = re.findall(rf"(?<={key} )[^\s]+", log)[key_occurrence]
res = re.findall(rf"(?<={key} )[^\s]+", log)[key_occurance]
if '"' in res:
return re.findall(r'"([^"]*)"', res)[0]
else:

View File

@ -1,34 +0,0 @@
import json
from pathlib import Path
import subprocess
failed = []
passed = []
group_info = []
total_num_failed = 0
for log in Path().glob("*.log"):
section_num_failed = 0
with open(log, "r") as f:
for line in f:
line = json.loads(line)
if line.get("nodeid", "") != "":
test = line["nodeid"]
if line.get("duration", None) is not None:
duration = f'{line["duration"]:.4f}'
if line.get("outcome", "") == "failed":
section_num_failed += 1
failed.append([test, duration])
else:
passed.append([test, duration])
group_info.append([str(log), section_num_failed])
if len(failed) > 0:
result = "## Failed Tests:\n"
failed_table = '| Test Location | Test Class | Test Name |\n|---|---|---|\n| '
for test in failed:
failed_table += ' | '.join(test[0].split("::"))
failed_table += " |"
result += failed_table
print(result)
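Editor's note: the script above consumes `pytest-reportlog` output, one JSON object per line with `nodeid`, `outcome` and `duration` fields. A compact sketch of the same parsing loop, assuming `.log` files produced with `pytest --report-log`:

```python
import json
from pathlib import Path

failed, passed = [], []
for log in Path().glob("*.log"):
    with open(log) as f:
        for raw in f:
            record = json.loads(raw)
            nodeid = record.get("nodeid", "")
            if not nodeid:
                continue
            duration = f"{record.get('duration', 0.0):.4f}"
            (failed if record.get("outcome") == "failed" else passed).append((nodeid, duration))

if failed:
    print("## Failed Tests:")
    print("| Test Location | Test Class | Test Name |")
    print("|---|---|---|")
    for nodeid, _ in failed:
        print("| " + " | ".join(nodeid.split("::")) + " |")
```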

View File

@ -28,7 +28,7 @@ BLACK_AVOID_PATTERNS = {}
# Regexes
# Re pattern that catches list introduction (with potential indent)
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
# Re pattern that catches code block introduction (with potential indent)
# Re pattern that catches code block introduction (with potentinal indent)
_re_code = re.compile(r"^(\s*)```(.*)$")
# Re pattern that catches rst args blocks of the form `Parameters:`.
_re_args = re.compile("^\s*(Args?|Arguments?|Params?|Parameters?):\s*$")
@ -62,7 +62,7 @@ def parse_code_example(code_lines):
Args:
code_lines (`List[str]`): The code lines to parse.
max_len (`int`): The maximum length per line.
max_len (`int`): The maximum lengh per line.
Returns:
(List[`str`], List[`str`]): The list of code samples and the list of outputs.
@ -109,7 +109,7 @@ def format_code_example(code: str, max_len: int, in_docstring: bool = False):
Args:
code (`str`): The code example to format.
max_len (`int`): The maximum length per line.
max_len (`int`): The maximum lengh per line.
in_docstring (`bool`, *optional*, defaults to `False`): Whether or not the code example is inside a docstring.
Returns: