mirror of
https://github.com/huggingface/accelerate.git
synced 2025-11-15 06:48:56 +08:00
Compare commits
97 Commits
v0.4.0
...
v0.7-relea
| Author | SHA1 | Date | |
|---|---|---|---|
| 3eac8e7a66 | |||
| b3691db1d6 | |||
| c6657791d7 | |||
| a6e49ed045 | |||
| f0bb5f0ed5 | |||
| 11e8b33217 | |||
| 2622cc0f98 | |||
| 5f433673e1 | |||
| b028a1981d | |||
| 3e14dd16be | |||
| fa476d03ce | |||
| 53638352a0 | |||
| 5791d3dd6b | |||
| 2d7fbbdc73 | |||
| 461ac7d476 | |||
| 209db19dc8 | |||
| 381ae20027 | |||
| 8595834292 | |||
| fa2ec4ba16 | |||
| 1d95ebdaa4 | |||
| 38e6d941fa | |||
| 7eb5255694 | |||
| e72a125502 | |||
| e361dcc2a7 | |||
| e66ba31af2 | |||
| 2c554b056c | |||
| 5668270de7 | |||
| f03f18252f | |||
| 5b2e6edab2 | |||
| 1e0b96f814 | |||
| 5d83eed3d2 | |||
| 69ff072643 | |||
| 211e6555fa | |||
| a5b782b0a1 | |||
| 339d4e0372 | |||
| 3cfebcc93a | |||
| 4628652866 | |||
| 0e0ac26fdf | |||
| 2fcbc81d4b | |||
| 06df083041 | |||
| f7dc733685 | |||
| 00e80dcfff | |||
| a2e3e5ebec | |||
| 3ef1724b3f | |||
| f4bd8e3cc5 | |||
| 1e7ec4a5c2 | |||
| 986d5b93b7 | |||
| fb5ed62c10 | |||
| 6ffab178ac | |||
| 515fcca9ed | |||
| 2a5f4c6311 | |||
| 1e630cd3a7 | |||
| bbccd2c3fb | |||
| c7e9e10bad | |||
| 49658cdc20 | |||
| 503a9ffa7f | |||
| badfbced27 | |||
| 9e5e7f32d5 | |||
| d742bce525 | |||
| 4fc586f5af | |||
| 76b35124dd | |||
| a0995e1ccb | |||
| ace103ee63 | |||
| 29a09a8ddc | |||
| c607532f4f | |||
| 21f2c5bce4 | |||
| 18e5a56cbb | |||
| 31fa5e0ce3 | |||
| a5ea2a932c | |||
| d820a584d7 | |||
| 39a0b30a95 | |||
| 75421766d3 | |||
| 34a4e4ea15 | |||
| c5c73e0238 | |||
| 5343b4e8e2 | |||
| 120b82bfce | |||
| d0cc908438 | |||
| 19ec4a782c | |||
| 929e17d3b0 | |||
| d6247b7cc1 | |||
| 1b1463fe2c | |||
| 56d8760856 | |||
| e35baa5274 | |||
| 270e978159 | |||
| 67141ebef7 | |||
| 7ad23dc269 | |||
| a8de5bd93f | |||
| abbf844423 | |||
| e549cea65c | |||
| 37e4f036f0 | |||
| 3379d64dab | |||
| 545cddb528 | |||
| 50ac7483de | |||
| be3cc4d144 | |||
| eb8b342dd4 | |||
| 5d99345b78 | |||
| 49d1f04b4f |
40
.github/deploy_doc.sh
vendored
40
.github/deploy_doc.sh
vendored
@ -1,40 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
function deploy_doc(){
|
||||
echo "Creating doc at commit $1 and pushing to folder $2"
|
||||
git checkout $1
|
||||
cd "$GITHUB_WORKSPACE"
|
||||
pip install -U .
|
||||
cd "$GITHUB_WORKSPACE/docs"
|
||||
if [ ! -z "$2" ]
|
||||
then
|
||||
if [ "$2" == "main" ]; then
|
||||
echo "Pushing main"
|
||||
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $DOC_HOST:$DOC_PATH/$2/
|
||||
cp -r _build/html/_static .
|
||||
elif ssh -oStrictHostKeyChecking=no $DOC_HOST "[ -d $DOC_PATH/$2 ]"; then
|
||||
echo "Directory" $2 "already exists"
|
||||
scp -r -oStrictHostKeyChecking=no _static/* $DOC_HOST:$DOC_PATH/$2/_static/
|
||||
else
|
||||
echo "Pushing version" $2
|
||||
make clean && make html
|
||||
rm -rf _build/html/_static
|
||||
cp -r _static _build/html
|
||||
scp -r -oStrictHostKeyChecking=no _build/html $DOC_HOST:$DOC_PATH/$2
|
||||
fi
|
||||
else
|
||||
echo "Pushing stable"
|
||||
make clean && make html
|
||||
rm -rf _build/html/_static
|
||||
cp -r _static _build/html
|
||||
scp -r -oStrictHostKeyChecking=no _build/html/* $DOC_HOST:$DOC_PATH
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
# You can find the commit for each tag on https://github.com/huggingface/accelerate/tags
|
||||
deploy_doc "main" main
|
||||
deploy_doc "0fbbbc5" v0.1.0
|
||||
deploy_doc "499a5e5" v0.2.0
|
||||
deploy_doc "dd9f7aa" # v0.3.0 Latest stable release
|
||||
17
.github/workflows/build_documentation.yml
vendored
Normal file
17
.github/workflows/build_documentation.yml
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
name: Build documentation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- doc-builder*
|
||||
- v*-release
|
||||
|
||||
jobs:
|
||||
build:
|
||||
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
|
||||
with:
|
||||
commit_sha: ${{ github.sha }}
|
||||
package: accelerate
|
||||
secrets:
|
||||
token: ${{ secrets.HUGGINGFACE_PUSH }}
|
||||
16
.github/workflows/build_pr_documentation.yml
vendored
Normal file
16
.github/workflows/build_pr_documentation.yml
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
name: Build PR Documentation
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
|
||||
with:
|
||||
commit_sha: ${{ github.event.pull_request.head.sha }}
|
||||
pr_number: ${{ github.event.number }}
|
||||
package: accelerate
|
||||
13
.github/workflows/delete_doc_comment.yml
vendored
Normal file
13
.github/workflows/delete_doc_comment.yml
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
name: Delete dev documentation
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [ closed ]
|
||||
|
||||
|
||||
jobs:
|
||||
delete:
|
||||
uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
|
||||
with:
|
||||
pr_number: ${{ github.event.number }}
|
||||
package: accelerate
|
||||
37
.github/workflows/docs-deploy.yml
vendored
37
.github/workflows/docs-deploy.yml
vendored
@ -1,37 +0,0 @@
|
||||
name: Deploy Documentation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v1
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install SSH Key
|
||||
uses: shimataro/ssh-key-action@v2
|
||||
with:
|
||||
key: ${{ secrets.DOC_SSH_KEY }}
|
||||
name: id_rsa
|
||||
known_hosts: ${{ secrets.DOC_KNOWN_HOST }}
|
||||
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: 3.6
|
||||
|
||||
- name: Install Python dependencies
|
||||
working-directory: ./
|
||||
run: pip install -e .[docs]
|
||||
|
||||
- name: Deploy documentation
|
||||
env:
|
||||
DOC_HOST: ${{ secrets.DOC_HOST }}
|
||||
DOC_PATH: ${{ secrets.DOC_PATH }}
|
||||
run: ./.github/deploy_doc.sh
|
||||
17
.github/workflows/test.yml
vendored
17
.github/workflows/test.yml
vendored
@ -12,6 +12,19 @@ jobs:
|
||||
with:
|
||||
python-version: 3.6
|
||||
- name: Install Python dependencies
|
||||
run: pip install -e .[test]
|
||||
run: pip install setuptools==59.5.0; pip install -e .[test,test_trackers]
|
||||
- name: Run Tests
|
||||
run: make test
|
||||
run: make test
|
||||
|
||||
test_examples:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python 3.6
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.6
|
||||
- name: Install Python dependencies
|
||||
run: pip install setuptools==59.5.0; pip install -e .[test] tensorboard
|
||||
- name: Run Tests
|
||||
run: make test_examples
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@ -130,3 +130,9 @@ dmypy.json
|
||||
|
||||
# VSCode
|
||||
.vscode
|
||||
|
||||
# IntelliJ
|
||||
.idea
|
||||
|
||||
# Mac .DS_Store
|
||||
.DS_Store
|
||||
235
CONTRIBUTING.md
Normal file
235
CONTRIBUTING.md
Normal file
@ -0,0 +1,235 @@
|
||||
<!---
|
||||
Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# How to contribute to 🤗 Accelerate?
|
||||
|
||||
Everyone is welcome to contribute, and we value everybody's contribution. Code
|
||||
is thus not the only way to help the community. Answering questions, helping
|
||||
others, reaching out and improving the documentations are immensely valuable to
|
||||
the community.
|
||||
|
||||
It also helps us if you spread the word: reference the library from blog posts
|
||||
on the awesome projects it made possible, shout out on Twitter every time it has
|
||||
helped you, or simply star the repo to say "thank you".
|
||||
|
||||
Whichever way you choose to contribute, please be mindful to respect our
|
||||
[code of conduct](https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md).
|
||||
|
||||
## You can contribute in so many ways!
|
||||
|
||||
Some of the ways you can contribute to Accelerate:
|
||||
* Fixing outstanding issues with the existing code;
|
||||
* Contributing to the examples or to the documentation;
|
||||
* Submitting issues related to bugs or desired new features.
|
||||
|
||||
## Submitting a new issue or feature request
|
||||
|
||||
Do your best to follow these guidelines when submitting an issue or a feature
|
||||
request. It will make it easier for us to come back to you quickly and with good
|
||||
feedback.
|
||||
|
||||
### Did you find a bug?
|
||||
|
||||
The 🤗 Accelerate library is robust and reliable thanks to the users who notify us of
|
||||
the problems they encounter. So thank you for reporting an issue.
|
||||
|
||||
First, we would really appreciate it if you could **make sure the bug was not
|
||||
already reported** (use the search bar on Github under Issues).
|
||||
|
||||
Did not find it? :( So we can act quickly on it, please follow these steps:
|
||||
|
||||
* Include your **OS type and version**, the versions of **Python** and **PyTorch**.
|
||||
* A short, self-contained, code snippet that allows us to reproduce the bug in
|
||||
less than 30s;
|
||||
* Provide the with your Accelerate configuration (located by default in `~/.cache/huggingface/accelerate/default_config.yaml`)
|
||||
|
||||
### Do you want a new feature?
|
||||
|
||||
A good feature request addresses the following points:
|
||||
|
||||
1. Motivation first:
|
||||
* Is it related to a problem/frustration with the library? If so, please explain
|
||||
why. Providing a code snippet that demonstrates the problem is best.
|
||||
* Is it related to something you would need for a project? We'd love to hear
|
||||
about it!
|
||||
* Is it something you worked on and think could benefit the community?
|
||||
Awesome! Tell us what problem it solved for you.
|
||||
2. Write a *full paragraph* describing the feature;
|
||||
3. Provide a **code snippet** that demonstrates its future use;
|
||||
4. In case this is related to a paper, please attach a link;
|
||||
5. Attach any additional information (drawings, screenshots, etc.) you think may help.
|
||||
|
||||
If your issue is well written we're already 80% of the way there by the time you
|
||||
post it.
|
||||
|
||||
## Submitting a pull request (PR)
|
||||
|
||||
Before writing code, we strongly advise you to search through the existing PRs or
|
||||
issues to make sure that nobody is already working on the same thing. If you are
|
||||
unsure, it is always a good idea to open an issue to get some feedback.
|
||||
|
||||
You will need basic `git` proficiency to be able to contribute to
|
||||
🤗 Accelerate. `git` is not the easiest tool to use but it has the greatest
|
||||
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
|
||||
Git](https://git-scm.com/book/en/v2) is a very good reference.
|
||||
|
||||
Follow these steps to start contributing:
|
||||
|
||||
1. Fork the [repository](https://github.com/huggingface/accelerate) by
|
||||
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
|
||||
under your GitHub user account.
|
||||
|
||||
2. Clone your fork to your local disk, and add the base repository as a remote. The following command
|
||||
assumes you have your public SSH key uploaded to GitHub. See the following guide for more
|
||||
[information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
|
||||
|
||||
```bash
|
||||
$ git clone git@github.com:<your Github handle>/accelerate.git
|
||||
$ cd accelerate
|
||||
$ git remote add upstream https://github.com/huggingface/accelerate.git
|
||||
```
|
||||
|
||||
3. Create a new branch to hold your development changes, and do this for every new PR you work on.
|
||||
|
||||
Start by synchronizing your `main` branch with the `upstream/main` branch (ore details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):
|
||||
|
||||
```bash
|
||||
$ git checkout main
|
||||
$ git fetch upstream
|
||||
$ git merge upstream/main
|
||||
```
|
||||
|
||||
Once your `main` branch is synchronized, create a new branch from it:
|
||||
|
||||
```bash
|
||||
$ git checkout -b a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
**Do not** work on the `main` branch.
|
||||
|
||||
4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library:
|
||||
|
||||
```bash
|
||||
$ pip install -e ".[quality]"
|
||||
```
|
||||
|
||||
(If accelerate was already installed in the virtual environment, remove
|
||||
it with `pip uninstall accelerate` before reinstalling it in editable
|
||||
mode with the `-e` flag.)
|
||||
|
||||
5. Develop the features on your branch.
|
||||
|
||||
As you work on the features, you should make sure that the test suite
|
||||
passes. You should run the tests impacted by your changes like this (see
|
||||
below an explanation regarding the environment variable):
|
||||
|
||||
```bash
|
||||
$ pytest tests/<TEST_TO_RUN>.py
|
||||
```
|
||||
|
||||
> For the following commands leveraging the `make` utility, we recommend using the WSL system when running on
|
||||
> Windows. More information [here](https://docs.microsoft.com/en-us/windows/wsl/about).
|
||||
|
||||
You can also run the full suite with the following command.
|
||||
|
||||
```bash
|
||||
$ make test
|
||||
```
|
||||
|
||||
`accelerate` relies on `black` and `isort` to format its source code
|
||||
consistently. After you make changes, apply automatic style corrections and code verifications
|
||||
that can't be automated in one go with:
|
||||
|
||||
This target is also optimized to only work with files modified by the PR you're working on.
|
||||
|
||||
If you prefer to run the checks one after the other, the following command apply the
|
||||
style corrections:
|
||||
|
||||
```bash
|
||||
$ make style
|
||||
```
|
||||
|
||||
`accelerate` also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
$ make quality
|
||||
```
|
||||
|
||||
Once you're happy with your changes, add changed files using `git add` and
|
||||
make a commit with `git commit` to record your changes locally:
|
||||
|
||||
```bash
|
||||
$ git add modified_file.py
|
||||
$ git commit
|
||||
```
|
||||
|
||||
Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
|
||||
|
||||
It is a good idea to sync your copy of the code with the original
|
||||
repository regularly. This way you can quickly account for changes:
|
||||
|
||||
```bash
|
||||
$ git fetch upstream
|
||||
$ git rebase upstream/main
|
||||
```
|
||||
|
||||
Push the changes to your account using:
|
||||
|
||||
```bash
|
||||
$ git push -u origin a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
6. Once you are satisfied (**and the checklist below is happy too**), go to the
|
||||
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
|
||||
to the project maintainers for review.
|
||||
|
||||
7. It's ok if maintainers ask you for changes. It happens to core contributors
|
||||
too! So everyone can see the changes in the Pull request, work in your local
|
||||
branch and push the changes to your fork. They will automatically appear in
|
||||
the pull request.
|
||||
|
||||
|
||||
### Checklist
|
||||
|
||||
1. The title of your pull request should be a summary of its contribution;
|
||||
2. If your pull request addresses an issue, please mention the issue number in
|
||||
the pull request description to make sure they are linked (and people
|
||||
consulting the issue know you are working on it);
|
||||
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark
|
||||
the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
|
||||
it from PRs ready to be merged;
|
||||
4. Make sure existing tests pass;
|
||||
5. Add high-coverage tests. No quality testing = no merge.
|
||||
|
||||
See an example of a good PR here: https://github.com/huggingface/accelerate/pull/255
|
||||
|
||||
### Tests
|
||||
|
||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
|
||||
the [tests folder](https://github.com/huggingface/accelerate/tree/main/tests).
|
||||
|
||||
We use `pytest` in order to run the tests. From the root of the
|
||||
repository, here's how to run tests with `pytest` for the library:
|
||||
|
||||
```bash
|
||||
$ python -m pytest -sv ./tests
|
||||
```
|
||||
|
||||
In fact, that's how `make test` is implemented (sans the `pip install` line)!
|
||||
|
||||
You can specify a smaller set of tests in order to test only the feature
|
||||
you're working on.
|
||||
7
Makefile
7
Makefile
@ -25,8 +25,7 @@ style:
|
||||
|
||||
# Run tests for the library
|
||||
test:
|
||||
python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
python -m pytest -n auto --dist=loadfile -s -v ./tests/ --ignore=./tests/test_examples.py
|
||||
|
||||
# Check that docs can build
|
||||
docs:
|
||||
cd docs && make html SPHINXOPTS="-W"
|
||||
test_examples:
|
||||
python -m pytest -n auto --dist=loadfile -s -v ./tests/test_examples.py
|
||||
|
||||
18
README.md
18
README.md
@ -26,7 +26,7 @@ limitations under the License.
|
||||
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
|
||||
</a>
|
||||
-->
|
||||
<a href="https://github.com/huggingface/accelerate/blob/master/LICENSE">
|
||||
<a href="https://github.com/huggingface/accelerate/blob/main/LICENSE">
|
||||
<img alt="License" src="https://img.shields.io/github/license/huggingface/accelerate.svg?color=blue">
|
||||
</a>
|
||||
<a href="https://huggingface.co/docs/accelerate/index.html">
|
||||
@ -35,7 +35,7 @@ limitations under the License.
|
||||
<a href="https://github.com/huggingface/accelerate/releases">
|
||||
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/accelerate.svg">
|
||||
</a>
|
||||
<a href="https://github.com/huggingface/accelerate/blob/master/CODE_OF_CONDUCT.md">
|
||||
<a href="https://github.com/huggingface/accelerate/blob/main/CODE_OF_CONDUCT.md">
|
||||
<img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
|
||||
</a>
|
||||
</p>
|
||||
@ -155,6 +155,8 @@ For instance, here is how you would run the GLUE example on the MRPC task (from
|
||||
accelerate launch examples/nlp_example.py
|
||||
```
|
||||
|
||||
This CLI tool is **optional**, and you can still use `python my_script.py` or `python -m torch.distributed.launch my_script.py` at your convenance.
|
||||
|
||||
## Launching multi-CPU run using MPI
|
||||
|
||||
🤗 Here is another way to launch multi-CPU run using MPI. You can learn how to install Open MPI on [this page](https://www.open-mpi.org/faq/?category=building#easy-build). You can use Intel MPI or MVAPICH as well.
|
||||
@ -166,7 +168,7 @@ mpirun -np 2 python examples/nlp_example.py
|
||||
|
||||
## Launching training using DeepSpeed
|
||||
|
||||
🤗 Accelerate supports training on single/multiple GPUs using DeepSpeed. to use it, you don't need to change anything in your training code; you can set everything using just `accelerate config`. However, if you desire to tweak your DeepSpeed related args from your python script, we provide you the `DeepSpeedPlugin`.
|
||||
🤗 Accelerate supports training on single/multiple GPUs using DeepSpeed. To use it, you don't need to change anything in your training code; you can set everything using just `accelerate config`. However, if you desire to tweak your DeepSpeed related args from your python script, we provide you the `DeepSpeedPlugin`.
|
||||
|
||||
```python
|
||||
from accelerator import Accelerator, DeepSpeedPlugin
|
||||
@ -204,6 +206,16 @@ You should use 🤗 Accelerate when you want to easily run your training scripts
|
||||
|
||||
You shouldn't use 🤗 Accelerate if you don't want to write a training loop yourself. There are plenty of high-level libraries above PyTorch that will offer you that, 🤗 Accelerate is not one of them.
|
||||
|
||||
## Frameworks using 🤗 Accelerate
|
||||
|
||||
If you like the simplicity of 🤗 Accelerate but would prefer a higher-level abstraction around your training loop, some frameworks that are built on top of 🤗 Accelerate are listed below:
|
||||
|
||||
* [Animus](https://github.com/Scitator/animus) is a minimalistic framework to run machine learning experiments. Animus highlights common "breakpoints" in ML experiments and provides a unified interface for them within [IExperiment](https://github.com/Scitator/animus/blob/main/animus/core.py#L76).
|
||||
* [Catalyst](https://github.com/catalyst-team/catalyst#getting-started) is a PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop. Catalyst provides a [Runner](https://catalyst-team.github.io/catalyst/api/core.html#runner) to connect all parts of the experiment: hardware backend, data transformations, model train, and inference logic.
|
||||
* [Kornia](https://kornia.readthedocs.io/en/latest/get-started/introduction.html) is a differentiable library that allows classical computer vision to be integrated into deep learning models. Kornia provides a [Trainer](https://kornia.readthedocs.io/en/latest/x.html#kornia.x.Trainer) with the specific purpose to train and fine-tune the supported deep learning algorithms within the library.
|
||||
* [pytorch-accelerated](https://github.com/Chris-hughes10/pytorch-accelerated) is a lightweight training library, with a streamlined feature set centred around a general-purpose [Trainer](https://pytorch-accelerated.readthedocs.io/en/latest/trainer.html), that places a huge emphasis on simplicity and transparency; enabling users to understand exactly what is going on under the hood, but without having to write and maintain the boilerplate themselves!
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
This repository is tested on Python 3.6+ and PyTorch 1.4.0+
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,16 +0,0 @@
|
||||
|
||||
.highlight .c1, .highlight .sd{
|
||||
color: #999
|
||||
}
|
||||
|
||||
.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
|
||||
color: #FB8D68;
|
||||
}
|
||||
|
||||
.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
|
||||
color: #6670FF;
|
||||
}
|
||||
|
||||
.highlight .gp {
|
||||
color: #FB8D68;
|
||||
}
|
||||
@ -1,350 +0,0 @@
|
||||
/* Our DOM objects */
|
||||
|
||||
/* Colab dropdown */
|
||||
|
||||
table.center-aligned-table td {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
table.center-aligned-table th {
|
||||
text-align: center;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
.colab-dropdown {
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.colab-dropdown-content {
|
||||
display: none;
|
||||
position: absolute;
|
||||
background-color: #f9f9f9;
|
||||
min-width: 117px;
|
||||
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||
z-index: 1;
|
||||
}
|
||||
|
||||
.colab-dropdown-content button {
|
||||
color: #6670FF;
|
||||
background-color: #f9f9f9;
|
||||
font-size: 12px;
|
||||
border: none;
|
||||
min-width: 117px;
|
||||
padding: 5px 5px;
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.colab-dropdown-content button:hover {background-color: #eee;}
|
||||
|
||||
.colab-dropdown:hover .colab-dropdown-content {display: block;}
|
||||
|
||||
/* Version control */
|
||||
|
||||
.version-button {
|
||||
background-color: #6670FF;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 5px;
|
||||
font-size: 15px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.version-button:hover, .version-button:focus {
|
||||
background-color: #A6B0FF;
|
||||
}
|
||||
|
||||
.version-dropdown {
|
||||
display: none;
|
||||
background-color: #6670FF;
|
||||
min-width: 160px;
|
||||
overflow: auto;
|
||||
font-size: 15px;
|
||||
}
|
||||
|
||||
.version-dropdown a {
|
||||
color: white;
|
||||
padding: 3px 4px;
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.version-dropdown a:hover {
|
||||
background-color: #A6B0FF;
|
||||
}
|
||||
|
||||
.version-show {
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Framework selector */
|
||||
|
||||
.framework-selector {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
justify-content: flex-end;
|
||||
margin-right: 30px;
|
||||
}
|
||||
|
||||
.framework-selector > button {
|
||||
background-color: white;
|
||||
color: #6670FF;
|
||||
border: 1px solid #6670FF;
|
||||
padding: 5px;
|
||||
}
|
||||
|
||||
.framework-selector > button.selected{
|
||||
background-color: #6670FF;
|
||||
color: white;
|
||||
border: 1px solid #6670FF;
|
||||
padding: 5px;
|
||||
}
|
||||
|
||||
/* Copy button */
|
||||
|
||||
a.copybtn {
|
||||
margin: 3px;
|
||||
}
|
||||
|
||||
/* The literal code blocks */
|
||||
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
|
||||
color: #6670FF;
|
||||
}
|
||||
|
||||
/* To keep the logo centered */
|
||||
.wy-side-scroll {
|
||||
width: auto;
|
||||
font-size: 20px;
|
||||
}
|
||||
|
||||
/* The div that holds the Hugging Face logo */
|
||||
.HuggingFaceDiv {
|
||||
width: 100%
|
||||
}
|
||||
|
||||
/* The research field on top of the toc tree */
|
||||
.wy-side-nav-search{
|
||||
padding-top: 0;
|
||||
background-color: #6670FF;
|
||||
}
|
||||
|
||||
/* The toc tree */
|
||||
.wy-nav-side{
|
||||
background-color: #6670FF;
|
||||
}
|
||||
|
||||
/* The section headers in the toc tree */
|
||||
.wy-menu-vertical p.caption{
|
||||
background-color: #4d59ff;
|
||||
line-height: 40px;
|
||||
}
|
||||
|
||||
/* The selected items in the toc tree */
|
||||
.wy-menu-vertical li.current{
|
||||
background-color: #A6B0FF;
|
||||
}
|
||||
|
||||
/* When a list item that does belong to the selected block from the toc tree is hovered */
|
||||
.wy-menu-vertical li.current a:hover{
|
||||
background-color: #B6C0FF;
|
||||
}
|
||||
|
||||
/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
|
||||
.wy-menu-vertical li a:hover{
|
||||
background-color: #A7AFFB;
|
||||
}
|
||||
|
||||
/* The text items on the toc tree */
|
||||
.wy-menu-vertical a {
|
||||
color: #FFFFDD;
|
||||
font-family: Calibre-Light, sans-serif;
|
||||
}
|
||||
.wy-menu-vertical header, .wy-menu-vertical p.caption{
|
||||
color: white;
|
||||
font-family: Calibre-Light, sans-serif;
|
||||
}
|
||||
|
||||
/* The color inside the selected toc tree block */
|
||||
.wy-menu-vertical li.toctree-l2 a, .wy-menu-vertical li.toctree-l3 a, .wy-menu-vertical li.toctree-l4 a {
|
||||
color: black;
|
||||
}
|
||||
|
||||
/* Inside the depth-2 selected toc tree block */
|
||||
.wy-menu-vertical li.toctree-l2.current>a {
|
||||
background-color: #B6C0FF
|
||||
}
|
||||
.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {
|
||||
background-color: #C6D0FF
|
||||
}
|
||||
|
||||
/* Inside the depth-3 selected toc tree block */
|
||||
.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{
|
||||
background-color: #D6E0FF
|
||||
}
|
||||
|
||||
/* Inside code snippets */
|
||||
.rst-content dl:not(.docutils) dt{
|
||||
font-size: 15px;
|
||||
}
|
||||
|
||||
/* Links */
|
||||
a {
|
||||
color: #6670FF;
|
||||
}
|
||||
|
||||
/* Content bars */
|
||||
.rst-content dl:not(.docutils) dt {
|
||||
background-color: rgba(251, 141, 104, 0.1);
|
||||
border-right: solid 2px #FB8D68;
|
||||
border-left: solid 2px #FB8D68;
|
||||
color: #FB8D68;
|
||||
font-family: Calibre-Light, sans-serif;
|
||||
border-top: none;
|
||||
font-style: normal !important;
|
||||
}
|
||||
|
||||
/* Expand button */
|
||||
.wy-menu-vertical li.toctree-l2 span.toctree-expand,
|
||||
.wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li.current>a span.toctree-expand,
|
||||
.wy-menu-vertical li.toctree-l3 span.toctree-expand{
|
||||
color: black;
|
||||
}
|
||||
|
||||
/* Max window size */
|
||||
.wy-nav-content{
|
||||
max-width: 1200px;
|
||||
}
|
||||
|
||||
/* Mobile header */
|
||||
.wy-nav-top{
|
||||
background-color: #6670FF;
|
||||
}
|
||||
|
||||
|
||||
/* Source spans */
|
||||
.rst-content .viewcode-link, .rst-content .viewcode-back{
|
||||
color: #6670FF;
|
||||
font-size: 110%;
|
||||
letter-spacing: 2px;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
/* It would be better for table to be visible without horizontal scrolling */
|
||||
.wy-table-responsive table td, .wy-table-responsive table th{
|
||||
white-space: normal;
|
||||
}
|
||||
|
||||
.footer {
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.footer__Social {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
}
|
||||
|
||||
.footer__CustomImage {
|
||||
margin: 2px 5px 0 0;
|
||||
}
|
||||
|
||||
/* class and method names in doc */
|
||||
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
|
||||
font-family: Calibre, sans-serif;
|
||||
font-size: 20px !important;
|
||||
}
|
||||
|
||||
/* class name in doc*/
|
||||
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
|
||||
margin-right: 10px;
|
||||
font-family: Calibre-Medium, sans-serif;
|
||||
}
|
||||
|
||||
/* Method and class parameters */
|
||||
.sig-param{
|
||||
line-height: 23px;
|
||||
}
|
||||
|
||||
/* Class introduction "class" string at beginning */
|
||||
.rst-content dl:not(.docutils) .property{
|
||||
font-size: 18px;
|
||||
color: black;
|
||||
}
|
||||
|
||||
|
||||
/* FONTS */
|
||||
body{
|
||||
font-family: Calibre, sans-serif;
|
||||
font-size: 16px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
font-family: Calibre-Thin, sans-serif;
|
||||
font-size: 70px;
|
||||
}
|
||||
|
||||
h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
|
||||
font-family: Calibre-Medium, sans-serif;
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: Calibre-Medium;
|
||||
src: url(./Calibre-Medium.otf);
|
||||
font-weight:400;
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: Calibre;
|
||||
src: url(./Calibre-Regular.otf);
|
||||
font-weight:400;
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: Calibre-Light;
|
||||
src: url(./Calibre-Light.ttf);
|
||||
font-weight:400;
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: Calibre-Thin;
|
||||
src: url(./Calibre-Thin.otf);
|
||||
font-weight:400;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Nav Links to other parts of huggingface.co
|
||||
*/
|
||||
div.menu {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
right: 0;
|
||||
padding-top: 20px;
|
||||
padding-right: 20px;
|
||||
z-index: 1000;
|
||||
}
|
||||
div.menu a {
|
||||
font-size: 14px;
|
||||
letter-spacing: 0.3px;
|
||||
text-transform: uppercase;
|
||||
color: white;
|
||||
-webkit-font-smoothing: antialiased;
|
||||
background: linear-gradient(0deg, #6671ffb8, #9a66ffb8 50%);
|
||||
padding: 10px 16px 6px 16px;
|
||||
border-radius: 3px;
|
||||
margin-left: 12px;
|
||||
position: relative;
|
||||
}
|
||||
div.menu a:active {
|
||||
top: 1px;
|
||||
}
|
||||
@media (min-width: 768px) and (max-width: 1750px) {
|
||||
.wy-breadcrumbs {
|
||||
margin-top: 32px;
|
||||
}
|
||||
}
|
||||
@media (max-width: 768px) {
|
||||
div.menu {
|
||||
display: none;
|
||||
}
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 7.6 KiB |
30
docs/source/_toctree.yml
Normal file
30
docs/source/_toctree.yml
Normal file
@ -0,0 +1,30 @@
|
||||
- sections:
|
||||
- local: index
|
||||
title: 🤗 Accelerate
|
||||
- local: quicktour
|
||||
title: Quick tour
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Get started
|
||||
- sections:
|
||||
- local: sagemaker
|
||||
title: Amazon SageMaker
|
||||
title: Guides
|
||||
- sections:
|
||||
- local: accelerator
|
||||
title: Accelerator
|
||||
- local: launcher
|
||||
title: Notebook Launcher
|
||||
- local: kwargs
|
||||
title: Kwargs Handlers
|
||||
- local: internal
|
||||
title: Internals
|
||||
- local: checkpoint
|
||||
title: Checkpointing
|
||||
- local: tracking
|
||||
title: Experiment Tracking
|
||||
- local: fsdp
|
||||
title: Fully Sharded Data Parallel
|
||||
- local: memory
|
||||
title: Memory Utilities
|
||||
title: API Reference
|
||||
41
docs/source/accelerator.mdx
Normal file
41
docs/source/accelerator.mdx
Normal file
@ -0,0 +1,41 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Accelerator
|
||||
|
||||
The [`Accelerator`] is the main class provided by 🤗 Accelerate. It serves at the main entrypoint for
|
||||
the API. To quickly adapt your script to work on any kind of setup with 🤗 Accelerate juste:
|
||||
|
||||
1. Initialize an [`Accelerator`] object (that we will call `accelerator` in the rest of this
|
||||
page) as early as possible in your script.
|
||||
2. Pass along your model(s), optimizer(s), dataloader(s) to the [`~Accelerator.prepare`] method.
|
||||
3. (Optional but best practice) Remove all the `.cuda()` or `.to(device)` in your code and let the
|
||||
`accelerator` handle device placement for you.
|
||||
4. Replace the `loss.backward()` in your code by `accelerator.backward(loss)`.
|
||||
5. (Optional, when using distributed evaluation) Gather your predictions and labelsbefore storing them or using them
|
||||
for metric computation using [`~Accelerator.gather`].
|
||||
|
||||
This is all what is needed in most cases. For more advanced case or a nicer experience here are the functions you
|
||||
should search for and replace by the corresponding methods of your `accelerator`:
|
||||
|
||||
- `print` statements should be replaced by [`~Accelerator.print`] to be only printed once per
|
||||
process.
|
||||
- Use [`~Accelerator.is_local_main_process`] for statements that should be executed once per server.
|
||||
- Use [`~Accelerator.is_main_process`] for statements that should be executed once only.
|
||||
- Use [`~Accelerator.wait_for_everyone`] to make sure all processes join that point before continuing
|
||||
(useful before a model save for instance).
|
||||
- Use [`~Accelerator.unwrap_model`] to unwrap your model before saving it.
|
||||
- Use [`~Accelerator.save`] instead of `torch.save`.
|
||||
- Use [`~Accelerator.clip_grad_norm_`] instead of `torch.nn.utils.clip_grad_norm_` and
|
||||
[`~Accelerator.clip_grad_value_`] instead of `torch.nn.utils.clip_grad_value_`.
|
||||
|
||||
[[autodoc]] Accelerator
|
||||
@ -1,43 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
Accelerator
|
||||
=======================================================================================================================
|
||||
|
||||
The :class:`~accelerate.Accelerator` is the main class provided by 🤗 Accelerate. It serves at the main entrypoint for
|
||||
the API. To quickly adapt your script to work on any kind of setup with 🤗 Accelerate juste:
|
||||
|
||||
1. Initialize an :class:`~accelerate.Accelerator` object (that we will call :obj:`accelerator` in the rest of this
|
||||
page) as early as possible in your script.
|
||||
2. Pass along your model(s), optimizer(s), dataloader(s) to the :meth:`~accelerate.Accelerator.prepare` method.
|
||||
3. (Optional but best practice) Remove all the :obj:`.cuda()` or :obj:`.to(device)` in your code and let the
|
||||
:obj:`accelerator` handle device placement for you.
|
||||
4. Replace the :obj:`loss.backward()` in your code by :obj:`accelerator.backward(loss)`.
|
||||
5. (Optional, when using distributed evaluation) Gather your predictions and labelsbefore storing them or using them
|
||||
for metric computation using :meth:`~accelerate.Accelerator.gather`.
|
||||
|
||||
This is all what is needed in most cases. For more advanced case or a nicer experience here are the functions you
|
||||
should search for and replace by the corresponding methods of your :obj:`accelerator`:
|
||||
|
||||
- :obj:`print` statements should be replaced by :meth:`~accelerate.Accelerator.print` to be only printed once per
|
||||
process.
|
||||
- Use :meth:`~accelerate.Accelerator.is_local_main_process` for statements that should be executed once per server.
|
||||
- Use :meth:`~accelerate.Accelerator.is_main_process` for statements that should be executed once only.
|
||||
- Use :meth:`~accelerate.Accelerator.wait_for_everyone` to make sure all processes join that point before continuing
|
||||
(useful before a model save for instance).
|
||||
- Use :meth:`~accelerate.Accelerator.unwrap_model` to unwrap your model before saving it.
|
||||
- Use :meth:`~accelerate.Accelerator.save` instead of :obj:`torch.save`.
|
||||
- Use :meth:`~accelerate.Accelerator.clip_grad_norm_` instead of :obj:`torch.nn.utils.clip_grad_norm_` and
|
||||
:meth:`~accelerate.Accelerator.clip_grad_value_` instead of :obj:`torch.nn.utils.clip_grad_value_`.
|
||||
|
||||
.. autoclass:: accelerate.Accelerator
|
||||
:members:
|
||||
60
docs/source/checkpoint.mdx
Normal file
60
docs/source/checkpoint.mdx
Normal file
@ -0,0 +1,60 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Checkpointing
|
||||
|
||||
When training a PyTorch model with Accelerate, you may often want to save and continue a state of training. Doing so requires
|
||||
saving and loading the model, optimizer, RNG generators, and the GradScaler. Inside Accelerate are two convience functions to achieve this quickly:
|
||||
- Use [`~Accelerator.save_state`] for saving everything mentioned above to a folder location
|
||||
- Use [`~Accelerator.load_state`] for loading everything stored from an earlier `save_state`
|
||||
|
||||
It should be noted that the expectation is that those states come from the same training script, they should not be from two separate scripts.
|
||||
|
||||
- By using [`~Accelerator.register_for_checkpointing`], you can register custom objects to be automatically stored or loaded from the two prior functions,
|
||||
so long as the object has a `state_dict` **and** a `load_state_dict` functionality. This could include objects such as a learning rate scheduler.
|
||||
|
||||
Below is a brief example using checkpointing to save and reload a state during training:
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
import torch
|
||||
|
||||
accelerator = Accelerator()
|
||||
|
||||
my_scheduler = torch.optim.lr_scheduler.StepLR(my_optimizer, step_size=1, gamma=0.99)
|
||||
my_model, my_optimizer, my_training_dataloader = accelerate.prepare(my_model, my_optimizer, my_training_dataloader)
|
||||
|
||||
# Register the LR scheduler
|
||||
accelerate.register_for_checkpointing(my_scheduler)
|
||||
|
||||
# Save the starting state
|
||||
accelerate.save_state("my/save/path")
|
||||
|
||||
device = accelerator.device
|
||||
my_model.to(device)
|
||||
|
||||
# Perform training
|
||||
for epoch in range(num_epochs):
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
my_optimizer.step()
|
||||
my_scheduler.step()
|
||||
|
||||
# Restore previous state
|
||||
accelerate.load_state("my/save/path")
|
||||
```
|
||||
@ -1,210 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file does only contain a selection of the most common options. For a
|
||||
# full list see the documentation:
|
||||
# http://www.sphinx-doc.org/en/master/config
|
||||
|
||||
# -- Path setup --------------------------------------------------------------
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath("../../src"))
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = "accelerate"
|
||||
copyright = "2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0"
|
||||
author = "huggingface"
|
||||
|
||||
# The short X.Y version
|
||||
version = "0.4.0.dev0"
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
#
|
||||
# needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.extlinks",
|
||||
"sphinx.ext.coverage",
|
||||
"sphinx.ext.napoleon",
|
||||
"recommonmark",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx_markdown_tables",
|
||||
"sphinx_copybutton",
|
||||
"sphinxext.opengraph",
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The suffix(es) of source filenames.
|
||||
# You can specify multiple suffix as a list of string:
|
||||
#
|
||||
source_suffix = [".rst", ".md"]
|
||||
# source_suffix = '.rst'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = "index"
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#
|
||||
# This is also used if you do content translation via gettext catalogs.
|
||||
# Usually you set "language" from the command line for these cases.
|
||||
language = None
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = None
|
||||
|
||||
# Remove the prompt when copying examples
|
||||
copybutton_prompt_text = r">>> |\.\.\. "
|
||||
copybutton_prompt_is_regexp = True
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#
|
||||
html_theme_options = {"analytics_id": "UA-83738774-2"}
|
||||
|
||||
# Configuration for OpenGraph and Twitter Card Tags.
|
||||
# These are responsible for creating nice shareable social images https://ahrefs.com/blog/open-graph-meta-tags/
|
||||
# https://ogp.me/#type_website
|
||||
ogp_image = "https://huggingface.co/front/thumbnails/docs/accelerate.png"
|
||||
ogp_description = "Run your raw PyTorch training script on any kind of device. 🤗 Accelerate provides an easy API to make your scripts run with mixed precision and on any kind of distributed setting (multi-GPUs, TPUs etc.)"
|
||||
ogp_description_length = 160
|
||||
|
||||
ogp_custom_meta_tags = [
|
||||
f'<meta name="twitter:image" content="{ogp_image}">',
|
||||
f'<meta name="twitter:description" content="{ogp_description}">',
|
||||
]
|
||||
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# Custom sidebar templates, must be a dictionary that maps document names
|
||||
# to template names.
|
||||
#
|
||||
# The default sidebars (for documents that don't match any pattern) are
|
||||
# defined by theme itself. Builtin themes are using these templates by
|
||||
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
|
||||
# 'searchbox.html']``.
|
||||
#
|
||||
# html_sidebars = {}
|
||||
|
||||
# This must be the name of an image file (path relative to the configuration
|
||||
# directory) that is the favicon of the docs. Modern browsers use this as
|
||||
# the icon for tabs, windows and bookmarks. It should be a Windows-style
|
||||
# icon file (.ico).
|
||||
html_favicon = "favicon.ico"
|
||||
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = "acceleratedoc"
|
||||
|
||||
|
||||
# -- Options for LaTeX output ------------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#
|
||||
# 'papersize': 'letterpaper',
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#
|
||||
# 'pointsize': '10pt',
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#
|
||||
# 'preamble': '',
|
||||
# Latex figure (float) alignment
|
||||
#
|
||||
# 'figure_align': 'htbp',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
(master_doc, "accelerate.tex", "accelerate Documentation", "huggingface", "manual"),
|
||||
]
|
||||
|
||||
|
||||
# -- Options for manual page output ------------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [(master_doc, "accelerate", "accelerate Documentation", [author], 1)]
|
||||
|
||||
|
||||
# -- Options for Texinfo output ----------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
(
|
||||
master_doc,
|
||||
"accelerate",
|
||||
"accelerate Documentation",
|
||||
author,
|
||||
"accelerate",
|
||||
"One line description of project.",
|
||||
"Miscellaneous",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# -- Options for Epub output -------------------------------------------------
|
||||
|
||||
# Bibliographic Dublin Core info.
|
||||
epub_title = project
|
||||
|
||||
# The unique identifier of the text. This can be a ISBN number
|
||||
# or the project homepage.
|
||||
#
|
||||
# epub_identifier = ''
|
||||
|
||||
# A unique identification for the text.
|
||||
#
|
||||
# epub_uid = ''
|
||||
|
||||
# A list of files that should not be packed into the epub file.
|
||||
epub_exclude_files = ["search.html"]
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.add_css_file("css/huggingface.css")
|
||||
app.add_css_file("css/code-snippets.css")
|
||||
app.add_js_file("js/custom.js")
|
||||
|
||||
|
||||
# -- Extension configuration -------------------------------------------------
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 47 KiB |
120
docs/source/fsdp.mdx
Normal file
120
docs/source/fsdp.mdx
Normal file
@ -0,0 +1,120 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Fully Sharded Data Parallel
|
||||
|
||||
To accelerate training huge models on larger batch sizes, we can use a fully sharded data parallel model.
|
||||
This type of data parallel paradigm enables fitting more data and larger models by sharding the optimizer states, gradients and parameters.
|
||||
To read more about it and the benefits, check out the [Fully Sharded Data Parallel blog](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/).
|
||||
We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature.
|
||||
All you need to do is enable it through the config.
|
||||
|
||||
## How it works out the box
|
||||
|
||||
On your machine(s) just run:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
and answer the questions asked. This will generate a config file that will be used automatically to properly set the
|
||||
default options when doing
|
||||
|
||||
```bash
|
||||
accelerate launch my_script.py --args_to_my_script
|
||||
```
|
||||
|
||||
For instance, here is how you would run the NLP example (from the root of the repo) with FSDP enabled:
|
||||
|
||||
```bash
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config: {}
|
||||
distributed_type: FSDP
|
||||
fsdp_config:
|
||||
min_num_params: 2000
|
||||
offload_params: false
|
||||
sharding_strategy: 1
|
||||
machine_rank: 0
|
||||
main_process_ip: null
|
||||
main_process_port: null
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 2
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
```bash
|
||||
accelerate launch examples/nlp_example.py
|
||||
```
|
||||
|
||||
Currently, `Accelerate` supports following config through the CLI:
|
||||
|
||||
```bash
|
||||
`Sharding Strategy`: [1] FULL_SHARD, [2] SHARD_GRAD_OP
|
||||
`Min Num Params`: FSDP\'s minimum number of parameters for Default Auto Wrapping.
|
||||
`Offload Params`: Decides Whether to offload parameters and gradients to CPU.
|
||||
```
|
||||
|
||||
## Few caveats to be aware of
|
||||
|
||||
- PyTorch FSDP auto wraps sub-modules, flattens the parameters and shards the parameters in place.
|
||||
Due to this, any optimizer created before model wrapping gets broken and occupies more memory.
|
||||
Hence, it is highly recommended and efficient to prepare model before creating optimizer.
|
||||
`Accelerate` will automatically wrap the model and create an optimizer for you in case of single model with a warning message.
|
||||
> FSDP Warning: When using FSDP, it is efficient and recommended to call prepare for the model before creating the optimizer
|
||||
|
||||
However, below is the recommended way to prepare model and optimizer while using FSDP:
|
||||
|
||||
```diff
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
+ model = accelerator.prepare(model)
|
||||
|
||||
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
- model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(model,
|
||||
- optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
- )
|
||||
|
||||
+ optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
+ optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
+ )
|
||||
|
||||
```
|
||||
|
||||
- In case of a single model, if you have created optimizer with multiple parameter groups and called prepare with them together,
|
||||
then the parameter groups will be lost and the following warning is displayed:
|
||||
> FSDP Warning: When using FSDP, several parameter groups will be conflated into
|
||||
> a single one due to nested module wrapping and parameter flattening.
|
||||
|
||||
This is because parameter groups created before wrapping will have no meaning post wrapping due parameter flattening of nested FSDP modules into 1D arrays (which can consume many layers).
|
||||
For instance, below are the named parameters of FSDP model on GPU 0 (When using 2 GPUs. Around 55M (110M/2) params in 1D arrays as this will have the 1st shard of the parameters).
|
||||
Here, if one has applied no weight decay for [bias, LayerNorm.weight] named parameters of unwrapped BERT model,
|
||||
it can't be applied to the below FSDP wrapped model as there are no named parameters with either of those strings and
|
||||
the parameters of those layers are concatenated with parameters of various other layers.
|
||||
```
|
||||
{
|
||||
'_fsdp_wrapped_module.flat_param': torch.Size([494209]),
|
||||
'_fsdp_wrapped_module._fpw_module.bert.embeddings.word_embeddings._fsdp_wrapped_module.flat_param': torch.Size([11720448]),
|
||||
'_fsdp_wrapped_module._fpw_module.bert.encoder._fsdp_wrapped_module.flat_param': torch.Size([42527232])
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
- In case of multiple models, it is necessary to prepare the models before creating optimizers else it will throw an error.
|
||||
- Mixed precision is currently not supported with FSDP.
|
||||
|
||||
For more control, users can leverage the `FullyShardedDataParallelPlugin` wherein they can specify `auto_wrap_policy`, `backward_prefetch` and `ignored_modules`.
|
||||
After creating an instance of this class, users can pass it to the Accelerator class instantiation.
|
||||
For more information on these options, please refer to the PyTorch [FullyShardedDataParallel](https://github.com/pytorch/pytorch/blob/0df2e863fbd5993a7b9e652910792bd21a516ff3/torch/distributed/fsdp/fully_sharded_data_parallel.py#L236) code.
|
||||
|
||||
[[autodoc]] utils.FullyShardedDataParallelPlugin
|
||||
132
docs/source/index.mdx
Normal file
@ -0,0 +1,132 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Accelerate
|
||||
|
||||
Run your *raw* PyTorch training script on any kind of device
|
||||
|
||||
## Features
|
||||
|
||||
- 🤗 Accelerate provides an easy API to make your scripts run with mixed precision and on any kind of distributed
|
||||
settings (multi-GPU, TPU, etc.) while still letting you write your own training loop. The same code can then run
|
||||
seamlessly on your local machine for debugging or your training environment.
|
||||
|
||||
- 🤗 Accelerate also provides a CLI tool that allows you to quickly configure and test your training environment then
|
||||
launch the scripts.
|
||||
|
||||
|
||||
## Easy to integrate
|
||||
|
||||
A traditional training loop in PyTorch looks like this:
|
||||
|
||||
```python
|
||||
my_model.to(device)
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
loss.backward()
|
||||
my_optimizer.step()
|
||||
```
|
||||
|
||||
Changing it to work with 🤗 Accelerate is really easy and only adds a few lines of code:
|
||||
|
||||
```diff
|
||||
+ from accelerate import Accelerator
|
||||
|
||||
+ accelerator = Accelerator()
|
||||
# Use the device given by the *accelerator* object.
|
||||
+ device = accelerator.device
|
||||
my_model.to(device)
|
||||
# Pass every important object (model, optimizer, dataloader) to *accelerator.prepare*
|
||||
+ my_model, my_optimizer, my_training_dataloader = accelerator.prepare(
|
||||
+ my_model, my_optimizer, my_training_dataloader
|
||||
+ )
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
# Just a small change for the backward instruction
|
||||
- loss.backward()
|
||||
+ accelerator.backward(loss)
|
||||
my_optimizer.step()
|
||||
```
|
||||
|
||||
and with this, your script can now run in a distributed environment (multi-GPU, TPU).
|
||||
|
||||
You can even simplify your script a bit by letting 🤗 Accelerate handle the device placement for you (which is safer,
|
||||
especially for TPU training):
|
||||
|
||||
```diff
|
||||
+ from accelerate import Accelerator
|
||||
|
||||
+ accelerator = Accelerator()
|
||||
- my_model.to(device)
|
||||
# Pass every important object (model, optimizer, dataloader) to *accelerator.prepare*
|
||||
+ my_model, my_optimizer, my_training_dataloader = accelerator.prepare(
|
||||
+ my_model, my_optimizer, my_training_dataloader
|
||||
+ )
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
- inputs = inputs.to(device)
|
||||
- targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
# Just a small change for the backward instruction
|
||||
- loss.backward()
|
||||
+ accelerator.backward(loss)
|
||||
my_optimizer.step()
|
||||
```
|
||||
|
||||
## Script launcher
|
||||
|
||||
No need to remember how to use `torch.distributed.launch` or to write a specific launcher for TPU training! 🤗
|
||||
Accelerate comes with a CLI tool that will make your life easier when launching distributed scripts.
|
||||
|
||||
On your machine(s) just run:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
and answer the questions asked. This will generate a config file that will be used automatically to properly set the
|
||||
default options when doing
|
||||
|
||||
```bash
|
||||
accelerate launch my_script.py --args_to_my_script
|
||||
```
|
||||
|
||||
For instance, here is how you would run the NLP example (from the root of the repo):
|
||||
|
||||
```bash
|
||||
accelerate launch examples/nlp_example.py
|
||||
```
|
||||
|
||||
## Supported integrations
|
||||
|
||||
- CPU only
|
||||
- single GPU
|
||||
- multi-GPU on one node (machine)
|
||||
- multi-GPU on several nodes (machines)
|
||||
- TPU
|
||||
- FP16 with native AMP (apex on the roadmap)
|
||||
- DeepSpeed (experimental support)
|
||||
@ -1,161 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
Accelerate
|
||||
=======================================================================================================================
|
||||
|
||||
Run your *raw* PyTorch training script on any kind of device
|
||||
|
||||
Features
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
- 🤗 Accelerate provides an easy API to make your scripts run with mixed precision and on any kind of distributed
|
||||
setting (multi-GPUs, TPUs etc.) while still letting you write your own training loop. The same code can then runs
|
||||
seamlessly on your local machine for debugging or your training environment.
|
||||
|
||||
- 🤗 Accelerate also provides a CLI tool that allows you to quickly configure and test your training environment then
|
||||
launch the scripts.
|
||||
|
||||
|
||||
Easy to integrate
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
A traditional training loop in PyTorch looks like this:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
my_model.to(device)
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
loss.backward()
|
||||
my_optimizer.step()
|
||||
|
||||
Changing it to work with accelerate is really easy and only adds a few lines of code:
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
+ from accelerate import Accelerator
|
||||
|
||||
+ accelerator = Accelerator()
|
||||
# Use the device given by the `accelerator` object.
|
||||
+ device = accelerator.device
|
||||
my_model.to(device)
|
||||
# Pass every important object (model, optimizer, dataloader) to `accelerator.prepare`
|
||||
+ my_model, my_optimizer, my_training_dataloader = accelerate.prepare(
|
||||
+ my_model, my_optimizer, my_training_dataloader
|
||||
+ )
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
# Just a small change for the backward instruction
|
||||
- loss.backward()
|
||||
+ accelerate.backward(loss)
|
||||
my_optimizer.step()
|
||||
|
||||
and with this, your script can now run in a distributed environment (multi-GPU, TPU).
|
||||
|
||||
You can even simplify your script a bit by letting 🤗 Accelerate handle the device placement for you (which is safer,
|
||||
especially for TPU training):
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
+ from accelerate import Accelerator
|
||||
|
||||
+ accelerator = Accelerator()
|
||||
- my_model.to(device)
|
||||
# Pass every important object (model, optimizer, dataloader) to `accelerator.prepare`
|
||||
+ my_model, my_optimizer, my_training_dataloader = accelerate.prepare(
|
||||
+ my_model, my_optimizer, my_training_dataloader
|
||||
+ )
|
||||
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
- inputs = inputs.to(device)
|
||||
- targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
# Just a small change for the backward instruction
|
||||
- loss.backward()
|
||||
+ accelerate.backward(loss)
|
||||
my_optimizer.step()
|
||||
|
||||
|
||||
Script launcher
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
No need to remember how to use ``torch.distributed.launch`` or to write a specific launcher for TPU training! 🤗
|
||||
Accelerate comes with a CLI tool that will make your life easier when launching distributed scripts.
|
||||
|
||||
On your machine(s) just run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate config
|
||||
|
||||
and answer the questions asked. This will generate a config file that will be used automatically to properly set the
|
||||
default options when doing
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate launch my_script.py --args_to_my_script
|
||||
|
||||
For instance, here is how you would run the NLP example (from the root of the repo):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate launch examples/nlp_example.py
|
||||
|
||||
|
||||
Supported integrations
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
- CPU only
|
||||
- single GPU
|
||||
- multi-GPU on one node (machine)
|
||||
- multi-GPU on several nodes (machines)
|
||||
- TPU
|
||||
- FP16 with native AMP (apex on the roadmap)
|
||||
- DeepSpeed (experimental)
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Get started
|
||||
|
||||
quicktour
|
||||
installation
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Guides
|
||||
|
||||
sagemaker
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: API reference
|
||||
|
||||
accelerator
|
||||
launcher
|
||||
kwargs
|
||||
internal
|
||||
@ -55,9 +55,9 @@ Here is how to quickly install `accelerate` from source:
|
||||
pip install git+https://github.com/huggingface/accelerate
|
||||
```
|
||||
|
||||
Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't been yet rolled out.
|
||||
Note that this will not install the latest released version, but the bleeding edge `main` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't yet been rolled out.
|
||||
|
||||
While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/accelerate/issues) and this way, things will get fixed even sooner.
|
||||
While we strive to keep `main` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day. You're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/accelerate/issues); this way, things will get fixed even sooner.
|
||||
|
||||
Again, you can run:
|
||||
|
||||
@ -69,11 +69,11 @@ to check 🤗 Accelerate is properly installed.
|
||||
|
||||
## Editable install
|
||||
|
||||
If you want to constantly use the bleeding edge `master` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:
|
||||
If you want to constantly use the bleeding edge `main` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:
|
||||
|
||||
``` bash
|
||||
git clone https://github.com/huggingface/accelerate.git
|
||||
cd transformers
|
||||
cd accelerate
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
@ -85,9 +85,9 @@ now this editable install will reside where you clone the folder to, e.g. `~/acc
|
||||
|
||||
Do note that you have to keep that `accelerate` folder around and not delete it to continue using the 🤗 Accelerate library.
|
||||
|
||||
Now, let's get to the real benefit of this installation approach. Say, you saw some new feature has been just committed into `master`. If you have already performed all the steps above, to update your accelerate repo to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:
|
||||
Now, let's get to the real benefit of this installation approach. Say you saw some new feature has just been committed into `main`. If you have already performed all the steps above, to update your accelerate repo to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:
|
||||
|
||||
```
|
||||
```bash
|
||||
cd ~/accelerate/
|
||||
git pull
|
||||
```
|
||||
69
docs/source/internal.mdx
Normal file
@ -0,0 +1,69 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Internals
|
||||
|
||||
## Optimizer
|
||||
|
||||
[[autodoc]] optimizer.AcceleratedOptimizer
|
||||
|
||||
## DataLoader
|
||||
|
||||
The main work on your PyTorch `DataLoader` is done by the following function:
|
||||
|
||||
[[autodoc]] data_loader.prepare_data_loader
|
||||
|
||||
### DataLoaderShard
|
||||
|
||||
[[autodoc]] data_loader.DataLoaderShard
|
||||
|
||||
### BatchSamplerShard
|
||||
|
||||
[[autodoc]] data_loader.BatchSamplerShard
|
||||
|
||||
### IterableDatasetShard
|
||||
|
||||
[[autodoc]] data_loader.IterableDatasetShard
|
||||
|
||||
## Scheduler
|
||||
|
||||
[[autodoc]] scheduler.AcceleratedScheduler
|
||||
|
||||
## Distributed Config
|
||||
|
||||
### AcceleratorState
|
||||
|
||||
[[autodoc]] state.AcceleratorState
|
||||
|
||||
### DistributedType
|
||||
|
||||
[[autodoc]] state.DistributedType
|
||||
|
||||
## Tracking
|
||||
|
||||
[[autodoc]] tracking.GeneralTracker
|
||||
|
||||
## Utilities
|
||||
|
||||
[[autodoc]] utils.extract_model_from_parallel
|
||||
|
||||
[[autodoc]] utils.gather
|
||||
|
||||
[[autodoc]] utils.send_to_device
|
||||
|
||||
[[autodoc]] utils.set_seed
|
||||
|
||||
[[autodoc]] utils.synchronize_rng_state
|
||||
|
||||
[[autodoc]] utils.synchronize_rng_states
|
||||
|
||||
[[autodoc]] utils.wait_for_everyone
|
||||
@ -1,85 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
Internals
|
||||
=======================================================================================================================
|
||||
|
||||
|
||||
Optimizer
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. autoclass:: accelerate.optimizer.AcceleratedOptimizer
|
||||
|
||||
|
||||
DataLoader
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
The main work on your PyTorch :obj:`DataLoader` is done by the following function:
|
||||
|
||||
.. autofunction:: accelerate.data_loader.prepare_data_loader
|
||||
|
||||
|
||||
BatchSamplerShard
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: accelerate.data_loader.DataLoaderShard
|
||||
:members:
|
||||
|
||||
|
||||
BatchSamplerShard
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: accelerate.data_loader.BatchSamplerShard
|
||||
:members:
|
||||
|
||||
|
||||
IterableDatasetShard
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: accelerate.data_loader.IterableDatasetShard
|
||||
:members:
|
||||
|
||||
|
||||
Distributed Config
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
AcceleratorState
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: accelerate.state.AcceleratorState
|
||||
:members:
|
||||
|
||||
|
||||
DistributedType
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: accelerate.state.DistributedType
|
||||
:members:
|
||||
|
||||
|
||||
Utilities
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. autofunction:: accelerate.utils.extract_model_from_parallel
|
||||
|
||||
.. autofunction:: accelerate.utils.gather
|
||||
|
||||
.. autofunction:: accelerate.utils.send_to_device
|
||||
|
||||
.. autofunction:: accelerate.utils.set_seed
|
||||
|
||||
.. autofunction:: accelerate.utils.synchronize_rng_state
|
||||
|
||||
.. autofunction:: accelerate.utils.synchronize_rng_states
|
||||
|
||||
.. autofunction:: accelerate.utils.wait_for_everyone
|
||||
29
docs/source/kwargs.mdx
Normal file
@ -0,0 +1,29 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Kwargs Handlers
|
||||
|
||||
The following objects can be passed to the main [`Accelerator`] to customize how some PyTorch objects
|
||||
related to distributed training or mixed precision are created.
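For example, here is a minimal sketch of passing a handler to the [`Accelerator`]; the `find_unused_parameters` value is just an illustration:

```python
from accelerate import Accelerator, DistributedDataParallelKwargs

# Customize how the model is wrapped in DistributedDataParallel.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
```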
|
||||
|
||||
|
||||
## DistributedDataParallelKwargs
|
||||
|
||||
[[autodoc]] DistributedDataParallelKwargs
|
||||
|
||||
## GradScalerKwargs
|
||||
|
||||
[[autodoc]] GradScalerKwargs
|
||||
|
||||
## InitProcessGroupKwargs
|
||||
|
||||
[[autodoc]] InitProcessGroupKwargs
|
||||
@ -1,30 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
|
||||
Kwargs Handlers
|
||||
=======================================================================================================================
|
||||
|
||||
The following objects can be passed to the main :class:`~accelerate.Accelerator` to customize how some PyTorch objects
|
||||
related to distributed training or mixed precision are created.
|
||||
|
||||
|
||||
DistributedDataParallelKwargs
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. autoclass:: accelerate.DistributedDataParallelKwargs
|
||||
|
||||
|
||||
GradScalerKwargs
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
.. autoclass:: accelerate.GradScalerKwargs
|
||||
28
docs/source/launcher.mdx
Normal file
@ -0,0 +1,28 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Notebook Launcher
|
||||
|
||||
Launch your training function inside a notebook. It currently supports launching training with TPUs on [Google
|
||||
Colab](https://colab.research.google.com/) and [Kaggle kernels](https://www.kaggle.com/code), as well as training on
|
||||
several GPUs (if the machine on which you are running your notebook has them).
|
||||
|
||||
An example can be found in [this notebook](https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Your `Accelerator` object should only be defined inside the training function. This is because the
|
||||
initialization should be done inside the launcher only.
|
||||
|
||||
</Tip>
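For example, here is a minimal sketch; the body of `training_function` is only a placeholder:

```python
from accelerate import Accelerator, notebook_launcher

def training_function():
    # The Accelerator must be created *inside* the launched function.
    accelerator = Accelerator()
    accelerator.print(f"Running on {accelerator.num_processes} process(es)")

# The number of processes is inferred from the environment (TPU cores or
# available GPUs) when not given explicitly.
notebook_launcher(training_function)
```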
|
||||
|
||||
[[autodoc]] notebook_launcher
|
||||
@ -1,29 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
|
||||
Notebook Launcher
|
||||
=======================================================================================================================
|
||||
|
||||
Launch your training function inside a notebook. Currently supports launching a training with TPUs on [Google
|
||||
Colab](https://colab.research.google.com/) and [Kaggle kernels](https://www.kaggle.com/code), as well as training on
|
||||
several GPUs (if the machine on which you are running your notebook has them).
|
||||
|
||||
An example can be found in `this notebook
|
||||
<https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb>`__.
|
||||
|
||||
.. warning::
|
||||
|
||||
Your :obj:`Accelerator` object should only be defined inside the training function. This is because the
|
||||
initialization should be done inside the launcher only.
|
||||
|
||||
.. autofunction:: accelerate.notebook_launcher
|
||||
51
docs/source/memory.mdx
Normal file
@ -0,0 +1,51 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Memory Utilities
|
||||
|
||||
One of the most frustrating errors when it comes to running training scripts is hitting "CUDA Out-of-Memory",
|
||||
as the entire script needs to be restarted, progress is lost, and typically a developer would want to simply
|
||||
start their script and let it run.
|
||||
|
||||
`Accelerate` provides a utility heavily based on [toma](https://github.com/BlackHC/toma) to give this capability.
|
||||
|
||||
## find_executable_batch_size
|
||||
|
||||
This algorithm operates with exponential decay, cutting the batch size in half after each failed run of the
|
||||
training script. To use it, restructure your training function to include an inner function wrapped with this decorator,
|
||||
and build your dataloaders inside it. At a minimum, this could look like 4 new lines of code.
|
||||
> Note: The inner function *must* take in the batch size as the first parameter, but we do not pass one to it when called. The wrapper handles this for us.
|
||||
|
||||
```diff
|
||||
def training_function(args):
|
||||
accelerator = Accelerator()
|
||||
model = get_model()
|
||||
model.to(accelerator.device)
|
||||
optimizer = get_optimizer()
|
||||
|
||||
+ @find_executable_batch_size(starting_batch_size=args.batch_size)
|
||||
+ def inner_training_loop(batch_size):
|
||||
+ nonlocal model, optimizer # Ensure they can be used in our context
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
lr_scheduler = get_scheduler(
|
||||
optimizer,
|
||||
num_training_steps=len(train_dataloader)*num_epochs
|
||||
)
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
train(model, optimizer, train_dataloader, lr_scheduler)
|
||||
validate(model, eval_dataloader)
|
||||
+ inner_training_loop()
|
||||
```
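Note that the snippet above assumes the decorator has already been imported; going by the autodoc reference below, in this release it lives in the `memory_utils` module:

```python
from accelerate.memory_utils import find_executable_batch_size
```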
|
||||
|
||||
[[autodoc]] memory_utils.find_executable_batch_size
|
||||
460
docs/source/quicktour.mdx
Normal file
@ -0,0 +1,460 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Quick tour
|
||||
|
||||
Let's have a look at the main 🤗 Accelerate features and the traps to avoid.
|
||||
|
||||
## Main use
|
||||
|
||||
To use 🤗 Accelerate in your own script, you have to change four things:
|
||||
|
||||
1. Import the [`Accelerator`] main class and instantiate one in an `accelerator` object:
|
||||
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
|
||||
accelerator = Accelerator()
|
||||
```
|
||||
|
||||
This should happen as early as possible in your training script as it will initialize everything necessary for
|
||||
distributed training. You don't need to indicate the kind of environment you are in (just one machine with a GPU, one
|
||||
machine with several GPUs, several machines with multiple GPUs or a TPU), the library will detect this automatically.
|
||||
|
||||
2. Remove the call `.to(device)` or `.cuda()` for your model and input data. The `accelerator` object
|
||||
will handle this for you and place all those objects on the right device. If you know what you're doing, you
|
||||
can leave those `.to(device)` calls but you should use the device provided by the `accelerator` object:
|
||||
`accelerator.device`.
|
||||
|
||||
To fully deactivate the automatic device placement, pass along `device_placement=False` when initializing your
|
||||
[`Accelerator`].
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
If you place your objects manually on the proper device, be careful to create your optimizer after putting your
|
||||
model on `accelerator.device` or your training will fail on TPU.
|
||||
|
||||
</Tip>
|
||||
|
||||
3. Pass all objects relevant to training (optimizer, model, training dataloader, learning rate scheduler) to the
|
||||
[`~Accelerator.prepare`] method. This will make sure everything is ready for training.
|
||||
|
||||
```python
|
||||
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
```
|
||||
|
||||
In particular, your training dataloader will be sharded across all GPUs/TPU cores available so that each one sees a
|
||||
different portion of the training dataset. Also, the random states of all processes will be synchronized at the
|
||||
beginning of each iteration through your dataloader, to make sure the data is shuffled the same way (if you decided to
|
||||
use `shuffle=True` or any kind of random sampler).
|
||||
|
||||
<Tip>
|
||||
|
||||
The actual batch size for your training will be the number of devices used multiplied by the batch size you set in
|
||||
your script: for instance training on 4 GPUs with a batch size of 16 set when creating the training dataloader will
|
||||
train at an actual batch size of 64.
|
||||
|
||||
</Tip>
|
||||
|
||||
Alternatively, you can use the option `split_batches=True` when initializing your
|
||||
[`Accelerator`], in which case the batch size will always stay the same, whether you run your
|
||||
script on 1, 2, 4 or 64 GPUs.
|
||||
|
||||
You should execute this instruction as soon as all objects for training are created, before starting your actual
|
||||
training loop.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
You should only pass the learning rate scheduler to [`~Accelerator.prepare`] when the scheduler needs to be stepped
|
||||
at each optimizer step.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Your training dataloader may change length when going through this method: if you run on X GPUs, it will have its
|
||||
length divided by X (since your actual batch size will be multiplied by X), unless you set
|
||||
`split_batches=True`.
|
||||
|
||||
</Tip>
|
||||
|
||||
Any instruction using your training dataloader length (for instance if you want to log the number of total training
|
||||
steps) should go after the call to [`~Accelerator.prepare`].
|
||||
|
||||
It is perfectly fine to send your dataloader to [`~Accelerator.prepare`] on its own, but it's best to send the
|
||||
model and optimizer to [`~Accelerator.prepare`] together.
|
||||
|
||||
You may or may not want to send your validation dataloader to [`~Accelerator.prepare`], depending on
|
||||
whether you want to run distributed evaluation or not (see below).
|
||||
|
||||
4. Replace the line `loss.backward()` by `accelerator.backward(loss)`.
|
||||
|
||||
And you're all set! With all these changes, your script will run on your local machine as well as on multiple GPUs or a
|
||||
TPU! You can either use your favorite tool to launch the distributed training, or you can use the 🤗 Accelerate
|
||||
launcher.
|
||||
|
||||
|
||||
## Distributed evaluation
|
||||
|
||||
You can perform regular evaluation in your training script, if you leave your validation dataloader out of the
|
||||
[`~Accelerator.prepare`] method. In this case, you will need to put the input data on the
|
||||
`accelerator.device` manually.
|
||||
|
||||
To perform distributed evaluation, send along your validation dataloader to the [`~Accelerator.prepare`]
|
||||
method:
|
||||
|
||||
```python
|
||||
validation_dataloader = accelerator.prepare(validation_dataloader)
|
||||
```
|
||||
|
||||
Like for your training dataloader, it will mean that (should you run your script on multiple devices) each device will
|
||||
only see part of the evaluation data. This means you will need to group your predictions together. This is very easy to
|
||||
do with the [`~Accelerator.gather`] method.
|
||||
|
||||
```python
|
||||
for inputs, targets in validation_dataloader:
|
||||
predictions = model(inputs)
|
||||
# Gather all predictions and targets
|
||||
all_predictions = accelerator.gather(predictions)
|
||||
all_targets = accelerator.gather(targets)
|
||||
# Example of use with a *Datasets.Metric*
|
||||
metric.add_batch(all_predictions, all_targets)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Like for the training dataloader, passing your validation dataloader through
|
||||
[`~Accelerator.prepare`] may change its length: if you run on X GPUs, it will have its length divided by X
|
||||
(since your actual batch size will be multiplied by X), unless you set `split_batches=True`.
|
||||
|
||||
Any instruction using your training dataloader length (for instance if you need the number of total training steps
|
||||
to create a learning rate scheduler) should go after the call to [`~Accelerator.prepare`].
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The [`~Accelerator.gather`] method requires the tensors to be all the same size on each process. If
|
||||
you have tensors of different sizes on each process (for instance when dynamically padding to the maximum length in
|
||||
a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad your tensors to the
|
||||
biggest size across processes.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Launching your distributed script
|
||||
|
||||
You can use the regular commands to launch your distributed training (like `torch.distributed.launch` for
|
||||
PyTorch); they are fully compatible with 🤗 Accelerate. The only caveat here is that 🤗 Accelerate uses the environment
|
||||
to determine all useful information, so `torch.distributed.launch` should be used with the flag `--use_env`.
|
||||
|
||||
🤗 Accelerate also provides a CLI tool that unifies all launchers, so you only have to remember one command. To use it,
|
||||
just run
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
on your machine and reply to the questions asked. This will save a *default_config.yaml* file in your cache folder for
|
||||
🤗 Accelerate. That cache folder is (with decreasing order of priority):
|
||||
|
||||
- The content of your environment variable `HF_HOME` suffixed with *accelerate*.
|
||||
- If it does not exist, the content of your environment variable `XDG_CACHE_HOME` suffixed with
|
||||
*huggingface/accelerate*.
|
||||
- If this does not exist either, the folder *~/.cache/huggingface/accelerate*
|
||||
|
||||
You can also specify the location where you want the file to be saved with the `--config_file` flag.
|
||||
|
||||
Once this is done, you can test everything is going well on your setup by running
|
||||
|
||||
```bash
|
||||
accelerate test
|
||||
```
|
||||
|
||||
This will launch a short script that will test the distributed environment. If it runs fine, you are ready for the next
|
||||
step!
|
||||
|
||||
Note that if you specified a location for the config file in the previous step, you need to pass it here as well:
|
||||
|
||||
```bash
|
||||
accelerate test --config_file path_to_config.yaml
|
||||
```
|
||||
|
||||
Now that this is done, you can run your script with the following command:
|
||||
|
||||
```bash
|
||||
accelerate launch path_to_script.py --args_for_the_script
|
||||
```
|
||||
|
||||
If you stored the config file in a non-default location, you can indicate it to the launcher like this:
|
||||
|
||||
```bash
|
||||
accelerate launch --config_file path_to_config.yaml path_to_script.py --args_for_the_script
|
||||
```
|
||||
|
||||
You can also override any of the arguments determined by your config file, see TODO: insert ref here.
|
||||
|
||||
|
||||
## Launching training from a notebook
|
||||
|
||||
In Accelerate 0.3.0, a new [`notebook_launcher`] has been introduced to help you launch your training
|
||||
function from a notebook. This launcher supports launching a training with TPUs on Colab or Kaggle, as well as training
|
||||
on several GPUs (if the machine on which you are running your notebook has them).
|
||||
|
||||
Just define a function responsible for your whole training and/or evaluation in a cell of the notebook, then execute a
|
||||
cell with the following code:
|
||||
|
||||
```python
|
||||
from accelerate import notebook_launcher
|
||||
|
||||
notebook_launcher(training_function)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Your `Accelerator` object should only be defined inside the training function. This is because the
|
||||
initialization should be done inside the launcher only.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Training on TPU
|
||||
|
||||
If you want to launch your script on TPUs, there are a few caveats you should be aware of. Behind the scenes, the TPUs
|
||||
will create a graph of all the operations happening in your training step (forward pass, backward pass and optimizer
|
||||
step). This is why your first step of training will always be very long as building and compiling this graph for
|
||||
optimizations takes some time.
|
||||
|
||||
The good news is that this compilation will be cached so the second step and all the following will be much faster. The
|
||||
bad news is that it only applies if all of your steps do exactly the same operations, which implies:
|
||||
|
||||
- having all tensors of the same length in all your batches
|
||||
- having static code (i.e., not a for loop of length that could change from step to step)
|
||||
|
||||
Having any of the things above change between two steps will trigger a new compilation which will, once again, take a
|
||||
lot of time. In practice, that means you must take special care to have all your tensors in your inputs of the same
|
||||
shape (so no dynamic padding for instance if you are in an NLP problem) and should not use layers with for loops that
|
||||
have different lengths depending on the inputs (such as an LSTM) or the training will be excruciatingly slow.
|
||||
|
||||
To introduce special behavior in your script for TPUs you can check the `distributed_type` of your
|
||||
`accelerator`:
|
||||
|
||||
```python docstyle-ignore
|
||||
from accelerate import DistributedType
|
||||
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
# do something of static shape
|
||||
else:
|
||||
# go crazy and be dynamic
|
||||
```
|
||||
|
||||
The [NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py) shows an example of such a
|
||||
situation, with dynamic padding.
|
||||
|
||||
One last thing to pay close attention to: if your model has tied weights (such as language models which tie the weights
|
||||
of the embedding matrix with the weights of the decoder), moving this model to the TPU (either yourself or after you
|
||||
passed your model to [`~Accelerator.prepare`]) will break the tying. You will need to retie the weights
|
||||
after. You can find an example of this in the [run_clm_no_trainer](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py) script in
|
||||
the Transformers repository.
|
||||
|
||||
|
||||
## Other caveats
|
||||
|
||||
Here we list all the smaller issues you could have in your script conversion, and how to resolve them.
|
||||
|
||||
### Execute a statement only on one process
|
||||
|
||||
Some of your instructions only need to run for one process on a given server: for instance a data download or a log
|
||||
statement. To do this, wrap the statement in a test like this:
|
||||
|
||||
```python docstyle-ignore
|
||||
if accelerator.is_local_main_process:
|
||||
# Is executed once per server
|
||||
```
|
||||
|
||||
Another example is progress bars: to avoid having multiple progress bars in your output, you should only display one on
|
||||
the local main process:
|
||||
|
||||
```python
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||
```
|
||||
|
||||
The *local* means per machine: if you are running your training on two servers with several GPUs, the instruction will
|
||||
be executed once on each of those servers. If you need to execute something only once for all processes (and not per
|
||||
machine), for instance uploading the final model to the 🤗 model hub, wrap it in a test like this:
|
||||
|
||||
```python docstyle-ignore
|
||||
if accelerator.is_main_process:
|
||||
# Is executed once only
|
||||
```
|
||||
|
||||
For printing statements you only want executed once per machine, you can just replace the `print` function with
|
||||
`accelerator.print`.
|
||||
|
||||
|
||||
### Defer execution
|
||||
|
||||
When you run your usual script, instructions are executed in order. Using 🤗 Accelerate to deploy your script on several
|
||||
GPUs at the same time introduces a complication: while each process executes all instructions in order, some may be
|
||||
faster than others.
|
||||
|
||||
You might need to wait for all processes to have reached a certain point before executing a given instruction. For
|
||||
instance, you shouldn't save a model before being sure every process is done with training. To do this, just write the
|
||||
following line in your code:
|
||||
|
||||
```python
|
||||
accelerator.wait_for_everyone()
|
||||
```
|
||||
|
||||
This instruction will block all the processes that arrive first until all the other processes have reached that
|
||||
point (if you run your script on just one GPU or CPU, this won't do anything).
|
||||
|
||||
|
||||
### Saving/loading a model
|
||||
|
||||
Saving the model you trained might need a bit of adjustment: first you should wait for all processes to reach that
|
||||
point in the script as shown above, and then, you should unwrap your model before saving it. This is because when going
|
||||
through the [`~Accelerator.prepare`] method, your model may have been placed inside a bigger model,
|
||||
which deals with the distributed training. This in turn means that saving your model state dictionary without taking
|
||||
any precaution will take that potential extra layer into account, and you will end up with weights you can't load back
|
||||
in your base model.
|
||||
|
||||
This is why it's recommended to *unwrap* your model first. Here is an example:
|
||||
|
||||
```python
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
accelerator.save(unwrapped_model.state_dict(), filename)
|
||||
```
|
||||
|
||||
If your script contains logic to load a checkpoint, we also recommend you load your weights in the unwrapped model
|
||||
(this is only useful if you use the load function after making your model go through
|
||||
[`~Accelerator.prepare`]). Here is an example:
|
||||
|
||||
```python
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.load_state_dict(torch.load(filename))
|
||||
```
|
||||
|
||||
Note that since all the model parameters are references to tensors, this will load your weights inside `model`.
|
||||
|
||||
## Saving/loading entire states
|
||||
|
||||
When training your model, you may want to save the current state of the model, optimizer, random generators, and potentially LR schedulers to be restored in the _same script_.
|
||||
You can use `accelerator.save_state` and `accelerator.load_state` respectively to do so, simply by passing in a save location.
|
||||
If you have registered any other stateful items to be stored through `accelerator.register_for_checkpointing`, they will also be saved and/or loaded.
|
||||
<Tip>
|
||||
Every object passed to `register_for_checkpointing` must have a `load_state_dict` and `state_dict` function to be stored.
|
||||
</Tip>
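Here is a minimal sketch; the checkpoint folder name and the `lr_scheduler` object are just examples taken from the snippets above:

```python
# Register any extra stateful object (it must implement `state_dict` and `load_state_dict`).
accelerator.register_for_checkpointing(lr_scheduler)

# Save the model, optimizer, RNG states and registered objects...
accelerator.save_state("my_checkpoint")

# ...and restore them later in the same script.
accelerator.load_state("my_checkpoint")
```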
|
||||
|
||||
|
||||
### Gradient clipping
|
||||
|
||||
If you are using gradient clipping in your script, you should replace the calls to
|
||||
`torch.nn.utils.clip_grad_norm_` or `torch.nn.utils.clip_grad_value_` with `accelerator.clip_grad_norm_`
|
||||
and `accelerator.clip_grad_value_` respectively.
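For example, here is a minimal sketch of clipping by norm inside the training loop; `max_grad_norm` is a hypothetical hyperparameter of your script:

```python
accelerator.backward(loss)
# Clip the gradients of all prepared parameters in place before stepping.
accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
optimizer.step()
```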
|
||||
|
||||
|
||||
### Mixed Precision training
|
||||
|
||||
If you are running your training in Mixed Precision with Accelerate, you will get the best result with your loss being
|
||||
computed inside your model (like in Transformer models for instance). Every computation outside of the model will be
|
||||
executed in full precision (which is generally what you want for loss computation, especially if it involves a
|
||||
softmax). However, you might want to put your loss computation inside the *accelerator.autocast* context manager:
|
||||
|
||||
```python
|
||||
with accelerator.autocast():
|
||||
loss = complex_loss_function(outputs, target)
|
||||
```
|
||||
|
||||
Another caveat with Mixed Precision training is that the gradient updates will be skipped a few times at the beginning and
|
||||
sometimes during training: because of the dynamic loss scaling strategy, there are points during training where the
|
||||
gradients have overflowed, and the loss scaling factor is reduced to avoid this happening again at the next step.
|
||||
|
||||
This means that you may update your learning rate scheduler when there was no update, which is fine in general, but may
|
||||
have an impact when you have very little training data, or if the first learning rate values of your scheduler are very
|
||||
important. In this case, you can skip the learning rate scheduler updates when the optimizer step was not done like
|
||||
this:
|
||||
|
||||
```python
|
||||
if not accelerator.optimizer_step_was_skipped:
|
||||
lr_scheduler.step()
|
||||
```
|
||||
|
||||
### DeepSpeed
|
||||
|
||||
DeepSpeed support is experimental, so the underlying API will evolve in the near future and may have some slight
|
||||
breaking changes. In particular, 🤗 Accelerate does not yet support a DeepSpeed config you have written yourself; this
|
||||
will be added in a future version.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The [`notebook_launcher`] does not support the DeepSpeed integration yet.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Internal mechanism
|
||||
|
||||
Internally, the library works by first analyzing the environment in which the script is launched to determine which
|
||||
kind of distributed setup is used, how many different processes there are and which one the current script is in. All
|
||||
that information is stored in the [`~AcceleratorState`].
|
||||
|
||||
This class is initialized the first time you instantiate an [`Accelerator`] and performs any
|
||||
specific initialization your distributed setup needs. Its state is then uniquely shared through all instances of
|
||||
[`~state.AcceleratorState`].
|
||||
|
||||
Then, when calling [`~Accelerator.prepare`], the library:
|
||||
|
||||
- wraps your model(s) in the container adapted for the distributed setup,
|
||||
- wraps your optimizer(s) in a [`~optimizer.AcceleratedOptimizer`],
|
||||
- creates a new version of your dataloader(s) in a [`~data_loader.DataLoaderShard`].
|
||||
|
||||
While the model(s) and optimizer(s) are just put in simple wrappers, the dataloader(s) are re-created. This is mostly
|
||||
because PyTorch does not let the user change the `batch_sampler` of a dataloader once it's been created and the
|
||||
library handles the sharding of your data between processes by changing that `batch_sampler` to yield every other
|
||||
`num_processes` batches.
|
||||
|
||||
The [`~data_loader.DataLoaderShard`] subclasses `DataLoader` to add the following functionality:
|
||||
|
||||
- it synchronizes the appropriate random number generator of all processes at each new iteration, to ensure any
|
||||
randomization (like shuffling) is done the exact same way across processes.
|
||||
- it puts the batches on the proper device before yielding them (unless you have opted out of
|
||||
`device_placement=True`).
|
||||
|
||||
The random number generator synchronization will by default synchronize:
|
||||
|
||||
- the `generator` attribute of a given sampler (like the PyTorch `RandomSampler`) for PyTorch >= 1.6
|
||||
- the main random number generator in PyTorch <=1.5.1
|
||||
|
||||
You can choose which random number generator(s) to synchronize with the `rng_types` argument of the main
|
||||
[`Accelerator`]. In PyTorch >= 1.6, it is recommended to rely on a local `generator` to avoid
|
||||
setting the same seed in the main random number generator in all processes.
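For example, here is a minimal sketch that only synchronizes the local `generator` objects, assuming PyTorch >= 1.6:

```python
from accelerate import Accelerator

# Only the samplers' local `torch.Generator` objects are synchronized between
# processes; the global torch RNG is left untouched.
accelerator = Accelerator(rng_types=["generator"])
```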
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Synchronizing the main torch (or CUDA or XLA) random number generator will affect any other potential random
|
||||
artifacts you could have in your dataset (like random data augmentation) in the sense that all processes will get the
|
||||
same random numbers from the torch random modules (so they will apply the same random data augmentation if it's
|
||||
controlled by torch).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
The randomization part of your custom sampler, batch sampler or iterable dataset should be done using a local
|
||||
`torch.Generator` object (in PyTorch >= 1.6), see the traditional `RandomSampler` as an example.
|
||||
|
||||
</Tip>
|
||||
|
||||
See more details about the internals in the [Internals page](internal).
|
||||
@ -1,434 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
Quick tour
|
||||
=======================================================================================================================
|
||||
|
||||
Let's have a look at a look at 🤗 Accelerate main features and traps to avoid.
|
||||
|
||||
Main use
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
To use 🤗 Accelerate in your own script, you have to change four things:
|
||||
|
||||
1. Import the :class:`~accelerate.Accelerator` main class instantiate one in an :obj:`accelerator` object:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from accelerate import Accelerator
|
||||
|
||||
accelerator = Accelerator()
|
||||
|
||||
This should happen as early as possible in your training script as it will initialize everything necessary for
|
||||
distributed training. You don't need to indicate the kind of environment you are in (just one machine with a GPU, one
|
||||
match with several GPUs, several machines with multiple GPUs or a TPU), the library will detect this automatically.
|
||||
|
||||
2. Remove the call :obj:`.to(device)` or :obj:`.cuda()` for your model and input data. The :obj:`accelerator` object
|
||||
will handle this for you and place all those objects on the right device for you. If you know what you're doing, you
|
||||
can leave those :obj:`.to(device)` calls but you should use the device provided by the :obj:`accelerator` object:
|
||||
:obj:`accelerator.device`.
|
||||
|
||||
To fully deactivate the automatic device placement, pass along :obj:`device_placement=False` when initializing your
|
||||
:class:`~accelerate.Accelerator`.
|
||||
|
||||
.. Warning::
|
||||
|
||||
If you place your objects manually on the proper device, be careful to create your optimizer after putting your
|
||||
model on :obj:`accelerator.device` or your training will fail on TPU.
|
||||
|
||||
3. Pass all objects relevant to training (optimizer, model, training dataloader) to the
|
||||
:meth:`~accelerate.Accelerator.prepare` method. This will make sure everything is ready for training.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)
|
||||
|
||||
In particular, your training dataloader will be sharded accross all GPUs/TPU cores available so that each one sees a
|
||||
different portion of the training dataset. Also, the random states of all processes will be synchronized at the
|
||||
beginning of each iteration through your dataloader, to make sure the data is shuffled the same way (if you decided to
|
||||
use :obj:`shuffle=True` or any kind of random sampler).
|
||||
|
||||
.. Note::
|
||||
|
||||
The actual batch size for your training will be the number of devices used multiplied by the batch size you set in
|
||||
your script: for instance training on 4 GPUs with a batch size of 16 set when creating the training dataloader will
|
||||
train at an actual batch size of 64.
|
||||
|
||||
Alternatively, you can use the option :obj:`split_batches=True` when creating initializing your
|
||||
:class:`~accelerate.Accelerator`, in which case the batch size will always stay the same, whether your run your
|
||||
script on 1, 2, 4 or 64 GPUs.
|
||||
|
||||
You should execute this instruction as soon as all objects for training are created, before starting your actual
|
||||
training loop.
|
||||
|
||||
.. Warning::
|
||||
|
||||
Your training dataloader may change length when going through this method: if you run on X GPUs, it will have its
|
||||
length divided by X (since your actual batch size will be multiplied by X), unless you set
|
||||
:obj:`split_batches=True`.
|
||||
|
||||
Any instruction using your training dataloader length (for instance if you need the number of total training steps
|
||||
to create a learning rate scheduler) should go after the call to :meth:`~accelerate.Accelerator.prepare`.
|
||||
|
||||
You can perfectly send your dataloader to :meth:`~accelerate.Accelerator.prepare` on its own, but it's best to send the
|
||||
model and optimizer to :meth:`~accelerate.Accelerator.prepare` together.
|
||||
|
||||
You may or may not want to send your validation dataloader to :meth:`~accelerate.Accelerator.prepare`, depending on
|
||||
whether you want to run distributed evaluation or not (see below).
|
||||
|
||||
4. Replace the line :obj:`loss.backward()` with :obj:`accelerator.backward(loss)`.
|
||||
|
||||
And you're all set! With all these changes, your script will run on your local machine as well as on multiple GPUs or a
|
||||
TPU! You can either use your favorite tool to launch the distributed training, or you can use the 🤗 Accelerate
|
||||
launcher.
|
||||
|
||||
|
||||
Distributed evaluation
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
You can perform regular evaluation in your training script, if you leave your validation dataloader out of the
|
||||
:meth:`~accelerate.Accelerator.prepare` method. In this case, you will need to put the input data on the
|
||||
:obj:`accelerator.device` manually.
|
||||
|
||||
To perform distributed evaluation, send along your validation dataloader to the :meth:`~accelerate.Accelerator.prepare`
|
||||
method:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
validation_dataloader = accelerator.prepare(validation_dataloader)
|
||||
|
||||
Like for your training dataloader, it will mean that (should you run your script on multiple devices) each device will
|
||||
only see part of the evaluation data. This means you will need to group your predictions together. This is very easy to
|
||||
do with the :meth:`~accelerate.Accelerator.gather` method.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
for inputs, targets in validation_dataloader:
|
||||
predictions = model(inputs)
|
||||
# Gather all predictions and targets
|
||||
all_predictions = accelerator.gather(predictions)
|
||||
all_targets = accelerator.gather(targets)
|
||||
# Example of use with a `Datasets.Metric`
|
||||
metric.add_batch(predictions=all_predictions, references=all_targets)
|
||||
|
||||
|
||||
.. Warning::
|
||||
|
||||
Like for the training dataloader, passing your validation dataloader through
|
||||
:meth:`~accelerate.Accelerator.prepare` may change its length: if you run on X GPUs, it will have its length divided by X
|
||||
(since your actual batch size will be multiplied by X), unless you set :obj:`split_batches=True`.
|
||||
|
||||
Any instruction using your training dataloader length (for instance if you need the number of total training steps
|
||||
to create a learning rate scheduler) should go after the call to :meth:`~accelerate.Accelerator.prepare`.
|
||||
|
||||
.. Warning::
|
||||
|
||||
The :meth:`~accelerate.Accelerator.gather` method requires the tensors to be all the same size on each process. If
|
||||
you have tensors of different sizes on each process (for instance when dynamically padding to the maximum length in
|
||||
a batch), you should use the :meth:`~accelerate.Accelerator.pad_across_processes` method to pad your tensors to the
|
||||
biggest size across processes.
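For instance, a minimal sketch (assuming the :obj:`predictions` and :obj:`targets` from the evaluation loop above are padded along dimension 1, with 0 as an arbitrary padding index):

.. code-block:: python

    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=0)
    targets = accelerator.pad_across_processes(targets, dim=1, pad_index=0)
    all_predictions = accelerator.gather(predictions)
    all_targets = accelerator.gather(targets)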
|
||||
|
||||
|
||||
Launching your distributed script
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
You can use the regular commands to launch your distributed training (like :obj:`torch.distributed.launch` for
|
||||
PyTorch); they are fully compatible with 🤗 Accelerate. The only caveat here is that 🤗 Accelerate uses the environment
|
||||
to determine all useful information, so :obj:`torch.distributed.launch` should be used with the flag :obj:`--use_env`.
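For instance, a launch on a single machine with 2 GPUs (the number of processes is just an example value) could look like this:

.. code-block:: bash

    python -m torch.distributed.launch --nproc_per_node 2 --use_env path_to_script.py --args_for_the_script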
|
||||
|
||||
🤗 Accelerate also provides a CLI tool that unifies all launchers, so you only have to remember one command. To use it,
|
||||
just run
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate config
|
||||
|
||||
on your machine and reply to the questions asked. This will save a `default_config.json` file in your cache folder for
|
||||
🤗 Accelerate. That cache folder is (with decreasing order of priority):
|
||||
|
||||
- The content of your environment variable ``HF_HOME`` suffixed with `accelerate`.
|
||||
- If it does not exist, the content of your environment variable ``XDG_CACHE_HOME`` suffixed with
|
||||
`huggingface/accelerate`.
|
||||
- If this does not exist either, the folder `~/.cache/huggingface/accelerate`
|
||||
|
||||
You can also specify the location of the file you want to save with the flag :obj:`--config_file`.
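For example (the path is just a placeholder):

.. code-block:: bash

    accelerate config --config_file path_to_config.json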
|
||||
|
||||
Once this is done, you can test everything is going well on your setup by running
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate test
|
||||
|
||||
|
||||
This will launch a short script that will test the distributed environment. If it runs fine, you are ready for the next
|
||||
step!
|
||||
|
||||
Note that if you specified a location for the config file in the previous step, you need to pass it here as well:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate test --config_file path_to_config.json
|
||||
|
||||
|
||||
Now that this is done, you can run your script with the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate launch path_to_script.py --args_for_the_script
|
||||
|
||||
|
||||
If you stored the config file in a non-default location, you can indicate it to the launcher like this:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
accelerate launch --config_file path_to_config.json path_to_script.py --args_for_the_script
|
||||
|
||||
You can also override any of the arguments determined by your config file, see TODO: insert ref here.
|
||||
|
||||
|
||||
Launching training from a notebook
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
In Accelerate 0.3.0, a new :class:`~accelerate.notebook_launcher` has been introduced to help you launch your training
|
||||
function from a notebook. This launcher supports launching a training with TPUs on Colab or Kaggle, as well as training
|
||||
on several GPUs (if the machine on which you are running your notebook has them).
|
||||
|
||||
Just define a function responsible for your whole training and/or evaluation in a cell of the notebook, then execute a
|
||||
cell with the following code:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from accelerate import notebook_launcher
|
||||
|
||||
notebook_launcher(training_function)
|
||||
|
||||
.. warning::
|
||||
|
||||
Your :obj:`Accelerator` object should only be defined inside the training function. This is because the
|
||||
initialization should be done inside the launcher only.
|
||||
|
||||
|
||||
Training on TPU
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
If you want to launch your script on TPUs, there are a few caveats you should be aware of. Behind the scenes, the TPUs
|
||||
will create a graph of all the operations happening in your training step (forward pass, backward pass and optimizer
|
||||
step). This is why your first step of training will always be very long as building and compiling this graph for
|
||||
optimizations takes some time.
|
||||
|
||||
The good news is that this compilation will be cached so the second step and all the following will be much faster. The
|
||||
bad news is that it only applies if all of your steps do exactly the same operations, which implies:
|
||||
|
||||
- having all tensors of the same length in all your batches
|
||||
- having static code (i.e., not a for loop of length that could change from step to step)
|
||||
|
||||
Having any of the things above change between two steps will trigger a new compilation which will, once again, take a
|
||||
lot of time. In practice, that means you must take special care to have all your tensors in your inputs of the same
|
||||
shape (so no dynamic padding for instance if you are in an NLP problem) and should not use layers with for loops that
|
||||
have different lengths depending on the inputs (such as an LSTM) or the training will be excruciatingly slow.
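For instance, a minimal sketch of static padding (assuming a 🤗 Transformers tokenizer and a list of ``texts``; the value 128 is an arbitrary maximum length):

.. code-block:: python

    batch = tokenizer(
        texts, padding="max_length", max_length=128, truncation=True, return_tensors="pt"
    )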
|
||||
|
||||
To introduce special behavior in your script for TPUs you can check the :obj:`distributed_type` of your
|
||||
:obj:`accelerator`:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from accelerate import DistributedType
|
||||
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
# do something of static shape
|
||||
else:
|
||||
# go crazy and be dynamic
|
||||
|
||||
The `NLP example <https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py>`__ shows an example of this
|
||||
situation with dynamic padding.
|
||||
|
||||
One last thing to pay close attention to: if your model has tied weights (such as language models which tie the weights
|
||||
of the embedding matrix with the weights of the decoder), moving this model to the TPU (either yourself or after you
|
||||
passed your model to :meth:`~accelerate.Accelerator.prepare`) will break the tying. You will need to retie the weights
|
||||
after. You can find an example of this in the `run_clm_no_trainer
|
||||
<https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py>`__ script in
|
||||
the Transformers repository.
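For instance, a minimal sketch (assuming a 🤗 Transformers model, which exposes a :obj:`tie_weights` method):

.. code-block:: python

    from accelerate import DistributedType

    model = accelerator.prepare(model)
    if accelerator.distributed_type == DistributedType.TPU:
        # Re-tie the embedding and decoder weights after the model has been moved to the TPU.
        model.tie_weights()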
|
||||
|
||||
|
||||
Other caveats
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
We list here all the smaller issues you could encounter when converting your script, and how to resolve them.
|
||||
|
||||
Execute a statement only on one process
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Some of your instructions only need to run for one process on a given server: for instance a data download or a log
|
||||
statement. To do this, wrap the statement in a test like this:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
if accelerator.is_local_main_process:
|
||||
# Is executed once per server
|
||||
|
||||
Another example is progress bars: to avoid having multiple progress bars in your output, you should only display one on
|
||||
the local main process:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||
|
||||
The `local` means per machine: if you are running your training on two servers with several GPUs, the instruction will
|
||||
be executed once on each of those servers. If you need to execute something only once for all processes (and not per
|
||||
machine), for instance uploading the final model to the 🤗 model hub, wrap it in a test like this:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
if accelerator.is_main_process:
|
||||
# Is executed once only
|
||||
|
||||
For printing statements you only want executed once per machine, you can just replace the :obj:`print` function with
|
||||
:obj:`accelerator.print`.
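For example (the epoch counter and accuracy metric are assumed to exist in your loop):

.. code-block:: python

    accelerator.print(f"epoch {epoch}: accuracy {accuracy:.4f}")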
|
||||
|
||||
|
||||
Defer execution
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When you run your usual script, instructions are executed in order. Using 🤗 Accelerate to deploy your script on several
|
||||
GPUs at the same time introduces a complication: while each process executes all instructions in order, some may be
|
||||
faster than others.
|
||||
|
||||
You might need to wait for all processes to have reached a certain point before executing a given instruction. For
|
||||
instance, you shouldn't save a model before being sure every process is done with training. To do this, just write the
|
||||
following line in your code:
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
This instruction will block all the processes that arrive there first until all the other processes have reached that
|
||||
point (if you run your script on just one GPU or CPU, this won't do anything).
|
||||
|
||||
|
||||
Saving/loading a model
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Saving the model you trained might need a bit of adjustment: first you should wait for all processes to reach that
|
||||
point in the script as shown above, and then, you should unwrap your model before saving it. This is because when going
|
||||
through the :meth:`~accelerate.Accelerator.prepare` method, your model may have been placed inside a bigger model,
|
||||
which deals with the distributed training. This in turn means that saving your model state dictionary without taking
|
||||
any precaution will take that potential extra layer into account, and you will end up with weights you can't load back
|
||||
in your base model.
|
||||
|
||||
This is why it's recommended to `unwrap` your model first. Here is an example:
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
accelerator.save(unwrapped_model.state_dict(), filename)
|
||||
|
||||
If your script contains logic to load a checkpoint, we also recommend you load your weights in the unwrapped model
|
||||
(this is only useful if you use the load function after making your model go through
|
||||
:meth:`~accelerate.Accelerator.prepare`). Here is an example:
|
||||
|
||||
.. code-block::
|
||||
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.load_state_dict(torch.load(filename))
|
||||
|
||||
Note that since all the model parameters are references to tensors, this will load your weights inside :obj:`model`.
|
||||
|
||||
Gradient clipping
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you are using gradient clipping in your script, you should replace the calls to
|
||||
:obj:`torch.nn.utils.clip_grad_norm_` or :obj:`torch.nn.utils.clip_grad_value_` with :obj:`accelerator.clip_grad_norm_`
|
||||
and :obj:`accelerator.clip_grad_value_` respectively.
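For instance, a minimal sketch (the value 1.0 is just an example maximum norm):

.. code-block:: python

    accelerator.backward(loss)
    accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()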
|
||||
|
||||
|
||||
Mixed Precision training
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you are running your training in Mixed Precision with Accelerate, you will get the best result with your loss being
|
||||
computed inside your model (like in Transformer models for instance). Every computation outside of the model will be
|
||||
executed in full precision (which is generally what you want for loss computation, especially if it involves a
|
||||
softmax). However you might want to put your loss computation inside the `accelerator.autocast` context manager:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
with accelerator.autocast():
|
||||
loss = complex_loss_function(outputs, target)
|
||||
|
||||
Another caveat with Mixed Precision training is that the optimizer step will be skipped a few times at the beginning and
|
||||
sometimes during training: because of the dynamic loss scaling strategy, there are points during training where the
|
||||
gradients have overflowed, and the loss scaling factor is reduced to avoid this happening again at the next step.
|
||||
|
||||
This means that you may update your learning rate scheduler when there was no update, which is fine in general, but may
|
||||
have an impact when you have very little training data, or if the first learning rate values of your scheduler are very
|
||||
important. In this case, you can skip the learning rate scheduler updates when the optimizer step was not done like
|
||||
this:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
if not accelerator.optimizer_step_was_skipped:
|
||||
lr_scheduler.step()
|
||||
|
||||
|
||||
Internal mechanism
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Internally, the library works by first analyzing the environment in which the script is launched to determine which
|
||||
kind of distributed setup is used, how many different processes there are and which one the current script is in. All
|
||||
that information is stored in the :class:`~accelerate.state.AcceleratorState`.
|
||||
|
||||
This class is initialized the first time you instantiate an :class:`~accelerate.Accelerator` and performs any
|
||||
specific initialization your distributed setup needs. Its state is then uniquely shared through all instances of
|
||||
:class:`~accelerate.state.AcceleratorState`.
|
||||
|
||||
Then, when calling :meth:`~accelerate.Accelerator.prepare`, the library:
|
||||
|
||||
- wraps your model(s) in the container adapted for the distributed setup,
|
||||
- wraps your optimizer(s) in a :class:`~accelerate.optimizer.AcceleratedOptimizer`,
|
||||
- creates a new version of your dataloader(s) in a :class:`~accelerate.data_loader.DataLoaderShard`.
|
||||
|
||||
While the model(s) and optimizer(s) are just put in simple wrappers, the dataloader(s) are re-created. This is mostly
|
||||
because PyTorch does not let the user change the :obj:`batch_sampler` of a dataloader once it's been created and the
|
||||
library handles the sharding of your data between processes by changing that :obj:`batch_sampler` to yield every other
|
||||
:obj:`num_processes` batches.
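Conceptually (this is a simplified sketch, not the actual implementation), the sharded batch sampler behaves like this:

.. code-block:: python

    def sharded_batches(batch_sampler, num_processes, process_index):
        # Each process only yields one batch out of every `num_processes`.
        for batch_idx, batch in enumerate(batch_sampler):
            if batch_idx % num_processes == process_index:
                yield batch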
|
||||
|
||||
The :class:`~accelerate.data_loader.DataLoaderShard` subclasses :obj:`DataLoader` to add the following functionality:
|
||||
|
||||
- it synchronizes the appropriate random number generator of all processes at each new iteration, to ensure any
|
||||
randomization (like shuffling) is done the exact same way across processes.
|
||||
- it puts the batches on the proper device before yielding them (unless you have opted out of
|
||||
:obj:`device_placement=True`).
|
||||
|
||||
The random number generator synchronization will by default synchronize:
|
||||
|
||||
- the :obj:`generator` attribute of a given sampler (like the PyTorch :obj:`RandomSampler`) for PyTorch >= 1.6
|
||||
- the main random number generator in PyTorch <=1.5.1
|
||||
|
||||
You can choose which random number generator(s) to synchronize with the :obj:`rng_types` argument of the main
|
||||
:class:`~accelerate.Accelerator`. In PyTorch >= 1.6, it is recommended to rely on local :obj:`generator` to avoid
|
||||
setting the same seed in the main random number generator in all processes.
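For example, a minimal sketch restricting synchronization to the dataloader :obj:`generator` (assuming PyTorch >= 1.6):

.. code-block:: python

    accelerator = Accelerator(rng_types=["generator"])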
|
||||
|
||||
.. Warning::
|
||||
|
||||
Synchronizing the main torch (or CUDA or XLA) random number generator will affect any other potential random
|
||||
artifacts you could have in your dataset (like random data augmentation) in the sense that all processes will get the
|
||||
same random numbers from the torch random modules (so will apply the same random data augmentation if it's
|
||||
controlled by torch).
|
||||
|
||||
.. Note::
|
||||
|
||||
The randomization part of your custom sampler, batch sampler or iterable dataset should be done using a local
|
||||
:obj:`torch.Generator` object (in PyTorch >= 1.6); see the traditional :obj:`RandomSampler` as an example.
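For example, a minimal sketch of a sampler driven by a local generator (assuming an existing ``dataset`` and PyTorch >= 1.6):

.. code-block:: python

    import torch

    generator = torch.Generator()
    generator.manual_seed(42)
    sampler = torch.utils.data.RandomSampler(dataset, generator=generator)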
|
||||
|
||||
See more details about the internals in the :doc:`Internals page <internal>`.
|
||||
150
docs/source/sagemaker.mdx
Normal file
@@ -0,0 +1,150 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Amazon SageMaker
|
||||
|
||||
Hugging Face and Amazon introduced new [Hugging Face Deep Learning Containers (DLCs)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) to
|
||||
make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Setup & Installation
|
||||
|
||||
|
||||
Before you can run your 🤗 Accelerate scripts on Amazon SageMaker you need to sign up for an AWS account. If you do not
|
||||
have an AWS account yet, learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
|
||||
|
||||
After you have your AWS account, you need to install the `sagemaker` SDK for 🤗 Accelerate with:
|
||||
|
||||
```bash
|
||||
pip install "accelerate[sagemaker]" --upgrade
|
||||
```
|
||||
|
||||
🤗 Accelerate currently uses the 🤗 DLCs, with `transformers`, `datasets` and `tokenizers` pre-installed. 🤗
|
||||
Accelerate is not in the DLC yet (will soon be added!) so to use it within Amazon SageMaker you need to create a
|
||||
`requirements.txt` in the same directory where your training script is located and add it as a dependency.
|
||||
|
||||
```
|
||||
accelerate
|
||||
```
|
||||
|
||||
You should also add any other dependencies you have to this `requirements.txt`.
|
||||
|
||||
|
||||
### Configure 🤗 Accelerate
|
||||
|
||||
You can configure the launch configuration for Amazon SageMaker the same as you do for non-SageMaker training jobs with
|
||||
the 🤗 Accelerate CLI.
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
# In which compute environment are you running? ([0] This machine, [1] AWS (Amazon SageMaker)): 1
|
||||
```
|
||||
|
||||
🤗 Accelerate will go through a questionnaire about your Amazon SageMaker setup and create a config file you can edit.
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Accelerate is not saving any of your credentials.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Prepare a 🤗 Accelerate fine-tuning script
|
||||
|
||||
The training script is very similar to a training script you might run outside of SageMaker, but to save your model
|
||||
after training you need to specify either `/opt/ml/model` or use `os.environ["SM_MODEL_DIR"]` as your save
|
||||
directory. After training, artifacts in this directory are uploaded to S3.
|
||||
|
||||
|
||||
```diff
|
||||
- torch.save('/opt/ml/model')
|
||||
+ accelerator.save('/opt/ml/model')
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
SageMaker doesn’t support argparse actions. If you want to use, for example, boolean hyperparameters, you need to
|
||||
specify type as bool in your script and provide an explicit True or False value for this hyperparameter. [[REF]](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#prepare-a-pytorch-training-script).
|
||||
|
||||
</Tip>
|
||||
|
||||
### Launch Training
|
||||
|
||||
You can launch your training with the 🤗 Accelerate CLI with:
|
||||
|
||||
```
|
||||
accelerate launch path_to_script.py --args_to_the_script
|
||||
```
|
||||
|
||||
This will launch your training script using your configuration. The only thing you have to do is provide all the
|
||||
arguments needed by your training script as named arguments.
|
||||
|
||||
**Examples**
|
||||
|
||||
<Tip>
|
||||
|
||||
If you run one of the example scripts, don't forget to add `accelerator.save('/opt/ml/model')` to it.
|
||||
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
accelerate launch ./examples/sagemaker_example.py
|
||||
```
|
||||
|
||||
Outputs:
|
||||
|
||||
```
|
||||
Configuring Amazon SageMaker environment
|
||||
Converting Arguments to Hyperparameters
|
||||
Creating Estimator
|
||||
2021-04-08 11:56:50 Starting - Starting the training job...
|
||||
2021-04-08 11:57:13 Starting - Launching requested ML instancesProfilerReport-1617883008: InProgress
|
||||
.........
|
||||
2021-04-08 11:58:54 Starting - Preparing the instances for training.........
|
||||
2021-04-08 12:00:24 Downloading - Downloading input data
|
||||
2021-04-08 12:00:24 Training - Downloading the training image..................
|
||||
2021-04-08 12:03:39 Training - Training image download completed. Training in progress..
|
||||
........
|
||||
epoch 0: {'accuracy': 0.7598039215686274, 'f1': 0.8178438661710037}
|
||||
epoch 1: {'accuracy': 0.8357843137254902, 'f1': 0.882249560632689}
|
||||
epoch 2: {'accuracy': 0.8406862745098039, 'f1': 0.8869565217391304}
|
||||
........
|
||||
2021-04-08 12:05:40 Uploading - Uploading generated training model
|
||||
2021-04-08 12:05:40 Completed - Training job completed
|
||||
Training seconds: 331
|
||||
Billable seconds: 331
|
||||
You can find your model data at: s3://your-bucket/accelerate-sagemaker-1-2021-04-08-11-56-47-108/output/model.tar.gz
|
||||
```
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Distributed Training: Data Parallelism
|
||||
|
||||
*currently in development, will be supported soon.*
|
||||
|
||||
### Distributed Training: Model Parallelism
|
||||
|
||||
*currently in development, will be supported soon.*
|
||||
|
||||
### Python packages and dependencies
|
||||
|
||||
🤗 Accelerate currently uses the 🤗 DLCs, with `transformers`, `datasets` and `tokenizers` pre-installed. If you
|
||||
want to use different/other Python packages you can do this by adding them to the `requirements.txt`. These packages
|
||||
will be installed before your training script is started.
|
||||
|
||||
### Remote scripts: Use scripts located on Github
|
||||
|
||||
*undecided if feature is needed. Contact us if you would like this feature.*
|
||||
|
||||
### Use Spot Instances
|
||||
|
||||
*undecided if feature is needed. Contact us if you would like this feature.*
|
||||
@@ -1,169 +0,0 @@
|
||||
..
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
Amazon SageMaker
|
||||
=======================================================================================================================
|
||||
|
||||
Hugging Face and Amazon introduced new `Hugging Face Deep Learning Containers (DLCs)
|
||||
<https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers>`_ to
|
||||
make it easier than ever to train Hugging Face Transformer models in `Amazon SageMaker
|
||||
<https://aws.amazon.com/sagemaker/>`_.
|
||||
|
||||
This guide shows you how to use the new 🤗 DLCs with Amazon SageMaker to run your 🤗 Accelerate scripts and raw training loops.
|
||||
|
||||
|
||||
|
||||
Getting Started
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Setup & Installation
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
Before you can run your 🤗 Accelerate scripts on Amazon SageMaker you need to sign up for an AWS account. If you do not
|
||||
have an AWS account yet learn more `here <https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html>`__.
|
||||
|
||||
After you have your AWS Account you need to install the ``sagemaker`` sdk for 🤗 Accelerate with.
|
||||
|
||||
.. code-block::
|
||||
|
||||
pip install "accelerate[sagemaker]" --upgrade
|
||||
|
||||
|
||||
🤗 Accelerate currently uses the 🤗 DLCs, with ``transformers``, ``datasets`` and ``tokenizers`` pre-installed. 🤗
|
||||
Accelerate is not in the DLC yet (will soon be added!) so to use it within Amazon SageMaker you need to create a
|
||||
``requirements.txt`` in the same directory where your training script is located and add it as dependency.
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerate
|
||||
|
||||
You should also add any other dependencies you have to this ``requirements.txt``.
|
||||
|
||||
|
||||
Configure 🤗 Accelerate
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
You can configure the launch configuration for Amazon SageMaker the same as you do for non SageMaker training jobs with
|
||||
the 🤗 Accelerate CLI.
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerate config
|
||||
# In which compute environment are you running? ([0] This machine, [1] AWS (Amazon SageMaker)): 1
|
||||
|
||||
|
||||
🤗 Accelerate will go through a questionnaire about your Amazon SageMaker setup and create a config file you can edit.
|
||||
|
||||
.. note::
|
||||
🤗 Accelerate is not saving any of your credentials.
|
||||
|
||||
|
||||
Prepare a 🤗 Accelerate fine-tuning script
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The training script is very similar to a training script you might run outside of SageMaker, but to save your model
|
||||
after training you need to specify either ``/opt/ml/model`` or use ``os.environ["SM_MODEL_DIR"]`` as your save
|
||||
directory. After training, artifacts in this directory are uploaded to S3.
|
||||
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
- torch.save('/opt/ml/model`)
|
||||
+ accelerator.save('/opt/ml/model')
|
||||
|
||||
|
||||
.. warning::
|
||||
SageMaker doesn’t support argparse actions. If you want to use, for example, boolean hyperparameters, you need to
|
||||
specify type as bool in your script and provide an explicit True or False value for this hyperparameter. `[REF]
|
||||
<https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#prepare-a-pytorch-training-script>`__.
|
||||
|
||||
|
||||
Launch Training
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
You can launch your training with 🤗 Accelerate CLI with
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerate launch path_to_script.py --args_to_the_script
|
||||
|
||||
|
||||
This will launch your training script using your configuration. The only thing you have to do is provide all the
|
||||
arguments needed by your training script as named arguments.
|
||||
|
||||
**Examples**
|
||||
|
||||
.. note::
|
||||
If you run one of the example scripts, don't forget to add ``accelerator.save('/opt/ml/model')`` to it.
|
||||
|
||||
.. code-block::
|
||||
|
||||
accelerate launch ./examples/sagemaker_example.py
|
||||
|
||||
|
||||
Outputs:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Configuring Amazon SageMaker environment
|
||||
Converting Arguments to Hyperparameters
|
||||
Creating Estimator
|
||||
2021-04-08 11:56:50 Starting - Starting the training job...
|
||||
2021-04-08 11:57:13 Starting - Launching requested ML instancesProfilerReport-1617883008: InProgress
|
||||
.........
|
||||
2021-04-08 11:58:54 Starting - Preparing the instances for training.........
|
||||
2021-04-08 12:00:24 Downloading - Downloading input data
|
||||
2021-04-08 12:00:24 Training - Downloading the training image..................
|
||||
2021-04-08 12:03:39 Training - Training image download completed. Training in progress..
|
||||
........
|
||||
epoch 0: {'accuracy': 0.7598039215686274, 'f1': 0.8178438661710037}
|
||||
epoch 1: {'accuracy': 0.8357843137254902, 'f1': 0.882249560632689}
|
||||
epoch 2: {'accuracy': 0.8406862745098039, 'f1': 0.8869565217391304}
|
||||
........
|
||||
2021-04-08 12:05:40 Uploading - Uploading generated training model
|
||||
2021-04-08 12:05:40 Completed - Training job completed
|
||||
Training seconds: 331
|
||||
Billable seconds: 331
|
||||
You can find your model data at: s3://your-bucket/accelerate-sagemaker-1-2021-04-08-11-56-47-108/output/model.tar.gz
|
||||
|
||||
|
||||
|
||||
Advanced Features
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
Distributed Training: Data Parallelism
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
*currently in development, will be supported soon.*
|
||||
|
||||
Distributed Training: Model Parallelism
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
*currently in development, will be supported soon.*
|
||||
|
||||
Python packages and dependencies
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
🤗 Accelerate currently uses the 🤗 DLCs, with ``transformers``, ``datasets`` and ``tokenizers`` pre-installed. If you
|
||||
want to use different/other Python packages you can do this by adding them to the ``requirements.txt``. These packages
|
||||
will be installed before your training script is started.
|
||||
|
||||
Remote scripts: Use scripts located on Github
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
*undecided if feature is needed. Contact us if you would like this feature.*
|
||||
|
||||
Use Spot Instances
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
*undecided if feature is needed. Contact us if you would like this feature.*
|
||||
163
docs/source/tracking.mdx
Normal file
@@ -0,0 +1,163 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Tracking
|
||||
|
||||
There are a large number of experiment tracking APIs available; however, getting them all to work in a multi-processing environment can often be complex.
|
||||
Accelerate provides a general tracking API that can be used to log useful items during your script through [`~Accelerator.log`].
|
||||
|
||||
## Integrated Trackers
|
||||
|
||||
Currently `Accelerate` supports three trackers out-of-the-box:
|
||||
|
||||
|
||||
[[autodoc]] tracking.TensorBoardTracker
|
||||
|
||||
[[autodoc]] tracking.WandBTracker
|
||||
|
||||
[[autodoc]] tracking.CometMLTracker
|
||||
|
||||
To use any of them, pass in the selected type(s) to the `log_with` parameter in [`Accelerator`]:
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import LoggerType
|
||||
|
||||
accelerator = Accelerator(log_with="all") # For all available trackers in the environment
|
||||
accelerator = Accelerator(log_with="wandb")
|
||||
accelerator = Accelerator(log_with=["wandb", LoggerType.TENSORBOARD])
|
||||
```
|
||||
|
||||
At the start of your experiment [`~Accelerator.init_trackers`] should be used to setup your project, and potentially add any experiment hyperparameters to be logged:
|
||||
```python
|
||||
hps = {"num_iterations": 5, "learning_rate": 1e-2}
|
||||
accelerator.init_trackers("my_project", config=hps)
|
||||
```
|
||||
|
||||
When you are ready to log any data, [`~Accelerator.log`] should be used.
|
||||
A `step` can also be passed in to correlate the data with a particular step in the training loop.
|
||||
```python
|
||||
accelerator.log({"train_loss": 1.12, "valid_loss": 0.8}, step=1)
|
||||
```
|
||||
|
||||
Once you've finished training, make sure to run [`~Accelerator.end_training`] so that all the trackers can run their finish functionalities if they have any.
|
||||
```python
|
||||
accelerator.end_training()
|
||||
```
|
||||
|
||||
|
||||
A full example is below:
|
||||
```python
|
||||
from accelerate import Accelerator
|
||||
|
||||
accelerator = Accelerator(log_with="all")
|
||||
config = {
|
||||
"num_iterations": 5,
|
||||
"learning_rate": 1e-2,
|
||||
"loss_function": str(my_loss_function),
|
||||
}
|
||||
|
||||
accelerator.init_trackers("example_project", config=config)
|
||||
|
||||
my_model, my_optimizer, my_training_dataloader = accelerator.prepare(my_model, my_optimizer, my_training_dataloader)
|
||||
device = accelerator.device
|
||||
my_model.to(device)
|
||||
|
||||
for iteration in range(config["num_iterations"]):
|
||||
for step, batch in enumerate(my_training_dataloader):
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
accelerator.backward(loss)
|
||||
my_optimizer.step()
|
||||
accelerator.log({"training_loss": loss}, step=step)
|
||||
accelerator.end_training()
|
||||
```
|
||||
|
||||
|
||||
## Implementing Custom Trackers
|
||||
|
||||
To implement a new tracker to be used in `Accelerator`, one can be created by subclassing the [`~GeneralTracker`] class.
|
||||
Every tracker must implement three functions:
|
||||
- `__init__`:
|
||||
- Should store a `run_name` and initialize the tracker API of the integrated library.
|
||||
- If a tracker stores their data locally (such as TensorBoard), a `logging_dir` parameter can be added.
|
||||
- `store_init_configuration`:
|
||||
- Should take in a `values` dictionary and store them as a one-time experiment configuration
|
||||
- `log`:
|
||||
- Should take in a `values` dictionary and a `step`, and should log them to the run
|
||||
|
||||
A brief example can be seen below with an integration with Weights and Biases, containing only the relevant information:
|
||||
```python
|
||||
from accelerate.tracking import GeneralTracker
|
||||
from typing import Optional
|
||||
|
||||
import wandb
|
||||
|
||||
|
||||
class MyCustomTracker(GeneralTracker):
|
||||
def __init__(self, run_name: str):
|
||||
self.run_name = run_name
|
||||
wandb.init(project=self.run_name)
|
||||
|
||||
def store_init_configuration(self, values: dict):
|
||||
wandb.config.update(values)
|
||||
|
||||
def log(self, values: dict, step: Optional[int] = None):
|
||||
wandb.log(values, step=step)
|
||||
```
|
||||
|
||||
When you are ready to build your `Accelerator` object, pass in an **instance** of your tracker to [`~Accelerator.log_with`] to have it automatically
|
||||
be used with the API:
|
||||
|
||||
```python
|
||||
tracker = MyCustomTracker("some_run_name")
|
||||
accelerator = Accelerator(log_with=tracker)
|
||||
```
|
||||
|
||||
These also can be mixed with existing trackers, including with `"all"`:
|
||||
|
||||
```python
|
||||
tracker = MyCustomTracker("some_run_name")
|
||||
accelerator = Accelerator(log_with=[tracker, "all"])
|
||||
```
|
||||
|
||||
## When a wrapper cannot work
|
||||
|
||||
If a library has an API that does not follow a strict `.log` with an overall dictionary such as Neptune.AI, logging can be done manually under an `if accelerator.is_main_process` statement:
|
||||
```diff
|
||||
from accelerate import Accelerator
|
||||
+ import neptune.new as neptune
|
||||
|
||||
accelerator = Accelerator()
|
||||
+ run = neptune.init(...)
|
||||
|
||||
my_model, my_optimizer, my_training_dataloader = accelerator.prepare(my_model, my_optimizer, my_training_dataloader)
|
||||
device = accelerator.device
|
||||
my_model.to(device)
|
||||
|
||||
for iteration in range(config["num_iterations"]):
|
||||
for batch in my_training_dataloader:
|
||||
my_optimizer.zero_grad()
|
||||
inputs, targets = batch
|
||||
inputs = inputs.to(device)
|
||||
targets = targets.to(device)
|
||||
outputs = my_model(inputs)
|
||||
loss = my_loss_function(outputs, targets)
|
||||
total_loss += loss
|
||||
accelerator.backward(loss)
|
||||
my_optimizer.step()
|
||||
+ if accelerator.is_main_process:
|
||||
+ run["logs/training/batch/loss"].log(loss)
|
||||
```
|
||||
@@ -183,3 +183,23 @@ To run it in each of these various modes, use the following commands:
|
||||
```
|
||||
* In PyTorch:
|
||||
Add an `xmp.spawn` line in your script as you usually do.
|
||||
|
||||
## Finer Examples
|
||||
|
||||
While the first two scripts are extremely barebones when it comes to what you can do with accelerate, more advanced features are documented in two other locations.
|
||||
|
||||
### `by_feature` examples
|
||||
|
||||
These scripts are *individual* examples highlighting one particular feature or use-case within Accelerate. They all stem from the [nlp_example.py](./nlp_example.py) script, and any changes or modifications are denoted with a `# New Code #` comment.
|
||||
|
||||
Read the README.md file located in the `by_feature` folder for more information.
|
||||
|
||||
### `complete_*` examples
|
||||
|
||||
These two scripts contain *every* single feature currently available in Accelerate in one place, as one giant script.
|
||||
|
||||
New arguments that can be passed include:
|
||||
|
||||
- `checkpointing_steps`, whether the various states should be saved at the end of every `n` steps, or `"epoch"` for each epoch. States are then saved to folders named `step_{n}` or `epoch_{n}`
|
||||
- `resume_from_checkpoint`, should be used if you want to resume training from a previous call to the script that passed `checkpointing_steps` to it.
|
||||
- `with_tracking`, should be used if you want to log the training run using all available experiment trackers in your environment. Currently supported trackers include TensorBoard, Weights and Biases, and CometML.
|
||||
|
||||
68
examples/by_feature/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# What are these scripts?
|
||||
|
||||
All scripts in this folder originate from the `nlp_example.py` file, as it is a very simplistic NLP training example using Accelerate with zero extra features.
|
||||
|
||||
From there, each further script adds in just **one** feature of Accelerate, showing how you can quickly modify your own scripts to implement these capabilities.
|
||||
|
||||
A full example with all of these parts integrated together can be found in the `complete_nlp_example.py` script and `complete_cv_example.py` script.
|
||||
|
||||
Adjustments to each script from the base `nlp_example.py` file can be found quickly by searching for "# New Code #"
|
||||
|
||||
## Example Scripts by Feature and their Arguments
|
||||
|
||||
### Base Example (`../nlp_example.py`)
|
||||
|
||||
- Shows how to use `Accelerator` in an extremely simplistic PyTorch training loop
|
||||
- Arguments available:
|
||||
- `mixed_precision`, whether to use mixed precision. ("no", "fp16", or "bf16")
|
||||
- `cpu`, whether to train using only the CPU. (yes/no/1/0)
|
||||
|
||||
All following scripts also accept these arguments in addition to their added ones.
|
||||
|
||||
These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:
|
||||
|
||||
```bash
|
||||
accelerate launch ../nlp_example.py --mixed_precision fp16 --cpu 0
|
||||
```
|
||||
|
||||
### Checkpointing and Resuming Training (`checkpointing.py`)
|
||||
|
||||
- Shows how to use `Accelerator.save_state` and `Accelerator.load_state` to save or continue training
|
||||
- **It is assumed you are continuing off the same training script**
|
||||
- Arguments available:
|
||||
- `checkpointing_steps`, after how many steps the various states should be saved. ("epoch", 1, 2, ...)
|
||||
- `output_dir`, where saved state folders should be saved to, default is current working directory
|
||||
- `resume_from_checkpoint`, what checkpoint folder to resume from. ("epoch_0", "step_22", ...)
|
||||
|
||||
These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:
|
||||
|
||||
(Note, `resume_from_checkpoint` assumes that we've run the script for one epoch with the `--checkpointing_steps epoch` flag)
|
||||
|
||||
```bash
|
||||
accelerate launch ./checkpointing.py --checkpointing_steps epoch --output_dir "checkpointing_tutorial" --resume_from_checkpoint "checkpointing_tutorial/epoch_0"
|
||||
```
|
||||
|
||||
### Experiment Tracking (`tracking.py`)
|
||||
|
||||
- Shows how to use `Accelerator.init_trackers` and `Accelerator.log`
|
||||
- Can be used with Weights and Biases, TensorBoard, or CometML.
|
||||
- Arguments available:
|
||||
- `with_tracking`, whether to load in all available experiment trackers from the environment.
|
||||
|
||||
These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:
|
||||
|
||||
```bash
|
||||
accelerate launch ./tracking.py --with_tracking
|
||||
```
|
||||
|
||||
### Cross Validation (`cross_validation.py`)
|
||||
|
||||
- Shows how to use `Accelerator.free_memory` and run cross validation efficiently with `datasets`.
|
||||
- Arguments available:
|
||||
- `num_folds`, the number of folds the training dataset should be split into.
|
||||
|
||||
These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:
|
||||
|
||||
```bash
|
||||
accelerate launch ./cross_validation.py --num_folds 2
|
||||
```
|
||||
297
examples/by_feature/checkpointing.py
Normal file
@@ -0,0 +1,297 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate,
|
||||
# specifically showcasing the checkpointing capability,
|
||||
# and builds off the `nlp_example.py` script.
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To help focus on the differences in the code, building `DataLoaders`
|
||||
# was refactored into its own function.
|
||||
# New additions from the base script can be found quickly by
|
||||
# looking for the # New Code # tags
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
# New Code #
|
||||
# Parse out whether we are saving every epoch or after a certain number of batches
|
||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||
if args.checkpointing_steps == "epoch":
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
elif args.checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(args.checkpointing_steps)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
|
||||
)
|
||||
else:
|
||||
checkpointing_steps = None
|
||||
|
||||
set_seed(seed)
|
||||
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# We need to keep track of how many total steps we have iterated over
|
||||
overall_step = 0
|
||||
# We also need to keep track of the starting epoch so files are named properly
|
||||
starting_epoch = 0
|
||||
|
||||
# We need to load the checkpoint back in before training here with `load_state`
|
||||
# The total number of epochs is adjusted based on where the state is being loaded from,
|
||||
# as we assume continuation of the same training script
|
||||
if args.resume_from_checkpoint:
|
||||
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
accelerator.load_state(args.resume_from_checkpoint)
|
||||
path = os.path.basename(args.resume_from_checkpoint)
|
||||
else:
|
||||
# Get the most recent checkpoint
|
||||
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||
dirs.sort(key=os.path.getctime)
|
||||
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||
# Extract `epoch_{i}` or `step_{i}`
|
||||
training_difference = os.path.splitext(path)[0]
|
||||
|
||||
if "epoch" in training_difference:
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(starting_epoch, num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# New Code #
|
||||
# We need to skip steps until we reach the resumed step during the first epoch
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
overall_step += 1
|
||||
continue
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
# New Code #
|
||||
overall_step += 1
|
||||
|
||||
# New Code #
|
||||
# We save the model, optimizer, lr_scheduler, and seed states by calling `save_state`
|
||||
# These are saved to folders named `step_{overall_step}`
|
||||
# Will contain files: "pytorch_model.bin", "optimizer.bin", "scheduler.bin", and "random_states.pkl"
|
||||
# If mixed precision was used, will also save a "scaler.bin" file
|
||||
if isinstance(checkpointing_steps, int):
|
||||
output_dir = f"step_{overall_step}"
|
||||
if overall_step % checkpointing_steps == 0:
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True` (the default).
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
# It is slightly faster to call this once rather than multiple times
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
# New Code #
|
||||
# We save the model, optimizer, lr_scheduler, and seed states by calling `save_state`
|
||||
# These are saved to folders named `epoch_{epoch}`
|
||||
# Will contain files: "pytorch_model.bin", "optimizer.bin", "scheduler.bin", and "random_states.pkl"
|
||||
# If mixed precision was used, will also save a "scaler.bin" file
|
||||
if checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default=".",
|
||||
help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
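The checkpointing pattern in this script reduces to two calls: `accelerator.save_state(folder)` while training and `accelerator.load_state(folder)` before resuming. A minimal, self-contained sketch of that round trip (the toy model and the `ckpt_dir` name are illustrative assumptions, not part of the script):

import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

ckpt_dir = "step_100"              # hypothetical folder name, mirroring the naming scheme above
accelerator.save_state(ckpt_dir)   # writes model, optimizer and RNG states into the folder
# ... later, before resuming training:
accelerator.load_state(ckpt_dir)   # restores those exact states so training can continue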
275
examples/by_feature/cross_validation.py
Normal file
@ -0,0 +1,275 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import DatasetDict, load_dataset, load_metric
|
||||
|
||||
# New Code #
|
||||
# We'll be using StratifiedKFold for this example
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate,
|
||||
# specifically showcasing how to perform Cross Validation,
|
||||
# and builds off the `nlp_example.py` script.
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUs (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To help focus on the differences in the code, building `DataLoaders`
|
||||
# was refactored into its own function.
|
||||
# New additions from the base script can be found quickly by
|
||||
# looking for the # New Code # tags
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
# New Code #
|
||||
# We need a different `get_dataloaders` function that will build dataloaders by indices
|
||||
|
||||
|
||||
def get_fold_dataloaders(
|
||||
accelerator: Accelerator, dataset: DatasetDict, train_idxs: List[int], valid_idxs: List[int], batch_size: int = 16
|
||||
):
|
||||
"""
|
||||
Gets a set of train, valid, and test dataloaders for a particular fold
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
The main `Accelerator` object
|
||||
train_idxs (list of `int`):
|
||||
The split indices for the training dataset
|
||||
valid_idxs (list of `int`):
|
||||
The split indices for the validation dataset
|
||||
batch_size (`int`):
|
||||
The size of the minibatch. Default is 16
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = DatasetDict(
|
||||
{
|
||||
"train": dataset["train"].select(train_idxs),
|
||||
"validation": dataset["train"].select(valid_idxs),
|
||||
"test": dataset["validation"],
|
||||
}
|
||||
)
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
test_dataloader = DataLoader(
|
||||
tokenized_datasets["test"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader, test_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# New Code #
|
||||
test_labels = None
|
||||
test_predictions = []
|
||||
# Download the dataset
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
# Create our splits
|
||||
kfold = StratifiedKFold(n_splits=int(args.num_folds))
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
set_seed(seed)
|
||||
|
||||
# New Code #
|
||||
# Create our folds:
|
||||
folds = kfold.split(np.zeros(datasets["train"].num_rows), datasets["train"]["label"])
|
||||
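# Added note: `StratifiedKFold.split` only looks at the labels to stratify, which is why a
# dummy zeros array of matching length is passed as X. Each iteration yields two integer
# index arrays (train_idxs, valid_idxs) that together cover every row of the "train" split.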
|
||||
# Iterate over them
|
||||
for train_idxs, valid_idxs in folds:
|
||||
train_dataloader, eval_dataloader, test_dataloader = get_fold_dataloaders(
|
||||
accelerator,
|
||||
datasets,
|
||||
train_idxs,
|
||||
valid_idxs,
|
||||
)
|
||||
if test_labels is None:
|
||||
test_labels = datasets["validation"]["label"]
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
# New Code #
|
||||
# We also run predictions on the test set at the very end
|
||||
fold_predictions = []
|
||||
for step, batch in enumerate(test_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
fold_predictions.append(predictions.cpu())
|
||||
metric.add_batch(
|
||||
predictions=predictions.argmax(dim=-1),
|
||||
references=references,
|
||||
)
|
||||
test_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
test_predictions.append(torch.cat(fold_predictions, dim=0))
|
||||
# We now need to release all our memory and get rid of the current model, optimizer, etc
|
||||
accelerator.free_memory()
|
||||
# New Code #
|
||||
# Finally we check the accuracy of our folded results:
|
||||
preds = torch.stack(test_predictions, dim=0).sum(dim=0).div(int(args.num_folds)).argmax(dim=-1)
|
||||
test_metric = metric.compute(predictions=preds, references=test_labels)
|
||||
accelerator.print("Average test metrics from all folds:", test_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
# New Code #
|
||||
parser.add_argument("--num_folds", type=int, default=3, help="The number of splits to perform across the dataset")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
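The final ensembling step in this script simply averages the test logits collected from each fold before taking the argmax. A standalone sketch of that reduction with toy tensors (the shapes below are invented for illustration):

import torch

num_folds, num_samples, num_classes = 3, 8, 2   # toy sizes
fold_logits = [torch.randn(num_samples, num_classes) for _ in range(num_folds)]

# Stack to (num_folds, num_samples, num_classes), average over the fold axis, then argmax per sample
avg_logits = torch.stack(fold_logits, dim=0).sum(dim=0).div(num_folds)
ensembled_preds = avg_logits.argmax(dim=-1)      # one predicted label per test sample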
218
examples/by_feature/memory.py
Normal file
@ -0,0 +1,218 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
|
||||
# New Code #
|
||||
from accelerate.memory_utils import find_executable_batch_size
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate,
|
||||
# specifically showcasing how to ensure out-of-memory errors never
|
||||
# interrupt training, and builds off the `nlp_example.py` script.
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUs (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# New additions from the base script can be found quickly by
|
||||
# looking for the # New Code # tags
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
set_seed(seed)
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# New Code #
|
||||
# We can now define an inner training loop function. It should take the batch size as its only parameter,
|
||||
# and build the dataloaders in there.
|
||||
# It also gets our decorator
|
||||
@find_executable_batch_size(starting_batch_size=batch_size)
|
||||
def inner_training_loop(batch_size):
|
||||
# And now just move everything below under this function
|
||||
# Ensure that anything declared outside this function is set as `nonlocal`
|
||||
# so it is in scope
|
||||
nonlocal model, optimizer
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
# New Code #
|
||||
# And call it at the end with no arguments
|
||||
# Note: You could also refactor this outside of your training loop function
|
||||
inner_training_loop()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
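The decorator used in this script retries the wrapped function with a halved batch size whenever it hits an out-of-memory error, which is why everything that depends on the batch size is built inside the inner loop. A minimal sketch of the same usage pattern (the toy model and data are assumptions):

import torch
from accelerate.memory_utils import find_executable_batch_size  # same import as the script above

@find_executable_batch_size(starting_batch_size=128)
def inner_training_loop(batch_size):
    # On a CUDA OOM the decorator halves batch_size and calls this function again,
    # so dataloaders, tensors, etc. that depend on it must be (re)built in here.
    data = torch.randn(batch_size, 16)
    model = torch.nn.Linear(16, 2)
    return model(data).sum()

inner_training_loop()  # called with no arguments, exactly as in the script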
215
examples/by_feature/multi_process_metrics.py
Normal file
@ -0,0 +1,215 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate,
|
||||
# specifically showcasing how to properly calculate the metrics on the
|
||||
# validation dataset when in a distributed system, and builds off the
|
||||
# `nlp_example.py` script.
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUs (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To help focus on the differences in the code, building `DataLoaders`
|
||||
# was refactored into its own function.
|
||||
# New additions from the base script can be found quickly by
|
||||
# looking for the # New Code # tags
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
samples_seen = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
# New Code #
|
||||
# First we check if it's a distributed system
|
||||
if accelerator.num_processes > 1:
|
||||
# Then see if we're on the last batch of our eval dataloader
|
||||
if step == len(eval_dataloader) - 1:
|
||||
# Last batch needs to be truncated on distributed systems as it contains additional samples
|
||||
predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
|
||||
references = references[: len(eval_dataloader.dataset) - samples_seen]
|
||||
else:
|
||||
# Otherwise we add the number of samples seen
|
||||
samples_seen += references.shape[0]
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
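The truncation logic in this script exists because, in a distributed run, the sampler pads the last batch so every process sees the same number of samples; those duplicated samples must be dropped before the metric is computed. A small single-process illustration of the bookkeeping (the sizes are invented):

import torch

dataset_size = 10
gathered_batches = [torch.arange(4), torch.arange(4), torch.arange(4)]  # 12 gathered rows for 10 real samples

samples_seen = 0
kept = []
for step, batch in enumerate(gathered_batches):
    if step == len(gathered_batches) - 1:
        # Last batch: keep only as many rows as are still missing to reach the true dataset size
        batch = batch[: dataset_size - samples_seen]
    else:
        samples_seen += batch.shape[0]
    kept.append(batch)

assert sum(b.shape[0] for b in kept) == dataset_size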
262
examples/by_feature/tracking.py
Normal file
@ -0,0 +1,262 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate,
|
||||
# specifically showcasing the experiment tracking capability,
|
||||
# and builds off the `nlp_example.py` script.
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUs (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To help focus on the differences in the code, building `DataLoaders`
|
||||
# was refactored into its own function.
|
||||
# New additions from the base script can be found quickly by
|
||||
# looking for the # New Code # tags
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize Accelerator
|
||||
|
||||
# New Code #
|
||||
# We pass in "all" to `log_with` to grab all available trackers in the environment
|
||||
# Note: If using a custom `Tracker` class, it should be passed in here, e.g.:
|
||||
# >>> log_with = ["all", MyCustomTrackerClassInstance()]
|
||||
if args.with_tracking:
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, log_with="all", logging_dir=args.logging_dir
|
||||
)
|
||||
else:
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
set_seed(seed)
|
||||
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# We need to initialize the trackers we use. Overall configurations can also be stored
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
# New Code #
|
||||
# For our tracking example, we will log the total loss of each epoch
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# New Code #
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True` (the default).
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
# It is slightly faster to call this once rather than multiple times
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
# New Code #
|
||||
# To actually log, we call `Accelerator.log`
|
||||
# The values passed can be of `str`, `int`, or `float`
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"accuracy": eval_metric["accuracy"],
|
||||
"f1": eval_metric["f1"],
|
||||
"train_loss": total_loss,
|
||||
"epoch": epoch,
|
||||
}
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# When a run is finished, you should call `accelerator.end_training()`
|
||||
# to close all of the open trackers
|
||||
if args.with_tracking:
|
||||
accelerator.end_training()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
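The tracking additions in this script reduce to three calls: `init_trackers` once before the loop, `log` with a dictionary of scalars every epoch, and `end_training` at the very end. A minimal sketch, assuming at least one supported tracker (for example TensorBoard) is installed in the environment:

from accelerate import Accelerator

accelerator = Accelerator(log_with="all", logging_dir="logs")  # same arguments as the script above
accelerator.init_trackers("my_run", config={"lr": 2e-5, "num_epochs": 3})

for epoch in range(3):
    # ... training and evaluation would go here ...
    accelerator.log({"accuracy": 0.0, "train_loss": 0.0, "epoch": epoch})

accelerator.end_training()  # flushes and closes every open tracker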
313
examples/complete_cv_example.py
Normal file
@ -0,0 +1,313 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
import PIL
|
||||
from accelerate import Accelerator
|
||||
from timm import create_model
|
||||
from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
#
|
||||
# This example trains a ResNet50 on the Oxford-IIIT Pet Dataset
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUs (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
# Function to get the label from the filename
|
||||
def extract_label(fname):
|
||||
stem = fname.split(os.path.sep)[-1]
|
||||
return re.search(r"^(.*)_\d+\.jpg$", stem).groups()[0]
|
||||
|
||||
|
||||
class PetsDataset(Dataset):
|
||||
def __init__(self, file_names, image_transform=None, label_to_id=None):
|
||||
self.file_names = file_names
|
||||
self.image_transform = image_transform
|
||||
self.label_to_id = label_to_id
|
||||
|
||||
def __len__(self):
|
||||
return len(self.file_names)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
fname = self.file_names[idx]
|
||||
raw_image = PIL.Image.open(fname)
|
||||
image = raw_image.convert("RGB")
|
||||
if self.image_transform is not None:
|
||||
image = self.image_transform(image)
|
||||
label = extract_label(fname)
|
||||
if self.label_to_id is not None:
|
||||
label = self.label_to_id[label]
|
||||
return {"image": image, "label": label}
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
if args.with_tracking:
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, log_with="all", logging_dir=args.logging_dir
|
||||
)
|
||||
else:
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
image_size = config["image_size"]
|
||||
if not isinstance(image_size, (list, tuple)):
|
||||
image_size = (image_size, image_size)
|
||||
|
||||
# Parse out whether we are saving every epoch or after a certain number of batches
|
||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||
if args.checkpointing_steps == "epoch":
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
elif args.checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(args.checkpointing_steps)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
|
||||
)
|
||||
else:
|
||||
checkpointing_steps = None
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
# Grab all the image filenames
|
||||
file_names = [os.path.join(args.data_dir, fname) for fname in os.listdir(args.data_dir) if fname.endswith(".jpg")]
|
||||
|
||||
# Build the label correspondences
|
||||
all_labels = [extract_label(fname) for fname in file_names]
|
||||
id_to_label = list(set(all_labels))
|
||||
id_to_label.sort()
|
||||
label_to_id = {lbl: i for i, lbl in enumerate(id_to_label)}
|
||||
|
||||
# Set the seed before splitting the data.
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
# Split our filenames between train and validation
|
||||
random_perm = np.random.permutation(len(file_names))
|
||||
cut = int(0.8 * len(file_names))
|
||||
train_split = random_perm[:cut]
|
||||
eval_split = random_perm[cut:]
|
||||
|
||||
# For training we use a simple RandomResizedCrop
|
||||
train_tfm = Compose([RandomResizedCrop(image_size, scale=(0.5, 1.0)), ToTensor()])
|
||||
train_dataset = PetsDataset(
|
||||
[file_names[i] for i in train_split], image_transform=train_tfm, label_to_id=label_to_id
|
||||
)
|
||||
|
||||
# For evaluation, we use a deterministic Resize
|
||||
eval_tfm = Compose([Resize(image_size), ToTensor()])
|
||||
eval_dataset = PetsDataset([file_names[i] for i in eval_split], image_transform=eval_tfm, label_to_id=label_to_id)
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
|
||||
eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size, num_workers=4)
|
||||
|
||||
# Instantiate the model (we build the model here so that the seed also controls new weights initialization)
|
||||
model = create_model("resnet50d", pretrained=True, num_classes=len(label_to_id))
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Freezing the base model
|
||||
for param in model.parameters():
|
||||
param.requires_grad = False
|
||||
for param in model.get_classifier().parameters():
|
||||
param.requires_grad = True
|
||||
|
||||
# We normalize the batches of images to be a bit faster.
|
||||
mean = torch.tensor(model.default_cfg["mean"])[None, :, None, None].to(accelerator.device)
|
||||
std = torch.tensor(model.default_cfg["std"])[None, :, None, None].to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr / 25)
|
||||
|
||||
# Instantiate learning rate scheduler
|
||||
lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader))
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
# We need to keep track of how many total steps we have iterated over
|
||||
overall_step = 0
|
||||
# We also need to keep track of the starting epoch so files are named properly
|
||||
starting_epoch = 0
|
||||
|
||||
# Potentially load in the weights and states from a previous save
|
||||
if args.resume_from_checkpoint:
|
||||
if args.resume_from_checkpoint is not None and args.resume_from_checkpoint != "":
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
accelerator.load_state(args.resume_from_checkpoint)
|
||||
path = os.path.basename(args.resume_from_checkpoint)
|
||||
else:
|
||||
# Get the most recent checkpoint
|
||||
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||
dirs.sort(key=os.path.getctime)
|
||||
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||
# Extract `epoch_{i}` or `step_{i}`
|
||||
training_difference = os.path.splitext(path)[0]
|
||||
|
||||
if "epoch" in training_difference:
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(starting_epoch, num_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
overall_step += 1
|
||||
continue
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch = {k: v.to(accelerator.device) for k, v in batch.items()}
|
||||
inputs = (batch["image"] - mean) / std
|
||||
outputs = model(inputs)
|
||||
loss = torch.nn.functional.cross_entropy(outputs, batch["label"])
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
overall_step += 1
|
||||
if isinstance(checkpointing_steps, int):
|
||||
output_dir = f"step_{overall_step}"
|
||||
if overall_step % checkpointing_steps == 0:
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
model.eval()
|
||||
accurate = 0
|
||||
num_elems = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch = {k: v.to(accelerator.device) for k, v in batch.items()}
|
||||
inputs = (batch["image"] - mean) / std
|
||||
with torch.no_grad():
|
||||
outputs = model(inputs)
|
||||
predictions = outputs.argmax(dim=-1)
|
||||
accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["label"])
|
||||
num_elems += accurate_preds.shape[0]
|
||||
accurate += accurate_preds.long().sum()
|
||||
|
||||
eval_metric = accurate.item() / num_elems
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{"accuracy": 100 * eval_metric, "total_loss": total_loss, "epoch": epoch}, step=overall_step
|
||||
)
|
||||
if checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.end_training()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument("--data_dir", required=True, help="The data folder on disk.")
|
||||
parser.add_argument("--fp16", action="store_true", help="If passed, will use FP16 training.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default=".",
|
||||
help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
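The resume logic above reconstructs the starting epoch and the in-epoch step offset purely from the checkpoint folder name ("epoch_{i}" or "step_{i}"). A standalone sketch of that bookkeeping, with an illustrative helper name and folder path:

import os

def parse_resume_point(checkpoint_path, steps_per_epoch):
    # Folder names follow the convention used above: "epoch_{i}" or "step_{i}".
    name = os.path.splitext(os.path.basename(checkpoint_path))[0]
    if "epoch" in name:
        # Resume at the start of the next epoch; no step offset needed.
        return int(name.replace("epoch_", "")) + 1, None
    resume_step = int(name.replace("step_", ""))
    starting_epoch = resume_step // steps_per_epoch
    return starting_epoch, resume_step - starting_epoch * steps_per_epoch

# parse_resume_point("outputs/step_750", steps_per_epoch=500) -> (1, 250)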
examples/complete_nlp_example.py (new file, 301 lines)
@@ -0,0 +1,301 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# This example also demonstrates the checkpointing and sharding capabilities
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
if args.with_tracking:
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, log_with="all", logging_dir=args.logging_dir
|
||||
)
|
||||
else:
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
|
||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||
if args.checkpointing_steps == "epoch":
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
elif args.checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(args.checkpointing_steps)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
|
||||
)
|
||||
else:
|
||||
checkpointing_steps = None
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
set_seed(seed)
|
||||
|
||||
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# We need to keep track of how many total steps we have iterated over
|
||||
overall_step = 0
|
||||
# We also need to keep track of the starting epoch so files are named properly
|
||||
starting_epoch = 0
|
||||
|
||||
# Potentially load in the weights and states from a previous save
|
||||
if args.resume_from_checkpoint:
|
||||
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
accelerator.load_state(args.resume_from_checkpoint)
|
||||
path = os.path.basename(args.resume_from_checkpoint)
|
||||
else:
|
||||
# Get the most recent checkpoint
|
||||
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||
dirs.sort(key=os.path.getctime)
|
||||
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||
# Extract `epoch_{i}` or `step_{i}`
|
||||
training_difference = os.path.splitext(path)[0]
|
||||
|
||||
if "epoch" in training_difference:
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(starting_epoch, num_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
overall_step += 1
|
||||
continue
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
overall_step += 1
|
||||
|
||||
if isinstance(checkpointing_steps, int):
|
||||
output_dir = f"step_{overall_step}"
|
||||
if overall_step % checkpointing_steps == 0:
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
# It is slightly faster to call this once than multiple times
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"accuracy": eval_metric["accuracy"],
|
||||
"f1": eval_metric["f1"],
|
||||
"train_loss": total_loss,
|
||||
"epoch": epoch,
|
||||
}
|
||||
)
|
||||
|
||||
if checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.end_training()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default=".",
|
||||
help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
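The training loop above folds gradient accumulation into the regular step count: the loss is divided by `gradient_accumulation_steps` and the optimizer only steps every that many batches. A condensed sketch of just this pattern, assuming `model`, `optimizer`, `lr_scheduler`, `train_dataloader` and `accelerator` have already been prepared:

gradient_accumulation_steps = 4  # illustrative value

for step, batch in enumerate(train_dataloader):
    outputs = model(**batch)
    # Scale the loss so the accumulated gradient matches a full-batch update.
    loss = outputs.loss / gradient_accumulation_steps
    accelerator.backward(loss)
    if step % gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()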
@ -73,7 +73,7 @@ class PetsDataset(Dataset):
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(fp16=args.fp16, cpu=args.cpu)
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
@ -139,17 +139,16 @@ def training_function(config, args):
|
||||
# Instantiate optimizer
|
||||
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr / 25)
|
||||
|
||||
# Instantiate learning rate scheduler
|
||||
lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader))
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Instantiate learning rate scheduler after preparing the training dataloader as the prepare method
|
||||
# may change its length.
|
||||
lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader))
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
@ -167,7 +166,7 @@ def training_function(config, args):
|
||||
model.eval()
|
||||
accurate = 0
|
||||
num_elems = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
for _, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch = {k: v.to(accelerator.device) for k, v in batch.items()}
|
||||
inputs = (batch["image"] - mean) / std
|
||||
@ -186,7 +185,21 @@ def training_function(config, args):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument("--data_dir", required=True, help="The data folder on disk.")
|
||||
parser.add_argument("--fp16", action="store_true", help="If passed, will use FP16 training.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 3e-2, "num_epochs": 3, "seed": 42, "batch_size": 64, "image_size": 224}
|
||||
|
||||
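The hunk above reorders scheduler creation: `OneCycleLR` is now built before `accelerator.prepare` and passed through it together with the model, optimizer and dataloaders. A minimal sketch of the new ordering, assuming those objects already exist:

from torch.optim.lr_scheduler import OneCycleLR

lr_scheduler = OneCycleLR(
    optimizer=optimizer, max_lr=lr, epochs=num_epochs, steps_per_epoch=len(train_dataloader)
)
# The scheduler goes through `prepare` like everything else, so Accelerate can step it
# in sync with the optimizer.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)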
@ -49,20 +49,19 @@ MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(fp16=args.fp16, cpu=args.cpu)
|
||||
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
@ -78,13 +77,7 @@ def training_function(config, args):
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets.rename_column_("label", "labels")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
@ -100,8 +93,29 @@ def training_function(config, args):
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
set_seed(seed)
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also control new weights initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
@ -113,19 +127,18 @@ def training_function(config, args):
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader
|
||||
)
|
||||
|
||||
# Instantiate learning rate scheduler after preparing the training dataloader as the prepare method
|
||||
# may change its length.
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=len(train_dataloader) * num_epochs,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
@ -150,9 +163,10 @@ def training_function(config, args):
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=accelerator.gather(predictions),
|
||||
references=accelerator.gather(batch["labels"]),
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
@ -162,8 +176,16 @@ def training_function(config, args):
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument("--fp16", type=bool, default=False, help="If passed, will use FP16 training.")
|
||||
parser.add_argument("--cpu", type=bool, default=False, help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
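The evaluation hunk above replaces two separate `gather` calls with a single call on a tuple, which is slightly faster. A sketch of the full gathered-metric loop, assuming a prepared `model`, `eval_dataloader`, an `accelerator`, and a `datasets` metric object:

import torch

model.eval()
for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)
    predictions = outputs.logits.argmax(dim=-1)
    # One gather over a tuple instead of one gather per tensor.
    predictions, references = accelerator.gather((predictions, batch["labels"]))
    metric.add_batch(predictions=predictions, references=references)

eval_metric = metric.compute()
accelerator.print(f"eval: {eval_metric}")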
setup.py (30 changed lines)
@@ -16,20 +16,19 @@ from setuptools import setup
|
||||
from setuptools import find_packages
|
||||
|
||||
extras = {}
|
||||
extras["quality"] = ["black == 21.4b0", "isort >= 5.5.4", "flake8 >= 3.8.3"]
|
||||
extras["docs"] = [
|
||||
"docutils==0.16.0",
|
||||
"recommonmark",
|
||||
"sphinx==3.2.1",
|
||||
"sphinx-markdown-tables",
|
||||
"sphinx-rtd-theme==0.4.3",
|
||||
"sphinx-copybutton",
|
||||
"sphinxext-opengraph==0.4.1",
|
||||
]
|
||||
extras["quality"] = ["black ~= 22.0", "isort >= 5.5.4", "flake8 >= 3.8.3"]
|
||||
extras["docs"] = []
|
||||
extras["test"] = [
|
||||
"pytest",
|
||||
"pytest-xdist",
|
||||
"pytest-subtests",
|
||||
"datasets",
|
||||
"transformers",
|
||||
"scipy",
|
||||
"sklearn"
|
||||
]
|
||||
extras["test_trackers"] = ["wandb", "comet-ml", "tensorflow"]
|
||||
extras["dev"] = extras["quality"] + extras["test"]
|
||||
|
||||
extras["sagemaker"] = [
|
||||
"sagemaker", # boto3 is a required package in sagemaker
|
||||
@ -37,7 +36,7 @@ extras["sagemaker"] = [
|
||||
|
||||
setup(
|
||||
name="accelerate",
|
||||
version="0.4.0.dev0",
|
||||
version="0.7.1",
|
||||
description="Accelerate",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
@ -56,7 +55,7 @@ setup(
|
||||
]
|
||||
},
|
||||
python_requires=">=3.6.0",
|
||||
install_requires=["torch>=1.4.0", "pyyaml"],
|
||||
install_requires=["torch>=1.4.0", "pyyaml", "numpy>=1.17"],
|
||||
extras_require=extras,
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
@ -73,7 +72,7 @@ setup(
|
||||
)
|
||||
|
||||
# Release checklist
|
||||
# 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
|
||||
# 1. Change the version in __init__.py and setup.py.
|
||||
# 2. Commit these changes with the message: "Release: VERSION"
|
||||
# 3. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
|
||||
# Push the tag to git: git push --tags origin main
|
||||
@ -85,8 +84,9 @@ setup(
|
||||
# twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
|
||||
# 6. Check that you can install it in a virtualenv by running:
|
||||
# pip install -i https://testpypi.python.org/pypi accelerate
|
||||
# accelerate env
|
||||
# accelerate test
|
||||
# 7. Upload the final version to actual pypi:
|
||||
# twine upload dist/* -r pypi
|
||||
# 8. Add release notes to the tag in github once everything is looking hunky-dory.
|
||||
# 9. Add the release version to docs/source/_static/js/custom.js and .github/deploy_doc.sh
|
||||
# 10. Update the version in __init__.py, setup.py to the new version "-dev" and push to master
|
||||
# 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to master
|
||||
|
||||
@ -2,10 +2,10 @@
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
__version__ = "0.4.0.dev0"
|
||||
__version__ = "0.7.1"
|
||||
|
||||
from .accelerator import Accelerator
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs
|
||||
from .notebook_launcher import notebook_launcher
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs, InitProcessGroupKwargs
|
||||
from .launchers import debug_launcher, notebook_launcher
|
||||
from .state import DistributedType
|
||||
from .utils import DeepSpeedPlugin, synchronize_rng_states
|
||||
|
||||
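The new top-level exports above add `debug_launcher`, `notebook_launcher` and `InitProcessGroupKwargs`. A hedged usage sketch; the two-process count and the 30-minute timeout are illustrative values:

from datetime import timedelta

from accelerate import Accelerator, InitProcessGroupKwargs, notebook_launcher


def train():
    # Raise the process-group init timeout through the new kwargs handler.
    handler = InitProcessGroupKwargs(timeout=timedelta(seconds=1800))
    accelerator = Accelerator(kwargs_handlers=[handler])
    accelerator.print(f"running on {accelerator.num_processes} process(es)")


# From a notebook, spawn the function on several processes without `accelerate launch`.
notebook_launcher(train, args=(), num_processes=2)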
@ -14,6 +14,8 @@
|
||||
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from typing import List, Optional, Union
|
||||
|
||||
@ -21,17 +23,25 @@ import torch
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
|
||||
from .data_loader import prepare_data_loader
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs, KwargsHandler
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs, InitProcessGroupKwargs, KwargsHandler
|
||||
from .optimizer import AcceleratedOptimizer
|
||||
from .scheduler import AcceleratedScheduler
|
||||
from .state import AcceleratorState, DistributedType, is_deepspeed_available
|
||||
from .tracking import LOGGER_TYPE_TO_CLASS, GeneralTracker, filter_trackers
|
||||
from .utils import (
|
||||
DeepSpeedPlugin,
|
||||
FullyShardedDataParallelPlugin,
|
||||
LoggerType,
|
||||
PrecisionType,
|
||||
RNGType,
|
||||
convert_outputs_to_fp32,
|
||||
extract_model_from_parallel,
|
||||
gather,
|
||||
get_pretty_name,
|
||||
pad_across_processes,
|
||||
reduce,
|
||||
save,
|
||||
wait_for_everyone,
|
||||
)
|
||||
@ -39,6 +49,7 @@ from .utils import (
|
||||
|
||||
if is_deepspeed_available():
|
||||
import deepspeed
|
||||
|
||||
from .deepspeed_utils import DeepSpeedEngineWrapper, DeepSpeedOptimizerWrapper
|
||||
|
||||
import logging
|
||||
@ -52,44 +63,65 @@ class Accelerator:
|
||||
Creates an instance of an accelerator for distributed training (on multi-GPU, TPU) or mixed precision training.
|
||||
|
||||
Args:
|
||||
device_placement (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
device_placement (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the accelerator should put objects on device (tensors yielded by the dataloader, model,
|
||||
etc...).
|
||||
split_batches (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
|
||||
:obj:`True` the actual batch size used will be the same on any kind of distributed processes, but it must
|
||||
be a round multiple of the :obj:`num_processes` you are using. If :obj:`False`, actual batch size used will
|
||||
be the one set in your script multiplied by the number of processes.
|
||||
fp16 (:obj:`bool`, `optional`):
|
||||
Whether or not to use mixed precision training. Will default to the value in the environment variable
|
||||
:obj:`USE_FP16`, which will use the default value in the accelerate config of the current system or the
|
||||
flag passed with the :obj:`accelerate.launch` command.
|
||||
cpu (:obj:`bool`, `optional`):
|
||||
Whether or not to force the script to execute on CPU. Will ignore GPU available if set to :obj:`True` and
|
||||
force the execution on one process only.
|
||||
deepspeed_plugin (:obj:`DeepSpeedPlugin`, `optional`):
|
||||
`True` the actual batch size used will be the same on any kind of distributed processes, but it must be a
|
||||
round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set
|
||||
in your script multiplied by the number of processes.
|
||||
mixed_precision (`str`, *optional*):
|
||||
Whether or not to use mixed precision training (fp16 or bfloat16). Choose from 'no', 'fp16', 'bf16'. Will
|
||||
default to the value in the environment variable `MIXED_PRECISION`, which will use the default value in the
|
||||
accelerate config of the current system or the flag passed with the `accelerate.launch` command. 'fp16'
|
||||
requires pytorch 1.6 or higher. 'bf16' requires pytorch 1.10 or higher.
|
||||
cpu (`bool`, *optional*):
|
||||
Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force
|
||||
the execution on one process only.
|
||||
deepspeed_plugin (`DeepSpeedPlugin`, *optional*):
|
||||
Tweak your DeepSpeed related args using this argument. This argument is optional and can be configured
|
||||
directly using `accelerate config`
|
||||
rng_types (list of :obj:`str` or :class:`~accelerate.utils.RNGType`):
|
||||
directly using *accelerate config*
|
||||
fsdp_plugin (`FullyShardedDataParallelPlugin`, *optional*):
|
||||
Tweak your FSDP related args using this argument. This argument is optional and can be configured directly
|
||||
using *accelerate config*
|
||||
rng_types (list of `str` or [`~utils.RNGType`]):
|
||||
The list of random number generators to synchronize at the beginning of each iteration in your prepared
|
||||
dataloaders. Should be one or several of:
|
||||
|
||||
- :obj:`"torch"`: the base torch random number generator
|
||||
- :obj:`"cuda"`: the CUDA random number generator (GPU only)
|
||||
- :obj:`"xla"`: the XLA random number generator (TPU only)
|
||||
- :obj:`"generator"`: the :obj:`torch.Generator` of the sampler (or batch sampler if there is no sampler in
|
||||
your dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.
|
||||
- `"torch"`: the base torch random number generator
|
||||
- `"cuda"`: the CUDA random number generator (GPU only)
|
||||
- `"xla"`: the XLA random number generator (TPU only)
|
||||
- `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your
|
||||
dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.
|
||||
|
||||
Will default to :obj:`["torch"]` for PyTorch versions <=1.5.1 and :obj:`["generator"]` for PyTorch versions
|
||||
>= 1.6.
|
||||
kwargs_handlers (list of kwargs handlers, `optional`)
|
||||
A list of :obj:`KwargHandler` to customize how the objects related to distributed training or mixed
|
||||
precision are created. See :doc:`kwargs` for more information.
|
||||
Will default to `["torch"]` for PyTorch versions <=1.5.1 and `["generator"]` for PyTorch versions >= 1.6.
|
||||
log_with (list of `str`, [`~utils.LoggerType`] or [`~tracking.GeneralTracker`], *optional*):
|
||||
A list of loggers to be setup for experiment tracking. Should be one or several of:
|
||||
|
||||
- `"all"`
|
||||
- `"tensorboard"`
|
||||
- `"wandb"`
|
||||
- `"comet_ml"`
|
||||
If `"all`" is selected, will pick up all available trackers in the environment and intialize them. Can also
|
||||
accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
|
||||
logging_dir (`str`, `os.PathLike`, *optional*):
|
||||
A path to a directory for storing logs of locally-compatible loggers.
|
||||
dispatch_batches (`bool`, *optional*):
|
||||
If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
|
||||
and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
|
||||
underlying dataset is an `IterableDataset`, `False` otherwise.
|
||||
step_scheduler_with_optimizer (`bool`, *optional*, defaults to `True`):
|
||||
Set `True` if the learning rate scheduler is stepped at the same time as the optimizer, `False` if only
|
||||
done under certain circumstances (at the end of each epoch, for instance).
|
||||
kwargs_handlers (`List[KwargHandler]`, *optional*):
|
||||
A list of `KwargHandler` to customize how the objects related to distributed training or mixed precision
|
||||
are created. See [kwargs](kwargs) for more information.
|
||||
|
||||
Attributes
|
||||
|
||||
- **device** (:obj:`torch.device`) -- The device to use.
|
||||
- **state** (:class:`~accelerate.AcceleratorState`) -- The distributed setup state.
|
||||
- **device** (`torch.device`) -- The device to use.
|
||||
- **state** ([`~state.AcceleratorState`]) -- The distributed setup state.
|
||||
"""
|
||||
|
||||
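Pulling the new constructor arguments together, a minimal instantiation with mixed precision and experiment tracking could look like the sketch below; the project name, logging directory and config values are placeholders:

from accelerate import Accelerator

accelerator = Accelerator(
    mixed_precision="fp16",   # or "bf16" on PyTorch >= 1.10
    log_with="all",           # pick up every tracker available in the environment
    logging_dir="runs",       # where locally backed trackers (e.g. TensorBoard) write
)
accelerator.init_trackers("my_project", config={"lr": 3e-4})
accelerator.log({"loss": 0.5}, step=1)
accelerator.end_training()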
def __init__(
|
||||
@ -97,11 +129,31 @@ class Accelerator:
|
||||
device_placement: bool = True,
|
||||
split_batches: bool = False,
|
||||
fp16: bool = None,
|
||||
mixed_precision: Union[PrecisionType, str] = None,
|
||||
cpu: bool = False,
|
||||
deepspeed_plugin: DeepSpeedPlugin = None,
|
||||
fsdp_plugin: FullyShardedDataParallelPlugin = None,
|
||||
rng_types: Optional[List[Union[str, RNGType]]] = None,
|
||||
log_with: Optional[List[Union[str, LoggerType, GeneralTracker]]] = None,
|
||||
logging_dir: Optional[Union[str, os.PathLike]] = None,
|
||||
dispatch_batches: Optional[bool] = None,
|
||||
step_scheduler_with_optimizer: bool = True,
|
||||
kwargs_handlers: Optional[List[KwargsHandler]] = None,
|
||||
):
|
||||
self.logging_dir = logging_dir
|
||||
self.log_with = filter_trackers(log_with, self.logging_dir)
|
||||
|
||||
if mixed_precision is not None:
|
||||
mixed_precision = str(mixed_precision)
|
||||
if mixed_precision not in PrecisionType:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}"
|
||||
)
|
||||
|
||||
if fp16:
|
||||
warnings.warn('fp16=True is deprecated. Use mixed_precision="fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
if deepspeed_plugin is None: # init from env variables
|
||||
deepspeed_plugin = DeepSpeedPlugin() if os.environ.get("USE_DEEPSPEED", "false") == "true" else None
|
||||
else:
|
||||
@ -109,14 +161,19 @@ class Accelerator:
|
||||
deepspeed_plugin, DeepSpeedPlugin
|
||||
), "`deepspeed_plugin` must be a DeepSpeedPlugin object."
|
||||
|
||||
self.state = AcceleratorState(fp16=fp16, cpu=cpu, deepspeed_plugin=deepspeed_plugin, _from_accelerator=True)
|
||||
|
||||
self.device_placement = device_placement
|
||||
self.split_batches = split_batches
|
||||
if os.environ.get("USE_FSDP", "false") == "true":
|
||||
if version.parse(torch.__version__) < version.parse("1.12.0.dev20220418+cu113"):
|
||||
raise ValueError("FSDP requires PyTorch >= 1.12.0.dev20220418+cu113")
|
||||
if fsdp_plugin is None: # init from env variables
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
else:
|
||||
if not isinstance(fsdp_plugin, FullyShardedDataParallelPlugin):
|
||||
raise TypeError("`fsdp_plugin` must be a FullyShardedDataParallelPlugin object.")
|
||||
|
||||
# Kwargs handlers
|
||||
self.ddp_handler = None
|
||||
self.scaler_handler = None
|
||||
self.init_handler = None
|
||||
if kwargs_handlers is not None:
|
||||
for handler in kwargs_handlers:
|
||||
assert isinstance(handler, KwargsHandler), f"Unsupported kwargs handler passed: {handler}."
|
||||
@ -130,21 +187,58 @@ class Accelerator:
|
||||
raise ValueError("You can only pass one `GradScalerKwargs` in `kwargs_handler`.")
|
||||
else:
|
||||
self.scaler_handler = handler
|
||||
elif isinstance(handler, InitProcessGroupKwargs):
|
||||
if self.init_handler is not None:
|
||||
raise ValueError("You can only pass one `InitProcessGroupKwargs` in `kwargs_handler`.")
|
||||
else:
|
||||
self.init_handler = handler
|
||||
|
||||
kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
|
||||
self.state = AcceleratorState(
|
||||
mixed_precision=mixed_precision,
|
||||
cpu=cpu,
|
||||
deepspeed_plugin=deepspeed_plugin,
|
||||
fsdp_plugin=fsdp_plugin,
|
||||
_from_accelerator=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.device_placement = device_placement
|
||||
self.split_batches = split_batches
|
||||
self.dispatch_batches = dispatch_batches
|
||||
if dispatch_batches is True and version.parse(torch.__version__) < version.parse("1.8.0"):
|
||||
raise ImportError(
|
||||
"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
|
||||
)
|
||||
self.step_scheduler_with_optimizer = step_scheduler_with_optimizer
|
||||
|
||||
# Mixed precision attributes
|
||||
self.scaler = None
|
||||
self.native_amp = False
|
||||
if self.state.use_fp16:
|
||||
if self.state.mixed_precision == "fp16":
|
||||
self.native_amp = version.parse(torch.__version__) >= version.parse("1.6")
|
||||
if version.parse(torch.__version__) < version.parse("1.6"):
|
||||
raise ValueError("fp16 mixed precision requires PyTorch >= 1.6")
|
||||
|
||||
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
elif self.state.mixed_precision == "bf16":
|
||||
self.native_amp = version.parse(torch.__version__) >= version.parse("1.10")
|
||||
if mixed_precision == "bf16" and version.parse(torch.__version__) < version.parse("1.10"):
|
||||
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10")
|
||||
|
||||
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
|
||||
# Internal references to the training objects
|
||||
self._optimizers = []
|
||||
self._models = []
|
||||
self._schedulers = []
|
||||
self._custom_objects = []
|
||||
|
||||
# RNG Types
|
||||
if rng_types is None:
|
||||
self.rng_types = rng_types
|
||||
if self.rng_types is None:
|
||||
self.rng_types = ["torch"] if version.parse(torch.__version__) <= version.parse("1.5.1") else ["generator"]
|
||||
|
||||
@property
|
||||
@ -179,11 +273,21 @@ class Accelerator:
|
||||
|
||||
@property
|
||||
def use_fp16(self):
|
||||
return self.mixed_precision != "no"
|
||||
|
||||
@property
|
||||
def mixed_precision(self):
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
use_fp16 = self.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"]
|
||||
config = self.state.deepspeed_plugin.deepspeed_config
|
||||
if config.get("fp16", {}).get("enabled", False):
|
||||
mixed_precision = "fp16"
|
||||
elif config.get("bf16", {}).get("enabled", False):
|
||||
mixed_precision = "bf16"
|
||||
else:
|
||||
mixed_precision = "no"
|
||||
else:
|
||||
use_fp16 = self.state.use_fp16
|
||||
return use_fp16
|
||||
mixed_precision = self.state.mixed_precision
|
||||
return mixed_precision
|
||||
|
||||
@contextmanager
|
||||
def local_main_process_first(self):
|
||||
@ -214,35 +318,98 @@ class Accelerator:
|
||||
|
||||
def print(self, *args, **kwargs):
|
||||
"""
|
||||
Use in replacement of :obj:`print()` to only print once per server.
|
||||
Use in replacement of `print()` to only print once per server.
|
||||
"""
|
||||
if self.is_local_main_process:
|
||||
print(*args, **kwargs)
|
||||
|
||||
def _prepare_one(self, obj):
|
||||
if isinstance(obj, torch.utils.data.DataLoader):
|
||||
def _prepare_one(self, obj, first_pass=False):
|
||||
# First pass of preparation: DataLoader, model, optimizer
|
||||
if isinstance(obj, torch.utils.data.DataLoader) and first_pass:
|
||||
return self.prepare_data_loader(obj)
|
||||
elif isinstance(obj, torch.nn.Module):
|
||||
elif isinstance(obj, torch.nn.Module) and first_pass:
|
||||
self._models.append(obj)
|
||||
return self.prepare_model(obj)
|
||||
elif isinstance(obj, torch.optim.Optimizer):
|
||||
elif isinstance(obj, torch.optim.Optimizer) and first_pass:
|
||||
optimizer = self.prepare_optimizer(obj)
|
||||
self._optimizers.append(optimizer)
|
||||
return optimizer
|
||||
# Second pass of preparation: LR scheduler (which need the full list of optimizers)
|
||||
elif isinstance(obj, torch.optim.lr_scheduler._LRScheduler) and not first_pass:
|
||||
scheduler = self.prepare_scheduler(obj)
|
||||
self._schedulers.append(scheduler)
|
||||
return scheduler
|
||||
else:
|
||||
return obj
|
||||
|
||||
def _prepare_fsdp(self, *args):
|
||||
result = []
|
||||
for obj in args:
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
model = obj
|
||||
break
|
||||
optimizers = []
|
||||
|
||||
self._schedulers = []
|
||||
self._models = []
|
||||
intermediate_result = []
|
||||
for obj in args:
|
||||
if isinstance(obj, torch.optim.Optimizer):
|
||||
if len(obj.param_groups) > 1:
|
||||
logger.warn(
|
||||
"FSDP Warning: When using FSDP, several parameter groups will be conflated into "
|
||||
"a single one due to nested module wrapping and parameter flattening."
|
||||
)
|
||||
optimizer = obj.optimizer.__class__(model.parameters(), **obj.optimizer.defaults)
|
||||
obj = self.prepare_optimizer(optimizer)
|
||||
optimizers.append(obj)
|
||||
elif isinstance(obj, torch.nn.Module):
|
||||
self._models.append(obj)
|
||||
intermediate_result.append(obj)
|
||||
|
||||
for obj in intermediate_result:
|
||||
if isinstance(obj, AcceleratedScheduler):
|
||||
obj.optimizer = optimizers
|
||||
for i, opt in enumerate(self._optimizers):
|
||||
if getattr(obj.scheduler, "optimizer", None) == opt.optimizer:
|
||||
obj.scheduler.optimizer = optimizers[i]
|
||||
obj.optimizers = [optimizers[i]]
|
||||
break
|
||||
self._schedulers.append(obj)
|
||||
result.append(obj)
|
||||
self._optimizers = optimizers
|
||||
return tuple(result)
|
||||
|
||||
def prepare(self, *args):
|
||||
"""
|
||||
Prepare all objects passed in :obj:`args` for distributed training and mixed precision, then return them in the
|
||||
same order.
|
||||
Prepare all objects passed in `args` for distributed training and mixed precision, then return them in the same
|
||||
order.
|
||||
|
||||
Accepts the following type of objects:
|
||||
|
||||
- :obj:`torch.utils.data.DataLoader`: PyTorch Dataloader
|
||||
- :obj:`torch.nn.Module`: PyTorch Module
|
||||
- :obj:`torch.optim.Optimizer`: PyTorch Optimizer
|
||||
- `torch.utils.data.DataLoader`: PyTorch Dataloader
|
||||
- `torch.nn.Module`: PyTorch Module
|
||||
- `torch.optim.Optimizer`: PyTorch Optimizer
|
||||
"""
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
model_count = 0
|
||||
optimizer_present = False
|
||||
for obj in args:
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
model_count += 1
|
||||
if isinstance(obj, torch.optim.Optimizer):
|
||||
optimizer_present = True
|
||||
if model_count > 1 and optimizer_present:
|
||||
raise ValueError(
|
||||
"For FSDP to work with multiple models (>1), "
|
||||
"prepare must be called for all the models before optimizers are created"
|
||||
)
|
||||
elif model_count == 1 and optimizer_present:
|
||||
logger.warn(
|
||||
"FSDP Warning: When using FSDP, "
|
||||
"it is efficient and recommended to call prepare for the model before creating the optimizer"
|
||||
)
|
||||
|
||||
# On TPUs, putting the model on the XLA device will create new parameters, so the corresponding optimizer will
|
||||
# have parameters disconnected from the model (so no training :-( ).
|
||||
# If the model and optimizer have parameters on different devices we raise an error.
|
||||
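`prepare` now runs in two passes: dataloaders, models and optimizers first, then schedulers, so each scheduler can be re-attached to its already-prepared optimizer. By default (`step_scheduler_with_optimizer=True`) the wrapped scheduler steps whenever the optimizer does; the sketch below opts out and steps it once per epoch instead, assuming the usual training objects exist:

accelerator = Accelerator(step_scheduler_with_optimizer=False)
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

for epoch in range(num_epochs):
    for batch in train_dataloader:
        loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
    # With step_scheduler_with_optimizer=False the scheduler only moves when called explicitly.
    lr_scheduler.step()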
@ -266,7 +433,8 @@ class Accelerator:
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
result = self._prepare_deepspeed(*args)
|
||||
else:
|
||||
result = tuple(self._prepare_one(obj) for obj in args)
|
||||
result = tuple(self._prepare_one(obj, first_pass=True) for obj in args)
|
||||
result = tuple(self._prepare_one(obj) for obj in result)
|
||||
|
||||
if tpu_should_fix_optimizer:
|
||||
# 2. grabbing new model parameters
|
||||
@ -278,24 +446,43 @@ class Accelerator:
|
||||
if isinstance(obj, torch.optim.Optimizer):
|
||||
obj._switch_parameters(mapping)
|
||||
|
||||
if self.distributed_type == DistributedType.FSDP and model_count == 1 and optimizer_present:
|
||||
result = self._prepare_fsdp(*result)
|
||||
|
||||
return result if len(result) > 1 else result[0]
|
||||
|
||||
def prepare_model(self, model):
|
||||
if self.device_placement:
|
||||
if self.device_placement and self.distributed_type != DistributedType.FSDP:
|
||||
model = model.to(self.device)
|
||||
if self.distributed_type == DistributedType.MULTI_GPU:
|
||||
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model,
|
||||
device_ids=[self.local_process_index],
|
||||
output_device=self.local_process_index,
|
||||
**kwargs,
|
||||
model, device_ids=[self.local_process_index], output_device=self.local_process_index, **kwargs
|
||||
)
|
||||
elif self.distributed_type == DistributedType.FSDP:
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
|
||||
fsdp_plugin = self.state.fsdp_plugin
|
||||
model = FSDP(
|
||||
model,
|
||||
sharding_strategy=fsdp_plugin.sharding_strategy,
|
||||
cpu_offload=fsdp_plugin.cpu_offload,
|
||||
auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
|
||||
backward_prefetch=fsdp_plugin.backward_prefetch,
|
||||
ignored_modules=fsdp_plugin.ignored_modules,
|
||||
)
|
||||
if not fsdp_plugin.cpu_offload.offload_params:
|
||||
model.to(self.device)
|
||||
elif self.distributed_type == DistributedType.MULTI_CPU:
|
||||
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
|
||||
if self.native_amp:
|
||||
model.forward = torch.cuda.amp.autocast()(model.forward)
|
||||
if self.mixed_precision == "fp16" and version.parse(torch.__version__) >= version.parse("1.10"):
|
||||
model.forward = torch.cuda.amp.autocast(dtype=torch.float16)(model.forward)
|
||||
elif self.mixed_precision == "bf16":
|
||||
model.forward = torch.cuda.amp.autocast(dtype=torch.bfloat16)(model.forward)
|
||||
else:
|
||||
model.forward = torch.cuda.amp.autocast()(model.forward)
|
||||
model.forward = convert_outputs_to_fp32(model.forward)
|
||||
return model
|
||||
|
||||
@ -321,7 +508,10 @@ class Accelerator:
|
||||
batch_size_per_device * deepspeed_plugin.gradient_accumulation_steps * self.num_processes
|
||||
)
|
||||
|
||||
result = [self._prepare_one(obj) if isinstance(obj, torch.utils.data.DataLoader) else obj for obj in args]
|
||||
result = [
|
||||
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
|
||||
for obj in args
|
||||
]
|
||||
|
||||
model = None
|
||||
optimizer = None
|
||||
@ -336,8 +526,12 @@ class Accelerator:
|
||||
is_adamw = isinstance(optimizer, torch.optim.AdamW)
|
||||
if (is_adam or is_adamw) and deepspeed_plugin.offload_optimizer_device == "cpu":
|
||||
defaults = optimizer.defaults
|
||||
params = []
|
||||
for group in optimizer.param_groups:
|
||||
params.extend(group["params"])
|
||||
|
||||
optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam(
|
||||
model.parameters(),
|
||||
params,
|
||||
lr=defaults["lr"],
|
||||
bias_correction=True,
|
||||
betas=defaults["betas"],
|
||||
@ -384,14 +578,30 @@ class Accelerator:
|
||||
split_batches=self.split_batches,
|
||||
put_on_device=self.device_placement,
|
||||
rng_types=self.rng_types.copy(),
|
||||
dispatch_batches=self.dispatch_batches,
|
||||
)
|
||||
|
||||
def prepare_optimizer(self, optimizer):
|
||||
return AcceleratedOptimizer(optimizer, device_placement=self.device_placement, scaler=self.scaler)
|
||||
|
||||
def prepare_scheduler(self, scheduler):
|
||||
# We try to find the optimizer associated with `scheduler`, the default is the full list.
|
||||
optimizer = self._optimizers
|
||||
for opt in self._optimizers:
|
||||
if getattr(scheduler, "optimizer", None) == opt.optimizer:
|
||||
optimizer = opt
|
||||
break
|
||||
|
||||
return AcceleratedScheduler(
|
||||
scheduler,
|
||||
optimizer,
|
||||
step_with_optimizer=self.step_scheduler_with_optimizer,
|
||||
split_batches=self.split_batches,
|
||||
)
|
||||
|
||||
def backward(self, loss, **kwargs):
|
||||
"""
|
||||
Use :obj:`accelerator.backward(loss)` in lieu of :obj:`loss.backward()`.
|
||||
Use `accelerator.backward(loss)` in lieu of `loss.backward()`.
|
||||
"""
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
self.deepspeed_engine.backward(loss, **kwargs)
|
||||
@ -405,11 +615,11 @@ class Accelerator:
|
||||
Unscale the gradients in mixed precision training with AMP. This is a noop in all other settings.
|
||||
|
||||
Args:
|
||||
optimizer (:obj:`torch.optim.Optimizer` or :obj:`List[torch.optim.Optimizer]`, `optional`):
|
||||
optimizer (`torch.optim.Optimizer` or `List[torch.optim.Optimizer]`, *optional*):
|
||||
The optimizer(s) for which to unscale gradients. If not set, will unscale gradients on all optimizers
|
||||
that were passed to :meth:`~accelerate.Accelerator.prepare`.
|
||||
that were passed to [`~Accelerator.prepare`].
|
||||
"""
|
||||
if self.state.use_fp16 and self.native_amp:
|
||||
if self.use_fp16 and self.native_amp:
|
||||
if optimizer is None:
|
||||
# TODO: this unscales all optimizers where we should only unscale the one where parameters are.
|
||||
optimizer = self._optimizers
|
||||
@ -422,61 +632,78 @@ class Accelerator:
|
||||
|
||||
def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
|
||||
"""
|
||||
Should be used in place of :func:`torch.nn.utils.clip_grad_norm_`.
|
||||
Should be used in place of `torch.nn.utils.clip_grad_norm_`.
|
||||
"""
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
parameters = [p for p in parameters]
|
||||
for model in self._models:
|
||||
if parameters == [p for p in model.parameters()]:
|
||||
model.clip_grad_norm_(max_norm, norm_type)
|
||||
return
|
||||
self.unscale_gradients()
|
||||
torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
|
||||
|
||||
def clip_grad_value_(self, parameters, clip_value):
|
||||
"""
|
||||
Should be used in place of :func:`torch.nn.utils.clip_grad_value_`.
|
||||
Should be used in place of `torch.nn.utils.clip_grad_value_`.
|
||||
"""
|
||||
self.unscale_gradients()
|
||||
torch.nn.utils.clip_grad_value_(parameters, clip_value)
|
||||
|
||||
def gather(self, tensor):
|
||||
"""
|
||||
Gather the values in `tensor` accross all processes and concatenate them on the first dimension. Useful to
|
||||
Gather the values in *tensor* across all processes and concatenate them on the first dimension. Useful to
|
||||
regroup the predictions from all processes when doing evaluation.
|
||||
|
||||
Note:
|
||||
This gather happens in all processes.
|
||||
|
||||
Args:
|
||||
tensor (:obj:`torch.Tensor`, or a nested tuple/list/dictionary of :obj:`torch.Tensor`):
|
||||
tensor (`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`):
|
||||
The tensors to gather across all processes.
|
||||
|
||||
Returns:
|
||||
:obj:`torch.Tensor`, or a nested tuple/list/dictionary of :obj:`torch.Tensor`: The gathered tensor(s). Note
|
||||
that the first dimension of the result is `num_processes` multiplied by the first dimension of the input
|
||||
tensors.
|
||||
`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`: The gathered tensor(s). Note that the
|
||||
first dimension of the result is *num_processes* multiplied by the first dimension of the input tensors.
|
||||
"""
|
||||
return gather(tensor)
|
||||
|
||||
def reduce(self, tensor: torch.Tensor, reduction="sum"):
|
||||
"""
|
||||
Reduce the values in *tensor* across all processes based on *reduction*.
|
||||
|
||||
Args:
|
||||
tensor (`torch.Tensor`):
|
||||
The tensors to reduce across all processes.
|
||||
reduction (`str`, *optional*, defaults to "sum"):
|
||||
A reduction type, can be one of 'sum', 'mean', or 'none'. If 'none', will not perform any operation.
|
||||
"""
|
||||
reduce(tensor, reduction)
|
||||
|
||||
def pad_across_processes(self, tensor, dim=0, pad_index=0, pad_first=False):
|
||||
"""
|
||||
Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so
|
||||
they can safely be gathered.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of :obj:`torch.Tensor`):
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to gather.
|
||||
dim (:obj:`int`, `optional`, defaults to 0):
|
||||
dim (`int`, *optional*, defaults to 0):
|
||||
The dimension on which to pad.
|
||||
pad_index (:obj:`int`, `optional`, defaults to 0):
|
||||
pad_index (`int`, *optional*, defaults to 0):
|
||||
The value with which to pad.
|
||||
pad_first (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
pad_first (`bool`, *optional*, defaults to `False`):
|
||||
Whether to pad at the beginning or the end.
|
||||
"""
|
||||
return pad_across_processes(tensor, dim=dim, pad_index=pad_index, pad_first=pad_first)
|
||||
|
||||
def unwrap_model(self, model):
|
||||
"""
|
||||
Unwraps the :obj:`model` from the additional layer possible added by :meth:`~accelerate.Accelerator.prepare`.
|
||||
Useful before saving the model.
|
||||
Unwraps the `model` from the additional layer possibly added by [`~Accelerator.prepare`]. Useful before saving
|
||||
the model.
|
||||
|
||||
Args:
|
||||
model (:obj:`torch.nn.Module`):
|
||||
model (`torch.nn.Module`):
|
||||
The model to unwrap.
|
||||
"""
|
||||
return extract_model_from_parallel(model)
|
||||
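`gather` concatenates tensors on the first dimension, so every process has to contribute tensors of the same shape; variable-length outputs (e.g. token-level predictions) are therefore padded first with `pad_across_processes`. An illustrative evaluation snippet combining the two (the pad index and the sequence dimension are assumptions; use your tokenizer's pad token id):

import torch

all_predictions = []
for batch in eval_dataloader:
    with torch.no_grad():
        predictions = model(**batch).logits.argmax(dim=-1)
    # Pad on the sequence dimension so every process holds tensors of the same shape...
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=0)
    # ...then concatenate across processes on the first dimension.
    all_predictions.append(accelerator.gather(predictions).cpu())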
@ -488,28 +715,131 @@ class Accelerator:
|
||||
"""
|
||||
wait_for_everyone()
|
||||
|
||||
def init_trackers(self, project_name: str, config: Optional[dict] = None):
|
||||
"""
|
||||
Initializes a run for all trackers stored in `self.log_with`, potentially with starting configurations
|
||||
|
||||
Args:
|
||||
project_name (`str`):
|
||||
The name of the project. All trackers will save their data based on this
|
||||
config (`dict`, *optional*):
|
||||
Optional starting configuration to be logged.
|
||||
"""
|
||||
self.trackers = []
|
||||
for tracker in self.log_with:
|
||||
if issubclass(type(tracker), GeneralTracker):
|
||||
# Custom trackers are already initialized
|
||||
self.trackers.append(tracker)
|
||||
else:
|
||||
tracker_init = LOGGER_TYPE_TO_CLASS[str(tracker)]
|
||||
if getattr(tracker_init, "requires_logging_directory"):
|
||||
# We can skip this check since it was done in `__init__`
|
||||
self.trackers.append(tracker_init(project_name, self.logging_dir))
|
||||
else:
|
||||
self.trackers.append(tracker_init(project_name))
|
||||
if config is not None:
|
||||
for tracker in self.trackers:
|
||||
tracker.store_init_configuration(config)
|
||||
|
||||
def log(self, values: dict, step: Optional[int] = None):
|
||||
"""
|
||||
Logs `values` to all stored trackers in `self.trackers`.
|
||||
|
||||
Args:
|
||||
values (`dict`):
|
||||
Values should be a dictionary-like object containing only types `int`, `float`, or `str`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
if self.is_main_process:
|
||||
for tracker in self.trackers:
|
||||
tracker.log(values, step=step)
|
||||
|
||||
def end_training(self):
|
||||
"""
|
||||
Runs any special end training behaviors, such as stopping trackers
|
||||
"""
|
||||
if self.is_main_process:
|
||||
for tracker in self.trackers:
|
||||
tracker.finish()
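A hedged end-to-end tracking sketch; the constructor arguments (`log_with`, `logging_dir`) and the training-loop names are assumptions used only for illustration:
accelerator = Accelerator(log_with="tensorboard", logging_dir="runs")
accelerator.init_trackers("my_project", config={"learning_rate": 3e-4, "epochs": 3})
for step, batch in enumerate(dataloader):
    loss = training_step(batch)            # hypothetical helper
    accelerator.log({"train_loss": loss.item()}, step=step)
accelerator.end_training()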
|
||||
|
||||
def save(self, obj, f):
|
||||
"""
|
||||
Save the object passed to disk once per machine. Use in place of :obj:`torch.save`.
|
||||
Save the object passed to disk once per machine. Use in place of `torch.save`.
|
||||
|
||||
Args:
|
||||
obj: The object to save.
|
||||
f (:obj:`str` or :obj:`os.PathLike`):
|
||||
Where to save the content of :obj:`obj`.
|
||||
f (`str` or `os.PathLike`):
|
||||
Where to save the content of `obj`.
|
||||
"""
|
||||
save(obj, f)
|
||||
|
||||
def save_state(self, output_dir: str):
|
||||
"""
|
||||
Saves the current states of the model, optimizer, scaler, RNG generators, and registered objects.
|
||||
|
||||
Args:
|
||||
output_dir (`str` or `os.PathLike`):
|
||||
The name of the folder to save all relevant weights and states.
|
||||
"""
|
||||
# Check if folder exists
|
||||
output_dir = os.path.expanduser(output_dir)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
logger.info(f"Saving current state to {output_dir}")
|
||||
weights = [self.get_state_dict(m) for m in self._models]
|
||||
save_location = save_accelerator_state(
|
||||
output_dir, weights, self._optimizers, self._schedulers, self.state.process_index, self.scaler
|
||||
)
|
||||
for i, obj in enumerate(self._custom_objects):
|
||||
save_custom_state(obj, output_dir, i)
|
||||
return save_location
|
||||
|
||||
def load_state(self, input_dir: str):
|
||||
"""
|
||||
Loads the current states of the model, optimizer, scaler, RNG generators, and registered objects.
|
||||
|
||||
Args:
|
||||
input_dir (`str` or `os.PathLike`):
|
||||
The name of the folder all relevant weights and states were saved in.
|
||||
"""
|
||||
# Check if folder exists
|
||||
input_dir = os.path.expanduser(input_dir)
|
||||
if not os.path.isdir(input_dir):
|
||||
raise ValueError(f"Tried to find {input_dir} but folder does not exist")
|
||||
logger.info(f"Loading states from {input_dir}")
|
||||
load_accelerator_state(
|
||||
input_dir, self._models, self._optimizers, self._schedulers, self.state.process_index, self.scaler
|
||||
)
|
||||
custom_checkpoints = [f for f in os.listdir(input_dir) if "custom_checkpoint" in f]
|
||||
if len(custom_checkpoints) != len(self._custom_objects):
|
||||
err = "Warning! Number of found checkpoints does not match the number of registered objects:"
|
||||
err += f"\n\tFound checkpoints: {len(custom_checkpoints)}"
|
||||
err += f"\n\tRegistered objects: {len(self._custom_objects)}\nSkipping."
|
||||
logger.warn(err)
|
||||
else:
|
||||
logger.info(f"Loading in {len(custom_checkpoints)} custom states")
|
||||
for index, obj in enumerate(self._custom_objects):
|
||||
load_custom_state(obj, input_dir, index)
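A minimal checkpointing sketch (the path is illustrative; the same models/optimizers/schedulers must be prepared, and the same custom objects registered, before loading):
accelerator.save_state("checkpoints/step_1000")
# ... later, in the same or a resumed run ...
accelerator.load_state("checkpoints/step_1000")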
|
||||
|
||||
def free_memory(self):
|
||||
"""
|
||||
Will release all references to the internal objects stored and call the garbage collector. You should call this
|
||||
method between two trainings with different models/optimizers.
|
||||
"""
|
||||
self._schedulers = []
|
||||
self._optimizers = []
|
||||
self._models = []
|
||||
self.deepspeed_engine = None
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def clear(self):
|
||||
"""
|
||||
Alias for [`Accelerator.free_memory`]; releases all references to the internal objects stored and calls the
|
||||
garbage collector. You should call this method between two trainings with different models/optimizers.
|
||||
"""
|
||||
self.free_memory()
|
||||
|
||||
def _get_named_parameters(self, *args):
|
||||
named_parameters = {}
|
||||
for obj in args:
|
||||
@ -553,6 +883,30 @@ class Accelerator:
|
||||
|
||||
return state_dict
|
||||
|
||||
def register_for_checkpointing(self, *objects):
|
||||
"""
|
||||
Makes note of `objects` and will save or load them in during `save_state` or `load_state`.
|
||||
|
||||
These should be utilized when the state is being loaded or saved in the same script. They are not designed to be
|
||||
used across different scripts.
|
||||
|
||||
<Tip>
|
||||
|
||||
Every `object` must have a `load_state_dict` and `state_dict` function to be stored.
|
||||
|
||||
</Tip>
|
||||
"""
|
||||
invalid_objects = []
|
||||
for obj in objects:
|
||||
if not hasattr(obj, "state_dict") or not hasattr(obj, "load_state_dict"):
|
||||
invalid_objects.append(obj)
|
||||
if len(invalid_objects) > 0:
|
||||
err = "All `objects` must include a `state_dict` and `load_state_dict` function to be stored. The following inputs are invalid:"
|
||||
for index, obj in enumerate(invalid_objects):
|
||||
err += f"\n\t- Item at index {index}, `{get_pretty_name(obj)}`"
|
||||
raise ValueError(err)
|
||||
self._custom_objects.extend(objects)
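A sketch of registering a custom object; `EMAWeights` is a hypothetical class that only needs the `state_dict`/`load_state_dict` interface required above:
class EMAWeights:
    def __init__(self):
        self.shadow = {}
    def state_dict(self):
        return self.shadow
    def load_state_dict(self, state):
        self.shadow = state

ema = EMAWeights()
accelerator.register_for_checkpointing(ema)
# `ema` is now saved by `save_state` and restored by `load_state` alongside models/optimizers.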
|
||||
|
||||
@contextmanager
|
||||
def autocast(self):
|
||||
"""
|
||||
@ -560,10 +914,16 @@ class Accelerator:
|
||||
different will happen otherwise.
|
||||
"""
|
||||
if self.native_amp:
|
||||
autocast_context = torch.cuda.amp.autocast()
|
||||
if self.mixed_precision == "fp16" and version.parse(torch.__version__) >= version.parse("1.10"):
|
||||
autocast_context = torch.cuda.amp.autocast(dtype=torch.float16)
|
||||
elif self.mixed_precision == "bf16":
|
||||
autocast_context = torch.cuda.amp.autocast(dtype=torch.bfloat16)
|
||||
else:
|
||||
autocast_context = torch.cuda.amp.autocast()
|
||||
|
||||
autocast_context.__enter__()
|
||||
yield
|
||||
autocast_context.__exit__()
|
||||
autocast_context.__exit__(*sys.exc_info())
|
||||
else:
|
||||
yield
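Typical use of the context manager above (the model, batch and loss computation are assumed, transformers-style):
with accelerator.autocast():
    outputs = model(**batch)
    loss = outputs.loss
accelerator.backward(loss)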
|
||||
|
||||
@ -574,6 +934,6 @@ class Accelerator:
|
||||
case the learning rate should not be changed.
|
||||
"""
|
||||
for optimizer in self._optimizers:
|
||||
if optimizer.is_overflow:
|
||||
if optimizer.step_was_skipped:
|
||||
return True
|
||||
return False
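A sketch of how this flag is typically consumed in a training loop, so the scheduler is not stepped when the gradient scaler skipped the optimizer step; the property name `optimizer_step_was_skipped` is taken from the released API and is an assumption here, since only its body appears in the diff:
accelerator.backward(loss)
optimizer.step()
if not accelerator.optimizer_step_was_skipped:
    lr_scheduler.step()
optimizer.zero_grad()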
|
||||
|
||||
177
src/accelerate/checkpointing.py
Normal file
@ -0,0 +1,177 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.cuda.amp import GradScaler
|
||||
|
||||
from .state import is_tpu_available
|
||||
from .utils import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME, get_pretty_name, save
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def save_accelerator_state(
|
||||
output_dir: str,
|
||||
model_states: List[dict],
|
||||
optimizers: list,
|
||||
schedulers: list,
|
||||
process_index: int,
|
||||
scaler: GradScaler = None,
|
||||
):
|
||||
"""
|
||||
Saves the current states of the models, optimizers, scaler, and RNG generators to a given directory.
|
||||
|
||||
Args:
|
||||
output_dir (`str` or `os.PathLike`):
|
||||
The name of the folder to save all relevant weights and states.
|
||||
model_states (`List[torch.nn.Module]`):
|
||||
A list of model states
|
||||
optimizers (`List[torch.optim.Optimizer]`):
|
||||
A list of optimizer instances
|
||||
schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
|
||||
A list of learning rate schedulers
|
||||
process_index (`int`):
|
||||
The current process index in the Accelerator state
|
||||
scaler (`torch.cuda.amp.GradScaler`, *optional*):
|
||||
An optional gradient scaler instance to save
|
||||
"""
|
||||
# Model states
|
||||
for i, state in enumerate(model_states):
|
||||
weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
|
||||
output_model_file = os.path.join(output_dir, weights_name)
|
||||
save(state, output_model_file)
|
||||
logger.info(f"Model weights saved in {output_model_file}")
|
||||
# Optimizer states
|
||||
for i, opt in enumerate(optimizers):
|
||||
state = opt.state_dict()
|
||||
optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
|
||||
output_optimizer_file = os.path.join(output_dir, optimizer_name)
|
||||
save(state, output_optimizer_file)
|
||||
logger.info(f"Optimizer state saved in {output_optimizer_file}")
|
||||
# Scheduler states
|
||||
for i, scheduler in enumerate(schedulers):
|
||||
state = scheduler.state_dict()
|
||||
scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
|
||||
output_scheduler_file = os.path.join(output_dir, scheduler_name)
|
||||
save(state, output_scheduler_file)
|
||||
logger.info(f"Scheduler state saved in {output_scheduler_file}")
|
||||
# GradScaler state
|
||||
if scaler is not None:
|
||||
state = scaler.state_dict()
|
||||
output_scaler_file = os.path.join(output_dir, SCALER_NAME)
|
||||
torch.save(state, output_scaler_file)
|
||||
logger.info(f"Gradient scaler state saved in {output_scaler_file}")
|
||||
# Random number generator states
|
||||
states = {}
|
||||
states_name = f"{RNG_STATE_NAME}_{process_index}.pkl"
|
||||
states["random_state"] = random.getstate()
|
||||
states["numpy_random_seed"] = np.random.get_state()
|
||||
states["torch_manual_seed"] = torch.get_rng_state()
|
||||
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_tpu_available():
|
||||
states["xm_seed"] = torch.tensor(xm.get_rng_state())
|
||||
output_states_file = os.path.join(output_dir, states_name)
|
||||
torch.save(states, output_states_file)
|
||||
logger.info(f"Random states saved in {output_states_file}")
|
||||
return output_dir
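For reference, a sketch of the resulting folder layout with one model, optimizer and scheduler; file names follow the `*_NAME` constants used above, whose exact values are not shown in this diff:
# output_dir/
#   {MODEL_NAME}.bin
#   {OPTIMIZER_NAME}.bin
#   {SCHEDULER_NAME}.bin
#   {SCALER_NAME}                          # only when a GradScaler is passed
#   {RNG_STATE_NAME}_{process_index}.pkl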
|
||||
|
||||
|
||||
def load_accelerator_state(input_dir, models, optimizers, schedulers, process_index, scaler=None):
|
||||
"""
|
||||
Loads states of the models, optimizers, scaler, and RNG generators from a given directory.
|
||||
|
||||
Args:
|
||||
input_dir (`str` or `os.PathLike`):
|
||||
The name of the folder to load all relevant weights and states.
|
||||
models (`List[torch.nn.Module]`):
|
||||
A list of model instances
|
||||
optimizers (`List[torch.optim.Optimizer]`):
|
||||
A list of optimizer instances
|
||||
schedulers (`List[torch.optim.lr_scheduler._LRScheduler]`):
|
||||
A list of learning rate schedulers
|
||||
process_index (`int`):
|
||||
The current process index in the Accelerator state
|
||||
scaler (`torch.cuda.amp.GradScaler`, *optional*):
|
||||
An optional *GradScaler* instance to load
|
||||
"""
|
||||
# Model states
|
||||
for i, model in enumerate(models):
|
||||
weights_name = f"{MODEL_NAME}.bin" if i == 0 else f"{MODEL_NAME}_{i}.bin"
|
||||
input_model_file = os.path.join(input_dir, weights_name)
|
||||
models[i].load_state_dict(torch.load(input_model_file, map_location="cpu"))
|
||||
logger.info("All model weights loaded successfully")
|
||||
|
||||
# Optimizer states
|
||||
for i, opt in enumerate(optimizers):
|
||||
optimizer_name = f"{OPTIMIZER_NAME}.bin" if i == 0 else f"{OPTIMIZER_NAME}_{i}.bin"
|
||||
input_optimizer_file = os.path.join(input_dir, optimizer_name)
|
||||
optimizers[i].load_state_dict(torch.load(input_optimizer_file, map_location="cpu"))
|
||||
logger.info("All optimizer states loaded successfully")
|
||||
|
||||
# Scheduler states
|
||||
for i, scheduler in enumerate(schedulers):
|
||||
scheduler_name = f"{SCHEDULER_NAME}.bin" if i == 0 else f"{SCHEDULER_NAME}_{i}.bin"
|
||||
input_scheduler_file = os.path.join(input_dir, scheduler_name)
|
||||
scheduler.load_state_dict(torch.load(input_scheduler_file))
|
||||
logger.info("All scheduler states loaded successfully")
|
||||
|
||||
# GradScaler state
|
||||
if scaler is not None:
|
||||
input_scaler_file = os.path.join(input_dir, SCALER_NAME)
|
||||
scaler.load_state_dict(torch.load(input_scaler_file))
|
||||
logger.info("GradScaler state loaded successfully")
|
||||
|
||||
# Random states
|
||||
states = torch.load(os.path.join(input_dir, f"{RNG_STATE_NAME}_{process_index}.pkl"))
|
||||
random.setstate(states["random_state"])
|
||||
np.random.set_state(states["numpy_random_seed"])
|
||||
torch.set_rng_state(states["torch_manual_seed"])
|
||||
torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_tpu_available():
|
||||
xm.set_rng_state(states["xm_seed"])
|
||||
logger.info("All random states loaded successfully")
|
||||
|
||||
|
||||
def save_custom_state(obj, path, index: int = 0):
|
||||
"""
|
||||
Saves the state of `obj` to `{path}/custom_checkpoint_{index}.pkl`
|
||||
"""
|
||||
# Should this be the right way to get a qual_name type value from `obj`?
|
||||
save_location = Path(path) / f"custom_checkpoint_{index}.pkl"
|
||||
logger.info(f"Saving the state of {get_pretty_name(obj)} to {save_location}")
|
||||
torch.save(obj.state_dict(), save_location)
|
||||
|
||||
|
||||
def load_custom_state(obj, path, index: int = 0):
|
||||
"""
|
||||
Loads the state of `obj` at `{path}/custom_checkpoint_{index}.pkl`
|
||||
"""
|
||||
load_location = f"{path}/custom_checkpoint_{index}.pkl"
|
||||
logger.info(f"Loading the state of {get_pretty_name(obj)} from {load_location}")
|
||||
obj.load_state_dict(torch.load(load_location))
|
||||
@ -17,6 +17,7 @@
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from accelerate.commands.config import config_command_parser
|
||||
from accelerate.commands.env import env_command_parser
|
||||
from accelerate.commands.launch import launch_command_parser
|
||||
from accelerate.commands.test import test_command_parser
|
||||
|
||||
@ -29,6 +30,7 @@ def main():
|
||||
config_command_parser(subparsers=subparsers)
|
||||
launch_command_parser(subparsers=subparsers)
|
||||
test_command_parser(subparsers=subparsers)
|
||||
env_command_parser(subparsers=subparsers)
|
||||
|
||||
# Let's go
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -48,7 +48,7 @@ def config_command_parser(subparsers=None):
|
||||
"--config_file",
|
||||
default=None,
|
||||
help=(
|
||||
"The path to use to store the config file. Will default to a file named default_config.json in the cache "
|
||||
"The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
|
||||
"location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
|
||||
"such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
|
||||
"with 'huggingface'."
|
||||
|
||||
@ -52,7 +52,19 @@ def get_cluster_input():
|
||||
lambda x: int(x),
|
||||
)
|
||||
|
||||
deepspeed_config = None
|
||||
if distributed_type == DistributedType.NO:
|
||||
use_cpu = _ask_field(
|
||||
"Do you want to run your training on CPU only (even if a GPU is available)? [yes/NO]:",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
elif distributed_type == DistributedType.MULTI_CPU:
|
||||
use_cpu = True
|
||||
else:
|
||||
use_cpu = False
|
||||
|
||||
deepspeed_config = {}
|
||||
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.NO]:
|
||||
use_deepspeed = _ask_field(
|
||||
"Do you want to use DeepSpeed? [yes/NO]: ",
|
||||
@ -66,7 +78,6 @@ def get_cluster_input():
|
||||
is_deepspeed_available()
|
||||
), "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
|
||||
|
||||
deepspeed_config = {}
|
||||
if distributed_type == DistributedType.DEEPSPEED:
|
||||
deepspeed_config["zero_stage"] = _ask_field(
|
||||
"What should be your DeepSpeed's ZeRO optimization stage (0, 1, 2, 3)? [2]: ",
|
||||
@ -87,6 +98,34 @@ def get_cluster_input():
|
||||
default=1,
|
||||
)
|
||||
|
||||
fsdp_config = {}
|
||||
if distributed_type in [DistributedType.MULTI_GPU]:
|
||||
use_fsdp = _ask_field(
|
||||
"Do you want to use FullyShardedDataParallel? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if use_fsdp:
|
||||
distributed_type = DistributedType.FSDP
|
||||
if distributed_type == DistributedType.FSDP:
|
||||
fsdp_config["sharding_strategy"] = _ask_field(
|
||||
"What should be your sharding strategy ([1] FULL_SHARD, [2] SHARD_GRAD_OP)? [1]: ",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
)
|
||||
fsdp_config["offload_params"] = _ask_field(
|
||||
"Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
fsdp_config["min_num_params"] = _ask_field(
|
||||
"What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
|
||||
lambda x: int(x),
|
||||
default=1e8,
|
||||
)
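# For illustration, accepting the prompt defaults above would produce an fsdp_config roughly like
# (values are the defaults shown in the prompts, not authoritative):
#   fsdp_config = {"sharding_strategy": 1, "offload_params": False, "min_num_params": 1e8}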
|
||||
|
||||
if distributed_type == DistributedType.TPU:
|
||||
main_training_function = _ask_field(
|
||||
"What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
|
||||
@ -103,24 +142,25 @@ def get_cluster_input():
|
||||
)
|
||||
|
||||
if distributed_type != DistributedType.TPU:
|
||||
fp16 = _ask_field(
|
||||
"Do you wish to use FP16 (mixed precision)? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
mixed_precision = _ask_field(
|
||||
"Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: ",
|
||||
lambda x: str(x).lower(),
|
||||
default="no",
|
||||
)
|
||||
else:
|
||||
fp16 = False
|
||||
mixed_precision = "no"
|
||||
|
||||
return ClusterConfig(
|
||||
compute_environment=ComputeEnvironment.LOCAL_MACHINE,
|
||||
distributed_type=distributed_type,
|
||||
num_processes=num_processes,
|
||||
fp16=fp16,
|
||||
mixed_precision=mixed_precision,
|
||||
machine_rank=machine_rank,
|
||||
num_machines=num_machines,
|
||||
main_process_ip=main_process_ip,
|
||||
main_process_port=main_process_port,
|
||||
main_training_function=main_training_function,
|
||||
deepspeed_config=deepspeed_config,
|
||||
fsdp_config=fsdp_config,
|
||||
use_cpu=use_cpu,
|
||||
)
|
||||
|
||||
@ -28,7 +28,7 @@ hf_cache_home = os.path.expanduser(
|
||||
os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
|
||||
)
|
||||
cache_dir = os.path.join(hf_cache_home, "accelerate")
|
||||
default_json_config_file = os.path.join(cache_dir, "default_config.json")
|
||||
default_json_config_file = os.path.join(cache_dir, "default_config.yaml")
|
||||
default_yaml_config_file = os.path.join(cache_dir, "default_config.yaml")
|
||||
|
||||
# For backward compatibility: the default config is the json one if it's the only existing file.
|
||||
@ -66,7 +66,8 @@ def load_config_from_file(config_file):
|
||||
class BaseConfig:
|
||||
compute_environment: ComputeEnvironment
|
||||
distributed_type: Union[DistributedType, SageMakerDistributedType]
|
||||
fp16: bool
|
||||
mixed_precision: str
|
||||
use_cpu: bool
|
||||
|
||||
def to_dict(self):
|
||||
result = self.__dict__
|
||||
@ -83,6 +84,12 @@ class BaseConfig:
|
||||
config_dict = json.load(f)
|
||||
if "compute_environment" not in config_dict:
|
||||
config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
|
||||
if "mixed_precision" not in config_dict:
|
||||
config_dict["mixed_precision"] = "fp16" if ("fp16" in config_dict and config_dict["fp16"]) else "no"
|
||||
if "fp16" in config_dict: # Convert the config to the new format.
|
||||
del config_dict["fp16"]
|
||||
if "use_cpu" not in config_dict:
|
||||
config_dict["use_cpu"] = False
|
||||
return cls(**config_dict)
|
||||
|
||||
def to_json_file(self, json_file):
|
||||
@ -97,6 +104,14 @@ class BaseConfig:
|
||||
config_dict = yaml.safe_load(f)
|
||||
if "compute_environment" not in config_dict:
|
||||
config_dict["compute_environment"] = ComputeEnvironment.LOCAL_MACHINE
|
||||
|
||||
if "mixed_precision" not in config_dict:
|
||||
config_dict["mixed_precision"] = "fp16" if ("fp16" in config_dict and config_dict["fp16"]) else "no"
|
||||
if "fp16" in config_dict: # Convert the config to the new format.
|
||||
del config_dict["fp16"]
|
||||
if "use_cpu" not in config_dict:
|
||||
config_dict["use_cpu"] = False
|
||||
|
||||
return cls(**config_dict)
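# Hedged sketch of the backward-compatibility path above: a legacy YAML config that only sets `fp16`
# is normalized before the dataclass is built.
#   legacy file:        {"compute_environment": ..., "distributed_type": "NO", "fp16": true, ...}
#   after normalization: config_dict["mixed_precision"] == "fp16" and the "fp16" key has been removed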
|
||||
|
||||
def to_yaml_file(self, yaml_file):
|
||||
@ -121,6 +136,15 @@ class ClusterConfig(BaseConfig):
|
||||
|
||||
# args for deepspeed_plugin
|
||||
deepspeed_config: dict = None
|
||||
# args for fsdp
|
||||
fsdp_config: dict = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.deepspeed_config is None:
|
||||
self.deepspeed_config = {}
|
||||
if self.fsdp_config is None:
|
||||
self.fsdp_config = {}
|
||||
return super().__post_init__()
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -20,7 +20,7 @@ from accelerate.state import ComputeEnvironment, SageMakerDistributedType
|
||||
from accelerate.utils import is_boto3_available
|
||||
|
||||
from .config_args import SageMakerConfig
|
||||
from .config_utils import _ask_field, _convert_sagemaker_distributed_mode, _convert_yes_no_to_bool
|
||||
from .config_utils import _ask_field, _convert_sagemaker_distributed_mode
|
||||
|
||||
|
||||
if is_boto3_available():
|
||||
@ -139,11 +139,11 @@ def get_sagemaker_input():
|
||||
lambda x: int(x),
|
||||
default=2,
|
||||
)
|
||||
fp16 = _ask_field(
|
||||
"Do you wish to use FP16 (mixed precision)? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
|
||||
mixed_precision = _ask_field(
|
||||
"Do you wish to use FP16 or BF16 (mixed precision)? [No/FP16/BF16]: ",
|
||||
lambda x: str(x),
|
||||
default="No",
|
||||
)
|
||||
|
||||
return SageMakerConfig(
|
||||
@ -153,6 +153,6 @@ def get_sagemaker_input():
|
||||
profile=aws_profile,
|
||||
region=aws_region,
|
||||
iam_role_name=iam_role_name,
|
||||
fp16=fp16,
|
||||
mixed_precision=mixed_precision,
|
||||
num_machines=num_machines,
|
||||
)
|
||||
|
||||
68
src/accelerate/commands/env.py
Normal file
@ -0,0 +1,68 @@
|
||||
import argparse
|
||||
import os
|
||||
import platform
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from accelerate import __version__ as version
|
||||
from accelerate.commands.config import default_config_file, load_config_from_file
|
||||
|
||||
|
||||
def env_command_parser(subparsers=None):
|
||||
if subparsers is not None:
|
||||
parser = subparsers.add_parser("env")
|
||||
else:
|
||||
parser = argparse.ArgumentParser("Accelerate env command")
|
||||
|
||||
parser.add_argument(
|
||||
"--config_file", default=None, help="The config file to use for the default values in the launching script."
|
||||
)
|
||||
|
||||
if subparsers is not None:
|
||||
parser.set_defaults(func=env_command)
|
||||
return parser
|
||||
|
||||
|
||||
def env_command(args):
|
||||
pt_version = torch.__version__
|
||||
pt_cuda_available = torch.cuda.is_available()
|
||||
|
||||
accelerate_config = "Not found"
|
||||
# Get the default from the config file.
|
||||
if args.config_file is not None or os.path.isfile(default_config_file):
|
||||
accelerate_config = load_config_from_file(args.config_file).to_dict()
|
||||
|
||||
info = {
|
||||
"`Accelerate` version": version,
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Numpy version": np.__version__,
|
||||
"PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
|
||||
}
|
||||
|
||||
print("\nCopy-and-paste the text below in your GitHub issue\n")
|
||||
print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]))
|
||||
|
||||
print("- `Accelerate` default config:" if args.config_file is None else "- `Accelerate` config passed:")
|
||||
accelerate_config_str = (
|
||||
"\n".join([f"\t- {prop}: {val}" for prop, val in accelerate_config.items()])
|
||||
if isinstance(accelerate_config, dict)
|
||||
else f"\t{accelerate_config}"
|
||||
)
|
||||
print(accelerate_config_str)
|
||||
|
||||
info["`Accelerate` configs"] = accelerate_config
|
||||
|
||||
return info
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = env_command_parser()
|
||||
args = parser.parse_args()
|
||||
env_command(args)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@ -19,6 +19,7 @@ import importlib
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import warnings
|
||||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
@ -26,7 +27,7 @@ from typing import Dict, List
|
||||
from accelerate.commands.config import default_config_file, load_config_from_file
|
||||
from accelerate.commands.config.config_args import SageMakerConfig
|
||||
from accelerate.state import ComputeEnvironment, DistributedType
|
||||
from accelerate.utils import PrepareForLaunch, is_sagemaker_available
|
||||
from accelerate.utils import PrecisionType, PrepareForLaunch, is_sagemaker_available
|
||||
|
||||
|
||||
def launch_command_parser(subparsers=None):
|
||||
@ -50,9 +51,42 @@ def launch_command_parser(subparsers=None):
|
||||
action="store_true",
|
||||
help="Whether to use deepspeed.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_fsdp",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Whether to use fsdp.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--offload_params",
|
||||
default="false",
|
||||
type=str,
|
||||
help="Decides whether (true|false) to offload parameters and gradients to CPU (useful only when the `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min_num_params",
|
||||
type=int,
|
||||
default=1e8,
|
||||
help="FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sharding_strategy",
|
||||
type=int,
|
||||
default=1,
|
||||
help="FSDP's Sharding Strategy. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tpu", default=False, action="store_true", help="Whether or not this should launch a TPU training."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether or not to use mixed precision training. "
|
||||
"Choose between FP16 and BF16 (bfloat16) training. "
|
||||
"BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--fp16", default=False, action="store_true", help="Whether or not to use mixed precision training."
|
||||
)
|
||||
@ -81,6 +115,17 @@ def launch_command_parser(subparsers=None):
|
||||
default=None,
|
||||
help="The name of the main function to be executed in your script (only for TPU training).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--module",
|
||||
action="store_true",
|
||||
help="Change each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no_python",
|
||||
action="store_true",
|
||||
help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--aws_access_key_id",
|
||||
type=str,
|
||||
@ -129,12 +174,30 @@ def launch_command_parser(subparsers=None):
|
||||
|
||||
|
||||
def simple_launcher(args):
|
||||
cmd = [sys.executable, args.training_script]
|
||||
cmd = []
|
||||
if args.no_python and args.module:
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
if not args.no_python:
|
||||
cmd.append(sys.executable)
|
||||
if args.module:
|
||||
cmd.append("-m")
|
||||
cmd.append(args.training_script)
|
||||
cmd.extend(args.training_script_args)
|
||||
|
||||
current_env = os.environ.copy()
|
||||
current_env["USE_CPU"] = str(args.cpu)
|
||||
current_env["USE_FP16"] = str(args.fp16)
|
||||
try:
|
||||
mixed_precision = PrecisionType(args.mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if args.fp16:
|
||||
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
current_env["MIXED_PRECISION"] = str(mixed_precision)
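# Illustrative mappings from launcher flags to the environment variable set above
# (the commands and script name are examples, not output of this diff):
#   accelerate launch --mixed_precision bf16 train.py   -> MIXED_PRECISION=bf16
#   accelerate launch --fp16 train.py                   -> MIXED_PRECISION=fp16 (+ DeprecationWarning)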
|
||||
|
||||
process = subprocess.Popen(cmd, env=current_env)
|
||||
process.wait()
|
||||
@ -143,8 +206,7 @@ def simple_launcher(args):
|
||||
|
||||
|
||||
def multi_gpu_launcher(args):
|
||||
cmd = [sys.executable, "-m", "torch.distributed.launch"]
|
||||
cmd.extend(["--use_env"])
|
||||
cmd = [sys.executable, "-m", "torch.distributed.launch", "--use_env"]
|
||||
if args.num_machines > 1:
|
||||
cmd.extend(
|
||||
[
|
||||
@ -164,12 +226,34 @@ def multi_gpu_launcher(args):
|
||||
cmd.extend(["--nproc_per_node", str(args.num_processes)])
|
||||
if args.main_process_port is not None:
|
||||
cmd.extend(["--master_port", str(args.main_process_port)])
|
||||
|
||||
if args.module and args.no_python:
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
elif args.module:
|
||||
cmd.append("--module")
|
||||
elif args.no_python:
|
||||
cmd.append("--no_python")
|
||||
cmd.append(args.training_script)
|
||||
cmd.extend(args.training_script_args)
|
||||
|
||||
current_env = os.environ.copy()
|
||||
current_env["USE_FP16"] = str(args.fp16)
|
||||
try:
|
||||
mixed_precision = PrecisionType(args.mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if args.fp16:
|
||||
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
current_env["MIXED_PRECISION"] = str(mixed_precision)
|
||||
if args.use_fsdp:
|
||||
current_env["USE_FSDP"] = "true"
|
||||
current_env["FSDP_OFFLOAD_PARAMS"] = str(args.offload_params).lower()
|
||||
current_env["FSDP_MIN_NUM_PARAMS"] = str(args.min_num_params)
|
||||
current_env["FSDP_SHARDING_STRATEGY"] = str(args.sharding_strategy)
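# An illustrative FSDP launch and the environment it produces (argument names per the parser earlier
# in this diff; the script name is hypothetical):
#   accelerate launch --use_fsdp --offload_params true --min_num_params 20000000 --sharding_strategy 1 train.py
#     -> USE_FSDP=true, FSDP_OFFLOAD_PARAMS=true, FSDP_MIN_NUM_PARAMS=20000000, FSDP_SHARDING_STRATEGY=1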
|
||||
process = subprocess.Popen(cmd, env=current_env)
|
||||
process.wait()
|
||||
if process.returncode != 0:
|
||||
@ -177,8 +261,7 @@ def multi_gpu_launcher(args):
|
||||
|
||||
|
||||
def deepspeed_launcher(args):
|
||||
|
||||
cmd = ["deepspeed"]
|
||||
cmd = ["deepspeed", "--no_local_rank"]
|
||||
if args.num_machines > 1:
|
||||
cmd.extend(
|
||||
[
|
||||
@ -197,11 +280,28 @@ def deepspeed_launcher(args):
|
||||
else:
|
||||
cmd.extend(["--num_gpus", str(args.num_processes)])
|
||||
|
||||
if args.module and args.no_python:
|
||||
raise ValueError("--module and --no_python cannot be used together")
|
||||
elif args.module:
|
||||
cmd.append("--module")
|
||||
elif args.no_python:
|
||||
cmd.append("--no_python")
|
||||
cmd.append(args.training_script)
|
||||
cmd.extend(args.training_script_args)
|
||||
|
||||
current_env = os.environ.copy()
|
||||
current_env["USE_FP16"] = str(args.fp16)
|
||||
try:
|
||||
mixed_precision = PrecisionType(args.mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if args.fp16:
|
||||
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
current_env["MIXED_PRECISION"] = str(mixed_precision)
|
||||
current_env["USE_DEEPSPEED"] = "true"
|
||||
current_env["DEEPSPEED_ZERO_STAGE"] = str(args.zero_stage)
|
||||
current_env["GRADIENT_ACCUMULATION_STEPS"] = str(args.gradient_accumulation_steps)
|
||||
@ -216,21 +316,28 @@ def deepspeed_launcher(args):
|
||||
def tpu_launcher(args):
|
||||
import torch_xla.distributed.xla_multiprocessing as xmp
|
||||
|
||||
# Import training_script as a module.
|
||||
script_path = Path(args.training_script)
|
||||
sys.path.append(str(script_path.parent.resolve()))
|
||||
mod_name = script_path.stem
|
||||
if args.no_python:
|
||||
raise ValueError("--no_python cannot be used with TPU launcher")
|
||||
|
||||
if args.module:
|
||||
mod_name = args.training_script
|
||||
else:
|
||||
# Import training_script as a module
|
||||
script_path = Path(args.training_script)
|
||||
sys.path.append(str(script_path.parent.resolve()))
|
||||
mod_name = script_path.stem
|
||||
|
||||
mod = importlib.import_module(mod_name)
|
||||
if not hasattr(mod, args.main_training_function):
|
||||
raise ValueError(
|
||||
f"Your training script should have a function named {args.main_training_function}, or you should pass a "
|
||||
"different value to `--main_training_function`."
|
||||
)
|
||||
main_function = getattr(mod, args.main_training_function)
|
||||
|
||||
# Patch sys.argv
|
||||
sys.argv = [args.training_script] + args.training_script_args
|
||||
sys.argv = [mod.__file__] + args.training_script_args
|
||||
|
||||
main_function = getattr(mod, args.main_training_function)
|
||||
xmp.spawn(PrepareForLaunch(main_function), args=(), nprocs=args.num_processes)
|
||||
|
||||
|
||||
@ -281,6 +388,11 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
|
||||
raise ImportError(
|
||||
"Please install sagemaker to be able to launch training on Amazon SageMaker with `pip install accelerate[sagemaker]`"
|
||||
)
|
||||
if args.module or args.no_python:
|
||||
raise ValueError(
|
||||
"SageMaker requires a python training script file and cannot be used with --module or --no_python"
|
||||
)
|
||||
|
||||
from sagemaker.huggingface import HuggingFace
|
||||
|
||||
# configure environment
|
||||
@ -309,8 +421,19 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
|
||||
print("Converting Arguments to Hyperparameters")
|
||||
hyperparameters = _convert_nargs_to_dict(args.training_script_args)
|
||||
|
||||
environment = {"USE_FP16": args.fp16} # Environment variables to be set for use during training job
|
||||
try:
|
||||
mixed_precision = PrecisionType(args.mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if args.fp16:
|
||||
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
# Environment variables to be set for use during training job
|
||||
environment = {"MIXED_PRECISION": str(mixed_precision)}
|
||||
# configure distribution set up
|
||||
distribution = None # TODO: not yet implemented
|
||||
|
||||
@ -338,8 +461,8 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
|
||||
|
||||
def launch_command(args):
|
||||
# Sanity checks
|
||||
if sum([args.multi_gpu, args.tpu, args.use_deepspeed]) > 1:
|
||||
raise ValueError("You can only pick one between `--multi_gpu`, `--use_deepspeed`, `--tpu`.")
|
||||
if sum([args.multi_gpu, args.tpu, args.use_deepspeed, args.use_fsdp]) > 1:
|
||||
raise ValueError("You can only pick one between `--multi_gpu`, `--use_deepspeed`, `--tpu`, `--use_fsdp`.")
|
||||
|
||||
defaults = None
|
||||
# Get the default from the config file.
|
||||
@ -349,6 +472,7 @@ def launch_command(args):
|
||||
args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED
|
||||
args.multi_gpu = defaults.distributed_type == DistributedType.MULTI_GPU
|
||||
args.tpu = defaults.distributed_type == DistributedType.TPU
|
||||
args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
|
||||
if defaults.compute_environment == ComputeEnvironment.LOCAL_MACHINE:
|
||||
# Update args with the defaults
|
||||
for name, attr in defaults.__dict__.items():
|
||||
@ -356,17 +480,22 @@ def launch_command(args):
|
||||
for k in defaults.deepspeed_config:
|
||||
if getattr(args, k) is None:
|
||||
setattr(args, k, defaults.deepspeed_config[k])
|
||||
for k in defaults.fsdp_config:
|
||||
setattr(args, k, defaults.fsdp_config[k])
|
||||
continue
|
||||
|
||||
# Those args are handled separately
|
||||
if (
|
||||
name not in ["compute_environment", "fp16", "distributed_type"]
|
||||
name not in ["compute_environment", "fp16", "mixed_precision", "distributed_type"]
|
||||
and getattr(args, name, None) is None
|
||||
):
|
||||
setattr(args, name, attr)
|
||||
|
||||
if not args.fp16:
|
||||
args.fp16 = defaults.fp16
|
||||
if not args.mixed_precision:
|
||||
if args.fp16:
|
||||
args.mixed_precision = "fp16"
|
||||
else:
|
||||
args.mixed_precision = defaults.mixed_precision
|
||||
else:
|
||||
if args.num_processes is None:
|
||||
args.num_processes = 1
|
||||
@ -374,6 +503,8 @@ def launch_command(args):
|
||||
# Use the proper launcher
|
||||
if args.use_deepspeed and not args.cpu:
|
||||
deepspeed_launcher(args)
|
||||
elif args.use_fsdp and not args.cpu:
|
||||
multi_gpu_launcher(args)
|
||||
elif args.multi_gpu and not args.cpu:
|
||||
multi_gpu_launcher(args)
|
||||
elif args.tpu and not args.cpu:
|
||||
|
||||
@ -30,7 +30,7 @@ def test_command_parser(subparsers=None):
|
||||
"--config_file",
|
||||
default=None,
|
||||
help=(
|
||||
"The path to use to store the config file. Will default to a file named default_config.json in the cache "
|
||||
"The path to use to store the config file. Will default to a file named default_config.yaml in the cache "
|
||||
"location, which is the content of the environment `HF_HOME` suffixed with 'accelerate', or if you don't have "
|
||||
"such an environment variable, your cache directory ('~/.cache' or the content of `XDG_CACHE_HOME`) suffixed "
|
||||
"with 'huggingface'."
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import torch
|
||||
@ -20,7 +21,18 @@ from torch.utils.data import BatchSampler, DataLoader, IterableDataset
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState, DistributedType, is_tpu_available
|
||||
from .utils import RNGType, send_to_device, synchronize_rng_states
|
||||
from .utils import (
|
||||
RNGType,
|
||||
broadcast,
|
||||
broadcast_object_list,
|
||||
concatenate,
|
||||
find_batch_size,
|
||||
get_data_structure,
|
||||
initialize_tensors,
|
||||
send_to_device,
|
||||
slice_tensors,
|
||||
synchronize_rng_states,
|
||||
)
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
@ -55,34 +67,34 @@ for v, additional_kwargs in _PYTORCH_DATALOADER_ADDITIONAL_KWARGS.items():
|
||||
|
||||
class BatchSamplerShard(BatchSampler):
|
||||
"""
|
||||
Wraps a PyTorch :obj:`BatchSampler` to generate batches for one of the processes only. Instances of this class will
|
||||
always yield a number of batches that is a round multiple of :obj:`num_processes` and that all have the same size.
|
||||
Depending on the value of the :obj:`drop_last` attribute of the batch sampler passed, it will either stop the
|
||||
iteration at the first batch that would be too small / not present on all processes or loop with indices from the
|
||||
beginning.
|
||||
Wraps a PyTorch `BatchSampler` to generate batches for one of the processes only. Instances of this class will
|
||||
always yield a number of batches that is a round multiple of `num_processes` and that all have the same size.
|
||||
Depending on the value of the `drop_last` attribute of the batch sampler passed, it will either stop the iteration
|
||||
at the first batch that would be too small / not present on all processes or loop with indices from the beginning.
|
||||
|
||||
Args:
|
||||
batch_sampler (:obj:`torch.utils.data.sampler.BatchSampler`):
|
||||
batch_sampler (`torch.utils.data.sampler.BatchSampler`):
|
||||
The batch sampler to split in several shards.
|
||||
num_processes (:obj:`int`, `optional`, defaults to 1):
|
||||
num_processes (`int`, *optional*, defaults to 1):
|
||||
The number of processes running concurrently.
|
||||
process_index (:obj:`int`, `optional`, defaults to 0):
|
||||
process_index (`int`, *optional*, defaults to 0):
|
||||
The index of the current process.
|
||||
split_batches (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
|
||||
yielding different full batches on each process.
|
||||
|
||||
On two processes with a sampler of :obj:`[[0, 1, 2, 3], [4, 5, 6, 7]]`, this will result in:
|
||||
On two processes with a sampler of `[[0, 1, 2, 3], [4, 5, 6, 7]]`, this will result in:
|
||||
|
||||
- the sampler on process 0 to yield :obj:`[0, 1, 2, 3]` and the sampler on process 1 to yield :obj:`[4, 5,
|
||||
6, 7]` if this argument is set to :obj:`False`.
|
||||
- the sampler on process 0 to yield :obj:`[0, 1]` then :obj:`[4, 5]` and the sampler on process 1 to yield
|
||||
:obj:`[2, 3]` then :obj:`[6, 7]` if this argument is set to :obj:`True`.
|
||||
- the sampler on process 0 to yield `[0, 1, 2, 3]` and the sampler on process 1 to yield `[4, 5, 6, 7]` if
|
||||
this argument is set to `False`.
|
||||
- the sampler on process 0 to yield `[0, 1]` then `[4, 5]` and the sampler on process 1 to yield `[2, 3]`
|
||||
then `[6, 7]` if this argument is set to `True`.
|
||||
|
||||
.. warning::
|
||||
<Tip warning={true}>
|
||||
|
||||
This does not support :obj:`BatchSampler` with varying batch size yet.
|
||||
"""
|
||||
This does not support `BatchSampler` with varying batch size yet.
|
||||
|
||||
</Tip>"""
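# A sketch of the behavior documented above, kept as comments since this sits inside the class body;
# the BatchSampler/SequentialSampler setup is illustrative:
#   base = BatchSampler(SequentialSampler(range(8)), batch_size=4, drop_last=False)
#   list(BatchSamplerShard(base, num_processes=2, process_index=0))   # [[0, 1, 2, 3]]
#   list(BatchSamplerShard(base, num_processes=2, process_index=1))   # [[4, 5, 6, 7]]
#   with split_batches=True, process 0 yields [[0, 1], [4, 5]] and process 1 yields [[2, 3], [6, 7]]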
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -176,35 +188,35 @@ class BatchSamplerShard(BatchSampler):
|
||||
|
||||
class IterableDatasetShard(IterableDataset):
|
||||
"""
|
||||
Wraps a PyTorch :obj:`IterableDataset` to generate samples for one of the processes only. Instances of this class
|
||||
will always yield a number of samples that is a round multiple of the actual batch size (depending of the value of
|
||||
:obj:`split_batches`, this is either :obj:`batch_size` or :obj:`batch_size x num_processes`). Depending on the
|
||||
value of the :obj:`drop_last` attribute of the batch sampler passed, it will either stop the iteration at the first
|
||||
batch that would be too small or loop with indices from the beginning.
|
||||
Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
|
||||
always yield a number of samples that is a round multiple of the actual batch size (depending of the value of
|
||||
`split_batches`, this is either `batch_size` or `batch_size x num_processes`). Depending on the value of the
|
||||
`drop_last` attribute of the batch sampler passed, it will either stop the iteration at the first batch that would
|
||||
be too small or loop with indices from the beginning.
|
||||
|
||||
Args:
|
||||
dataset (:obj:`torch.utils.data.dataset.IterableDataset`):
|
||||
dataset (`torch.utils.data.dataset.IterableDataset`):
|
||||
The dataset to split in several shards.
|
||||
batch_size (:obj:`int`, `optional`, defaults to 1):
|
||||
The size of the batches per shard (if :obj:`split_batches=False`) or the size of the batches (if
|
||||
:obj:`split_batches=True`).
|
||||
drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
batch_size (`int`, *optional*, defaults to 1):
|
||||
The size of the batches per shard (if `split_batches=False`) or the size of the batches (if
|
||||
`split_batches=True`).
|
||||
drop_last (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
|
||||
beginning.
|
||||
num_processes (:obj:`int`, `optional`, defaults to 1):
|
||||
num_processes (`int`, *optional*, defaults to 1):
|
||||
The number of processes running concurrently.
|
||||
process_index (:obj:`int`, `optional`, defaults to 0):
|
||||
process_index (`int`, *optional*, defaults to 0):
|
||||
The index of the current process.
|
||||
split_batches (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether the shards should be created by splitting a batch to give a piece of it on each process, or by
|
||||
yielding different full batches on each process.
|
||||
|
||||
On two processes with an iterable dataset yielding of :obj:`[0, 1, 2, 3, 4, 5, 6, 7]`, this will result in:
|
||||
On two processes with an iterable dataset yielding `[0, 1, 2, 3, 4, 5, 6, 7]`, this will result in:
|
||||
|
||||
- the shard on process 0 to yield :obj:`[0, 1, 2, 3]` and the shard on process 1 to yield :obj:`[4, 5, 6,
|
||||
7]` if this argument is set to :obj:`False`.
|
||||
- the shard on process 0 to yield :obj:`[0, 1, 4, 5]` and the sampler on process 1 to yield :obj:`[2, 3, 6,
|
||||
7]` if this argument is set to :obj:`True`.
|
||||
- the shard on process 0 to yield `[0, 1, 2, 3]` and the shard on process 1 to yield `[4, 5, 6, 7]` if this
|
||||
argument is set to `False`.
|
||||
- the shard on process 0 to yield `[0, 1, 4, 5]` and the sampler on process 1 to yield `[2, 3, 6, 7]` if
|
||||
this argument is set to `True`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@ -216,7 +228,7 @@ class IterableDatasetShard(IterableDataset):
|
||||
process_index: int = 0,
|
||||
split_batches: bool = False,
|
||||
):
|
||||
if split_batches and batch_size % num_processes != 0:
|
||||
if split_batches and batch_size > 1 and batch_size % num_processes != 0:
|
||||
raise ValueError(
|
||||
f"To use `IterableDatasetShard` in `split_batches` mode, the batch size ({batch_size}) "
|
||||
f"needs to be a round multiple of the number of processes ({num_processes})."
|
||||
@ -257,25 +269,25 @@ class IterableDatasetShard(IterableDataset):
|
||||
|
||||
class DataLoaderShard(DataLoader):
|
||||
"""
|
||||
Subclass of a PyTorch :obj:`DataLoader` that will deal with device placement and current distributed setup.
|
||||
Subclass of a PyTorch `DataLoader` that will deal with device placement and current distributed setup.
|
||||
|
||||
Args:
|
||||
dataset (:obj:`torch.utils.data.dataset.Dataset`):
|
||||
dataset (`torch.utils.data.dataset.Dataset`):
|
||||
The dataset to use to build this dataloader.
|
||||
device (:obj:`torch.device`, `optional`):
|
||||
device (`torch.device`, *optional*):
|
||||
If passed, the device to put all batches on.
|
||||
rng_types (list of :obj:`str` or :class:`~accelerate.utils.RNGType`):
|
||||
rng_types (list of `str` or [`~utils.RNGType`]):
|
||||
The list of random number generators to synchronize at the beginning of each iteration. Should be one or
|
||||
several of:
|
||||
|
||||
- :obj:`"torch"`: the base torch random number generator
|
||||
- :obj:`"cuda"`: the CUDA random number generator (GPU only)
|
||||
- :obj:`"xla"`: the XLA random number generator (TPU only)
|
||||
- :obj:`"generator"`: an optional :obj:`torch.Generator`
|
||||
generator (:obj:`torch.Generator`, `optional`):
|
||||
- `"torch"`: the base torch random number generator
|
||||
- `"cuda"`: the CUDA random number generator (GPU only)
|
||||
- `"xla"`: the XLA random number generator (TPU only)
|
||||
- `"generator"`: an optional `torch.Generator`
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator to keep synchronized across processes.
|
||||
kwargs:
|
||||
All other keyword arguments to pass to the regular :obj:`DataLoader` initialization.
|
||||
All other keyword arguments to pass to the regular `DataLoader` initialization.
|
||||
"""
|
||||
|
||||
def __init__(self, dataset, device=None, rng_types=None, generator=None, **kwargs):
|
||||
@ -294,6 +306,114 @@ class DataLoaderShard(DataLoader):
|
||||
yield batch if self.device is None else send_to_device(batch, self.device)
|
||||
|
||||
|
||||
class DataLoaderDispatcher(DataLoader):
|
||||
"""
|
||||
Subclass of a PyTorch `DataLoader` that will iterate and preprocess on process 0 only, then dispatch on each
|
||||
process their part of the batch.
|
||||
|
||||
Args:
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
|
||||
yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of
|
||||
`num_processes` batches at each iteration).
|
||||
|
||||
Another way to see this is that the observed batch size will be the same as the initial `dataloader` if
|
||||
this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes`
|
||||
otherwise.
|
||||
|
||||
Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of
|
||||
`batch_size`.
|
||||
"""
|
||||
|
||||
def __init__(self, dataset, split_batches: bool = False, **kwargs):
|
||||
super().__init__(dataset, **kwargs)
|
||||
self.split_batches = split_batches
|
||||
if version.parse(torch.__version__) < version.parse("1.8.0"):
|
||||
raise ImportError(
f"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
)
|
||||
|
||||
def __iter__(self):
|
||||
state = AcceleratorState()
|
||||
if state.process_index == 0:
|
||||
# We only iterate through the DataLoader on process 0.
|
||||
main_iterator = super().__iter__()
|
||||
stop_iteration = False
|
||||
first_batch = None
|
||||
while not stop_iteration:
|
||||
# On process 0, we gather the batch to dispatch.
|
||||
if state.process_index == 0:
|
||||
try:
|
||||
if self.split_batches:
|
||||
# One batch of the main iterator is dispatched and split.
|
||||
batch = next(main_iterator)
|
||||
else:
|
||||
# num_processes batches of the main iterator are concatenated then dispatched and split.
|
||||
# We add the batches one by one so we have the remainder available when drop_last=False.
|
||||
batches = []
|
||||
for _ in range(state.num_processes):
|
||||
batches.append(next(main_iterator))
|
||||
batch = concatenate(batches, dim=0)
|
||||
# In both cases, we need to get the structure of the batch that we will broadcast on other
|
||||
# processes to initialize the tensors with the right shape.
|
||||
# data_structure, stop_iteration
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
except StopIteration:
|
||||
batch_info = [None, True]
|
||||
else:
|
||||
batch_info = [None, stop_iteration]
|
||||
|
||||
# This is inplace, so after this instruction, every process has the same `batch_info` as process 0.
|
||||
broadcast_object_list(batch_info)
|
||||
stop_iteration = batch_info[1]
|
||||
if stop_iteration:
|
||||
# If drop_last is False and split_batches is False, we may have a remainder to take care of.
|
||||
if not self.split_batches and not self.drop_last:
|
||||
if state.process_index == 0 and len(batches) > 0:
|
||||
batch = concatenate(batches, dim=0)
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
else:
|
||||
batch_info = [None, True]
|
||||
broadcast_object_list(batch_info)
|
||||
if batch_info[1]:
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
if state.process_index != 0:
|
||||
# Initialize tensors on other processes than process 0.
|
||||
batch = initialize_tensors(batch_info[0])
|
||||
batch = send_to_device(batch, state.device)
|
||||
# Broadcast the batch before splitting it.
|
||||
batch = broadcast(batch, from_process=0)
|
||||
|
||||
if not self.drop_last and first_batch is None:
|
||||
# We keep at least num processes elements of the first batch to be able to complete the last batch
|
||||
first_batch = slice_tensors(batch, slice(0, state.num_processes))
|
||||
|
||||
observed_batch_size = find_batch_size(batch)
|
||||
batch_size = observed_batch_size // state.num_processes
|
||||
|
||||
if not self.drop_last and stop_iteration and observed_batch_size % state.num_processes != 0:
|
||||
# If the last batch is not complete, let's add the first batch to it.
|
||||
batch = concatenate([batch, first_batch], dim=0)
|
||||
batch_size += 1
|
||||
|
||||
data_slice = slice(state.process_index * batch_size, (state.process_index + 1) * batch_size)
|
||||
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
xm.mark_step()
|
||||
yield slice_tensors(batch, data_slice)
|
||||
|
||||
def __len__(self):
|
||||
state = AcceleratorState()
|
||||
whole_length = super().__len__()
|
||||
if self.drop_last:
|
||||
return whole_length // state.num_processes
|
||||
else:
|
||||
return math.ceil(whole_length / state.num_processes)
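A hedged usage sketch of the dispatching behavior via `prepare_data_loader` (defined just below); the `raw_dataloader` built over an `IterableDataset` and the `accelerator` are assumed:
dl = prepare_data_loader(
    raw_dataloader,
    device=accelerator.device,
    put_on_device=True,       # required when dispatch_batches=True
    dispatch_batches=True,    # iterate on process 0 only, broadcast each slice to the other processes
)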
|
||||
|
||||
|
||||
def prepare_data_loader(
|
||||
dataloader: DataLoader,
|
||||
device: Optional[torch.device] = None,
|
||||
@ -302,55 +422,69 @@ def prepare_data_loader(
|
||||
split_batches: bool = False,
|
||||
put_on_device: bool = False,
|
||||
rng_types: Optional[List[Union[str, RNGType]]] = None,
|
||||
dispatch_batches: Optional[bool] = None,
|
||||
) -> DataLoader:
|
||||
"""
|
||||
Wraps a PyTorch :obj:`DataLoader` to generate batches for one of the processes only.
|
||||
Wraps a PyTorch `DataLoader` to generate batches for one of the processes only.
|
||||
|
||||
Depending on the value of the :obj:`drop_last` attribute of the :obj:`dataloader` passed, it will either stop the
|
||||
iteration at the first batch that would be too small / not present on all processes or loop with indices from the
|
||||
beginning.
|
||||
Depending on the value of the `drop_last` attribute of the `dataloader` passed, it will either stop the iteration
|
||||
at the first batch that would be too small / not present on all processes or loop with indices from the beginning.
|
||||
|
||||
Args:
|
||||
dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):
|
||||
dataloader (`torch.utils.data.dataloader.DataLoader`):
|
||||
The data loader to split across several devices.
|
||||
device (:obj:`torch.device`):
|
||||
The target device for the returned :obj:`DataLoader`.
|
||||
num_processes (:obj:`int`, `optional`):
|
||||
device (`torch.device`):
|
||||
The target device for the returned `DataLoader`.
|
||||
num_processes (`int`, *optional*):
|
||||
The number of processes running concurrently. Will default to the value given by
|
||||
:class:`~accelerate.AcceleratorState`.
|
||||
process_index (:obj:`int`, `optional`):
|
||||
The index of the current process. Will default to the value given by :class:`~accelerate.AcceleratorState`.
|
||||
split_batches (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether the resulting :obj:`DataLoader` should split the batches of the original data loader across devices
|
||||
or yield full batches (in which case it will yield batches starting at the :obj:`process_index`-th and
|
||||
advancing of :obj:`num_processes` batches at each iteration).
|
||||
[`~state.AcceleratorState`].
|
||||
process_index (`int`, *optional*):
|
||||
The index of the current process. Will default to the value given by [`~state.AcceleratorState`].
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
|
||||
yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of
|
||||
`num_processes` batches at each iteration).
|
||||
|
||||
Another way to see this is that the observed batch size will be the same as the initial :obj:`dataloader`
|
||||
if this option is set to :obj:`True`, the batch size of the initial :obj:`dataloader` multiplied by
|
||||
:obj:`num_processes` otherwise.
|
||||
Another way to see this is that the observed batch size will be the same as the initial `dataloader` if
|
||||
this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes`
|
||||
otherwise.
|
||||
|
||||
Setting this option to :obj:`True` requires that the batch size of the :obj:`dataloader` is a round
|
||||
multiple of :obj:`batch_size`.
|
||||
put_on_device (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to put the batches on :obj:`device` (only works if the batches are nested list, tuples or
|
||||
Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of
|
||||
`batch_size`.
|
||||
put_on_device (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to put the batches on `device` (only works if the batches are nested list, tuples or
|
||||
dictionaries of tensors).
|
||||
rng_types (list of :obj:`str` or :class:`~accelerate.utils.RNGType`):
|
||||
rng_types (list of `str` or [`~utils.RNGType`]):
|
||||
The list of random number generators to synchronize at the beginning of each iteration. Should be one or
|
||||
several of:
|
||||
|
||||
- :obj:`"torch"`: the base torch random number generator
|
||||
- :obj:`"cuda"`: the CUDA random number generator (GPU only)
|
||||
- :obj:`"xla"`: the XLA random number generator (TPU only)
|
||||
- :obj:`"generator"`: the :obj:`torch.Generator` of the sampler (or batch sampler if there is no sampler in
|
||||
your dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.
|
||||
- `"torch"`: the base torch random number generator
|
||||
- `"cuda"`: the CUDA random number generator (GPU only)
|
||||
- `"xla"`: the XLA random number generator (TPU only)
|
||||
- `"generator"`: the `torch.Generator` of the sampler (or batch sampler if there is no sampler in your
|
||||
dataloader) or of the iterable dataset (if it exists) if the underlying dataset is of that type.
|
||||
|
||||
dispatch_batches (`bool`, *optional*):
|
||||
If set to `True`, the dataloader prepared is only iterated through on the main process and then the batches
|
||||
are split and broadcast to each process. Will default to `True` when the underlying dataset is an
|
||||
`IterableDataset`, `False` otherwise.
|
||||
|
||||
Returns:
|
||||
:obj:`torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
|
||||
`torch.utils.data.dataloader.DataLoader`: A new data loader that will yield the portion of the batches
|
||||
|
||||
.. warning::
|
||||
<Tip warning={true}>
|
||||
|
||||
This does not support :obj:`BatchSampler` with varying batch size yet.
|
||||
"""
|
||||
This does not support `BatchSampler` with varying batch size yet.
|
||||
|
||||
</Tip>"""
|
||||
if dispatch_batches is None:
|
||||
if version.parse(torch.__version__) < version.parse("1.8.0") or not put_on_device:
|
||||
dispatch_batches = False
|
||||
else:
|
||||
dispatch_batches = isinstance(dataloader.dataset, IterableDataset)
|
||||
|
||||
if dispatch_batches and not put_on_device:
|
||||
raise ValueError("Using `dispatch_batches=True` requires `put_on_device=True`.")
|
||||
# Grab defaults from AcceleratorState
|
||||
state = AcceleratorState()
|
||||
if num_processes is None:
|
||||
@ -359,10 +493,10 @@ def prepare_data_loader(
|
||||
process_index = state.process_index
|
||||
|
||||
# Sanity check
|
||||
if split_batches and dataloader.batch_size % num_processes != 0:
|
||||
if split_batches and dataloader.batch_size > 1 and dataloader.batch_size % num_processes != 0:
|
||||
raise ValueError(
|
||||
f"Using `split_batches=True` requires that the batch size ({dataloader.batch_size}) "
|
||||
f"to be a round multiple of the number of processes ({num_processes})."
|
||||
f"To use a `DataLoader` in `split_batches` mode, the batch size ({dataloader.batch_size}) "
|
||||
f"needs to be a round multiple of the number of processes ({num_processes})."
|
||||
)
|
||||
|
||||
new_dataset = dataloader.dataset
|
||||
@ -370,7 +504,7 @@ def prepare_data_loader(
|
||||
new_batch_sampler = dataloader.batch_sampler if not isinstance(new_dataset, IterableDataset) else None
|
||||
generator = getattr(dataloader, "generator", None)
|
||||
# No change if no multiprocess
|
||||
if num_processes != 1:
|
||||
if num_processes != 1 and not dispatch_batches:
|
||||
if isinstance(new_dataset, IterableDataset):
|
||||
if getattr(dataloader.dataset, "generator", None) is not None:
|
||||
generator = dataloader.dataset.generator
|
||||
@ -419,8 +553,14 @@ def prepare_data_loader(
|
||||
|
||||
# Need to provide batch_size as batch_sampler is None for Iterable dataset
|
||||
if new_batch_sampler is None:
|
||||
kwargs["drop_last"] = dataloader.drop_last
|
||||
kwargs["batch_size"] = dataloader.batch_size // num_processes if split_batches else dataloader.batch_size
|
||||
|
||||
if dispatch_batches:
|
||||
return DataLoaderDispatcher(
|
||||
new_dataset, split_batches=split_batches, batch_sampler=new_batch_sampler, **kwargs
|
||||
)
|
||||
|
||||
return DataLoaderShard(
|
||||
new_dataset,
|
||||
device=device if put_on_device else None,
|
||||
|
||||
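For context on the reworked signature above, here is a minimal, hedged usage sketch of `prepare_data_loader`; the dataset, batch size, and the direct call (rather than going through `Accelerator.prepare`) are illustrative assumptions:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from accelerate.data_loader import prepare_data_loader

accelerator = Accelerator()  # initializes the AcceleratorState used for the defaults

dataset = TensorDataset(torch.arange(64).float())
dataloader = DataLoader(dataset, batch_size=8)

# Shard the dataloader across processes and move each batch to the accelerator device.
# With a map-style dataset, `dispatch_batches` defaults to False, so every process
# iterates over its own subset of indices.
sharded = prepare_data_loader(
    dataloader,
    device=accelerator.device,
    put_on_device=True,
    split_batches=False,
)

for (batch,) in sharded:
    print(accelerator.process_index, batch.shape)
```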
@ -71,7 +71,7 @@ class DeepSpeedOptimizerWrapper(AcceleratedOptimizer):
|
||||
Internal wrapper around a deepspeed optimizer.
|
||||
|
||||
Args:
|
||||
optimizer (:obj:`torch.optim.optimizer.Optimizer`):
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
"""
|
||||
|
||||
|
||||
@ -14,11 +14,13 @@
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass
|
||||
from datetime import timedelta
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class KwargsHandler:
|
||||
"""
|
||||
Internal mixin that implements a :obj:`to_kwargs()` method for a dataclass.
|
||||
Internal mixin that implements a `to_kwargs()` method for a dataclass.
|
||||
"""
|
||||
|
||||
def to_dict(self):
|
||||
@ -36,15 +38,16 @@ class KwargsHandler:
|
||||
@dataclass
|
||||
class DistributedDataParallelKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your :class:`~accelerate.Accelerator` to customize how your model is wrapped in a
|
||||
:obj:`torch.nn.parallel.DistributedDataParallel`. Please refer to the documentation of this `wrapper
|
||||
<https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__ for more information
|
||||
on each argument.
|
||||
Use this object in your [`Accelerator`] to customize how your model is wrapped in a
|
||||
`torch.nn.parallel.DistributedDataParallel`. Please refer to the documentation of this
|
||||
[wrapper](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) for more
|
||||
information on each argument.
|
||||
|
||||
.. warning::
|
||||
<Tip warning={true}>
|
||||
|
||||
:obj:`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
|
||||
"""
|
||||
`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
dim: int = 0
|
||||
broadcast_buffers: bool = True
|
||||
@ -57,17 +60,31 @@ class DistributedDataParallelKwargs(KwargsHandler):
|
||||
@dataclass
|
||||
class GradScalerKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your :class:`~accelerate.Accelerator` to customize the behavior of mixed precision, specifically
|
||||
how the :obj:`torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this `scaler
|
||||
<https://pytorch.org/docs/stable/amp.html?highlight=gradscaler>`__ for more information on each argument.
|
||||
Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
|
||||
`torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
|
||||
[scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
|
||||
|
||||
.. warning::
|
||||
<Tip warning={true}>
|
||||
|
||||
:obj:`GradScaler` is only available in PyTorch 1.5.0 and later versions.
|
||||
"""
|
||||
`GradScaler` is only available in PyTorch 1.5.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
init_scale: float = 65536.0
|
||||
growth_factor: float = 2.0
|
||||
backoff_factor: float = 0.5
|
||||
growth_interval: int = 2000
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class InitProcessGroupKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize the initialization of the distributed processes. Please refer
|
||||
to the documentation of this
|
||||
[method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
|
||||
information on each argument.
|
||||
"""
|
||||
|
||||
init_method: Optional[str] = None
|
||||
timeout: timedelta = timedelta(seconds=1800)
|
||||
|
||||
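To show how these handlers are consumed, here is a hedged sketch of passing them to `Accelerator` via `kwargs_handlers`; the argument values are arbitrary, and each dataclass only overrides the fields you set on it:

```python
from datetime import timedelta

from accelerate import Accelerator, DistributedDataParallelKwargs, GradScalerKwargs
from accelerate.kwargs_handlers import InitProcessGroupKwargs  # import path assumed

# Each handler mirrors the arguments of the PyTorch object it configures.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
scaler_kwargs = GradScalerKwargs(init_scale=2.0**14, growth_interval=1000)
pg_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=600))

accelerator = Accelerator(kwargs_handlers=[ddp_kwargs, scaler_kwargs, pg_kwargs])
```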
@ -14,32 +14,34 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState
|
||||
from .utils import PrepareForLaunch
|
||||
from .utils import PrecisionType, PrepareForLaunch, patch_environment
|
||||
|
||||
|
||||
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, use_port="29500"):
|
||||
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mixed_precision="no", use_port="29500"):
|
||||
"""
|
||||
Launches a training function, using several processes if it's possible in the current environment (TPU with
|
||||
multiple cores for instance).
|
||||
|
||||
Args:
|
||||
function (:obj:`Callable`):
|
||||
function (`Callable`):
|
||||
The training function to execute. If it accepts arguments, the first argument should be the index of the
|
||||
process run.
|
||||
args (:obj:`Tuple`):
|
||||
Tuple of arguments to pass to the function (it will receive :obj:`*args`).
|
||||
num_processes (:obj:`int`, `optional`):
|
||||
args (`Tuple`):
|
||||
Tuple of arguments to pass to the function (it will receive `*args`).
|
||||
num_processes (`int`, *optional*):
|
||||
The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
|
||||
the number of GPUs available otherwise.
|
||||
use_fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If :obj:`True`, will use mixed precision training on multi-GPU.
|
||||
use_port (:obj:`str`, `optional`, defaults to :obj:`"29500"`):
|
||||
mixed_precision (`str`, *optional*, defaults to `"no"`):
|
||||
If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
|
||||
use_port (`str`, *optional*, defaults to `"29500"`):
|
||||
The port to use to communicate between processes when launching a multi-GPU training.
|
||||
"""
|
||||
# Are we in a google colab or a Kaggle Kernel?
|
||||
@ -105,22 +107,26 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, use
|
||||
"function."
|
||||
)
|
||||
|
||||
try:
|
||||
mixed_precision = PrecisionType(mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if use_fp16:
|
||||
warnings.warn('use_fp16=True is deprecated. Use mixed_precision="fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
# torch.distributed will expect a few environment variables to be set here. We set the ones common to each
|
||||
# process here (the other ones will be set by the launcher).
|
||||
os.environ["WORLD_SIZE"] = str(num_processes)
|
||||
os.environ["MASTER_ADDR"] = "127.0.0.1"
|
||||
os.environ["MASTER_PORT"] = str(use_port)
|
||||
os.environ["USE_FP16"] = str(use_fp16)
|
||||
with patch_environment(
|
||||
world_size=num_processes, master_addr="127.0.01", master_port=use_port, mixed_precision=mixed_precision
|
||||
):
|
||||
launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
|
||||
|
||||
launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
|
||||
try:
|
||||
print(f"Launching a training on {num_processes} GPUs.")
|
||||
print(f"Launching training on {num_processes} GPUs.")
|
||||
start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
|
||||
finally:
|
||||
# Clean up the environment variables set.
|
||||
del os.environ["WORLD_SIZE"]
|
||||
del os.environ["MASTER_ADDR"]
|
||||
del os.environ["MASTER_PORT"]
|
||||
|
||||
else:
|
||||
# No need for a distributed launch otherwise as it's either CPU or one GPU.
|
||||
@ -129,3 +135,45 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, use
|
||||
else:
|
||||
print("Launching training on CPU.")
|
||||
function(*args)
|
||||
|
||||
|
||||
def debug_launcher(function, args=(), num_processes=2):
|
||||
"""
|
||||
Launches a training function using several processes on CPU for debugging purposes.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
This function is provided for internal testing and debugging, but it's not intended for real trainings. It will
|
||||
only use the CPU.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
function (`Callable`):
|
||||
The training function to execute.
|
||||
args (`Tuple`):
|
||||
Tuple of arguments to pass to the function (it will receive `*args`).
|
||||
num_processes (`int`, *optional*, defaults to 2):
|
||||
The number of processes to use for training.
|
||||
"""
|
||||
if version.parse(torch.__version__) < version.parse("1.5.0"):
|
||||
raise ImportError(
|
||||
"Using `debug_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
|
||||
f"{torch.__version__}."
|
||||
)
|
||||
|
||||
from torch.multiprocessing import start_processes
|
||||
|
||||
with tempfile.NamedTemporaryFile() as tmp_file:
|
||||
# torch.distributed will expect a few environment variables to be set here. We set the ones common to each
|
||||
# process here (the other ones will be set by the launcher).
|
||||
with patch_environment(
|
||||
world_size=num_processes,
|
||||
master_addr="127.0.01",
|
||||
master_port="29500",
|
||||
mixed_precision="no",
|
||||
accelerate_debug_rdv_file=tmp_file.name,
|
||||
use_cpu="yes",
|
||||
):
|
||||
launcher = PrepareForLaunch(function, debug=True)
|
||||
start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
|
||||
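A hedged sketch of driving the launchers above from a notebook; the training function, its config argument, and the process count are placeholders:

```python
from accelerate import Accelerator, notebook_launcher


def training_function(config):
    # Each spawned process builds its own Accelerator inside the launched function.
    accelerator = Accelerator(mixed_precision=config["mixed_precision"])
    accelerator.print(f"Running on {accelerator.num_processes} process(es)")


config = {"mixed_precision": "fp16"}

# Spawns `training_function` on every requested GPU (or on 8 TPU cores in Colab/Kaggle);
# the deprecated `use_fp16=True` flag would map to mixed_precision="fp16" as well.
notebook_launcher(training_function, args=(config,), num_processes=2, mixed_precision="fp16")
```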
src/accelerate/memory_utils.py (new file, 88 lines)
@@ -0,0 +1,88 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
A collection of utilities for ensuring that training can always occur. Heavily influenced by the
|
||||
[toma](https://github.com/BlackHC/toma) library.
|
||||
"""
|
||||
|
||||
import functools
|
||||
import gc
|
||||
import inspect
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def should_reduce_batch_size(exception: Exception) -> bool:
|
||||
"""
|
||||
Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory
|
||||
|
||||
Args:
|
||||
exception (`Exception`):
|
||||
An exception
|
||||
"""
|
||||
_statements = [
|
||||
"CUDA out of memory.", # CUDA OOM
|
||||
"cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.", # CUDNN SNAFU
|
||||
"DefaultCPUAllocator: can't allocate memory", # CPU OOM
|
||||
]
|
||||
if isinstance(exception, RuntimeError) and len(exception.args) == 1:
|
||||
return any(err in exception.args[0] for err in _statements)
|
||||
return False
|
||||
|
||||
|
||||
def find_executable_batch_size(function: callable = None, starting_batch_size: int = 128):
|
||||
"""
|
||||
A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
|
||||
CUDNN, the batch size is cut in half and passed to `function`
|
||||
|
||||
`function` must take in a `batch_size` parameter as its first argument.
|
||||
|
||||
Args:
|
||||
function (`callable`, *optional*):
|
||||
A function to wrap
|
||||
starting_batch_size (`int`, *optional*):
|
||||
The batch size to try and fit into memory
|
||||
"""
|
||||
if function is None:
|
||||
return functools.partial(find_executable_batch_size, starting_batch_size=starting_batch_size)
|
||||
|
||||
batch_size = starting_batch_size
|
||||
|
||||
def decorator(*args, **kwargs):
|
||||
nonlocal batch_size
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
params = list(inspect.signature(function).parameters.keys())
|
||||
# Guard against user error
|
||||
if len(params) < (len(args) + 1):
|
||||
arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
|
||||
raise TypeError(
|
||||
f"Batch size was passed into `{function.__name__}` as the first argument when called."
|
||||
f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
|
||||
)
|
||||
while True:
|
||||
if batch_size == 0:
|
||||
raise RuntimeError("No executable batch size found, reached zero.")
|
||||
try:
|
||||
return function(batch_size, *args, **kwargs)
|
||||
except Exception as e:
|
||||
if should_reduce_batch_size(e):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
batch_size //= 2
|
||||
else:
|
||||
raise
|
||||
|
||||
return decorator
|
||||
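A short, hedged sketch of applying `find_executable_batch_size`; the model and the single forward pass stand in for a real training loop:

```python
import torch

from accelerate import Accelerator
from accelerate.memory_utils import find_executable_batch_size

accelerator = Accelerator()


@find_executable_batch_size(starting_batch_size=128)
def training_function(batch_size):
    # `batch_size` is injected by the decorator and halved after every OOM-related
    # failure, so it must not be passed explicitly when calling the function.
    model = torch.nn.Linear(10, 2).to(accelerator.device)
    batch = torch.randn(batch_size, 10, device=accelerator.device)
    return model(batch).sum()


training_function()  # retries with 128, 64, 32, ... until the forward pass fits
```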
@ -12,6 +12,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from packaging import version
|
||||
@ -39,31 +42,39 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
Internal wrapper around a torch optimizer.
|
||||
|
||||
Args:
|
||||
optimizer (:obj:`torch.optim.optimizer.Optimizer`):
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
device_placement (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
device_placement (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the optimizer should handle device placement. If so, it will place the state dictionary of
|
||||
:obj:`optimizer` on the right device.
|
||||
scaler (:obj:`torch.cuda.amp.grad_scaler.GradScaler`, `optional`):
|
||||
`optimizer` on the right device.
|
||||
scaler (`torch.cuda.amp.grad_scaler.GradScaler`, *optional*):
|
||||
The scaler to use in the step function if training with mixed precision.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, device_placement=True, scaler=None):
|
||||
self.optimizer = optimizer
|
||||
self.scaler = scaler
|
||||
self.state = AcceleratorState()
|
||||
self.accelerator_state = AcceleratorState()
|
||||
self.device_placement = device_placement
|
||||
self._is_overflow = False
|
||||
|
||||
# Handle device placement
|
||||
if device_placement:
|
||||
state_dict = self.optimizer.state_dict()
|
||||
if self.state.distributed_type == DistributedType.TPU:
|
||||
xm.send_cpu_data_to_device(state_dict, self.state.device)
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
|
||||
else:
|
||||
state_dict = move_to_device(state_dict, self.state.device)
|
||||
state_dict = move_to_device(state_dict, self.accelerator_state.device)
|
||||
self.optimizer.load_state_dict(state_dict)
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
return self.optimizer.state
|
||||
|
||||
@state.setter
|
||||
def state(self, state):
|
||||
self.optimizer.state = state
|
||||
|
||||
@property
|
||||
def param_groups(self):
|
||||
return self.optimizer.param_groups
|
||||
@ -84,8 +95,8 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
self.optimizer.add_param_group(param_group)
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
if self.state.distributed_type == DistributedType.TPU and self.device_placement:
|
||||
xm.send_cpu_data_to_device(state_dict, self.state.device)
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU and self.device_placement:
|
||||
xm.send_cpu_data_to_device(state_dict, self.accelerator_state.device)
|
||||
self.optimizer.load_state_dict(state_dict)
|
||||
|
||||
def state_dict(self):
|
||||
@ -100,12 +111,18 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
)
|
||||
self.optimizer.zero_grad()
|
||||
else:
|
||||
if set_to_none is None:
|
||||
set_to_none = False
|
||||
self.optimizer.zero_grad(set_to_none=set_to_none)
|
||||
accept_arg = "set_to_none" in inspect.signature(self.optimizer.zero_grad).parameters
|
||||
if accept_arg:
|
||||
if set_to_none is None:
|
||||
set_to_none = False
|
||||
self.optimizer.zero_grad(set_to_none=set_to_none)
|
||||
else:
|
||||
if set_to_none is not None:
|
||||
raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
def step(self, closure=None):
|
||||
if self.state.distributed_type == DistributedType.TPU:
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
optimizer_args = {"closure": closure} if closure is not None else {}
|
||||
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args)
|
||||
elif self.scaler is not None:
|
||||
@ -125,4 +142,14 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
@property
|
||||
def is_overflow(self):
|
||||
"""Whether or not the optimizer step was done, or skipped because of gradient overflow."""
|
||||
warnings.warn(
|
||||
"The `is_overflow` property is deprecated and will be removed in version 1.0 of Accelerate use "
|
||||
"`optimizer.step_was_skipped` instead.",
|
||||
FutureWarning,
|
||||
)
|
||||
return self._is_overflow
|
||||
|
||||
@property
|
||||
def step_was_skipped(self):
|
||||
"""Whether or not the optimizer step was skipped."""
|
||||
return self._is_overflow
|
||||
|
||||
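To make the new `step_was_skipped` property concrete, a hedged sketch of checking it after a mixed-precision step; the model, data, and learning rate are illustrative:

```python
import torch

from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="fp16")
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = accelerator.prepare(model, optimizer)

x = torch.randn(8, 4, device=accelerator.device)
loss = model(x).sum()
accelerator.backward(loss)
optimizer.step()

# In fp16 the GradScaler may skip a step on gradient overflow; the wrapper exposes
# this through `step_was_skipped`, while the old `is_overflow` now only warns.
if optimizer.step_was_skipped:
    print("Gradient overflow: the optimizer step was skipped this iteration.")
```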
src/accelerate/scheduler.py (new file, 80 lines)
@@ -0,0 +1,80 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .state import AcceleratorState
|
||||
|
||||
|
||||
class AcceleratedScheduler:
|
||||
"""
|
||||
A wrapper around a learning rate scheduler that will only step when the optimizer(s) have a training step. Useful
|
||||
to avoid making a scheduler step too fast when:
|
||||
|
||||
- gradients overflowed and there was no training step (in mixed precision training)
|
||||
- step was skipped because of gradient accumulation
|
||||
|
||||
Args:
|
||||
scheduler (`torch.optim.lr_scheduler._LRScheduler`):
|
||||
The scheduler to wrap.
|
||||
optimizers (one or a list of `torch.optim.Optimizer`):
|
||||
The optimizers used.
|
||||
step_with_optimizer (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the scheduler should be stepped at each optimizer step.
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the dataloaders split one batch across the different processes (so batch size is the same
|
||||
regardless of the number of processes) or create batches on each process (so batch size is the original
|
||||
batch size multiplied by the number of processes).
|
||||
"""
|
||||
|
||||
def __init__(self, scheduler, optimizers, step_with_optimizer: bool = True, split_batches: bool = False):
|
||||
self.scheduler = scheduler
|
||||
self.optimizers = optimizers if isinstance(optimizers, (list, tuple)) else [optimizers]
|
||||
self.split_batches = split_batches
|
||||
self.step_with_optimizer = step_with_optimizer
|
||||
|
||||
def step(self, *args, **kwargs):
|
||||
if not self.step_with_optimizer:
|
||||
# No link between scheduler and optimizer -> just step
|
||||
self.scheduler.step(*args, **kwargs)
|
||||
return
|
||||
|
||||
# Otherwise, first make sure the optimizer was stepped.
|
||||
for opt in self.optimizers:
|
||||
if opt.step_was_skipped:
|
||||
return
|
||||
|
||||
if self.split_batches:
|
||||
# Split batches -> the training dataloader batch size is not changed so one step per training step
|
||||
self.scheduler.step(*args, **kwargs)
|
||||
else:
|
||||
# Otherwise the training dataloader batch size was multiplied by `num_processes`, so we need to do
|
||||
# num_processes steps per training step
|
||||
num_processes = AcceleratorState().num_processes
|
||||
for _ in range(num_processes):
|
||||
self.scheduler.step(*args, **kwargs)
|
||||
|
||||
# Passthroughs
|
||||
def get_last_lr(self):
|
||||
return self.scheduler.get_last_lr()
|
||||
|
||||
def state_dict(self):
|
||||
return self.scheduler.state_dict()
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
self.scheduler.load_state_dict(state_dict)
|
||||
|
||||
def get_lr(self):
|
||||
return self.scheduler.get_lr()
|
||||
|
||||
def print_lr(self, *args, **kwargs):
|
||||
return self.scheduler.print_lr(*args, **kwargs)
|
||||
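A hedged sketch of the stepping rule described above: when `split_batches` is `False`, the wrapper advances the underlying scheduler once per process for every optimizer step that actually ran. The model and scheduler choice are arbitrary, and the scheduler is assumed to be wrapped by `accelerator.prepare`:

```python
import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

# Passing the scheduler through `prepare` is assumed to wrap it in AcceleratedScheduler.
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)

loss = model(torch.randn(8, 4, device=accelerator.device)).sum()
accelerator.backward(loss)
optimizer.step()
# No-op if the optimizer step was skipped; otherwise steps the wrapped scheduler
# `num_processes` times because split_batches defaults to False.
scheduler.step()
print(scheduler.get_last_lr())
```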
@ -66,6 +66,11 @@ def parse_flag_from_env(key, default=False):
|
||||
return strtobool(value) == 1 # As its name indicates `strtobool` actually returns an int...
|
||||
|
||||
|
||||
def parse_choice_from_env(key, default="no"):
|
||||
value = os.environ.get(key, str(default))
|
||||
return value
|
||||
|
||||
|
||||
class DistributedType(str, Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
@ -84,6 +89,7 @@ class DistributedType(str, Enum):
|
||||
MULTI_CPU = "MULTI_CPU"
|
||||
MULTI_GPU = "MULTI_GPU"
|
||||
DEEPSPEED = "DEEPSPEED"
|
||||
FSDP = "FSDP"
|
||||
TPU = "TPU"
|
||||
|
||||
|
||||
@ -122,27 +128,39 @@ class ComputeEnvironment(str, Enum):
|
||||
# Inspired by Alex Martelli's 'Borg'.
|
||||
class AcceleratorState:
|
||||
"""
|
||||
This is a variation of a `singleton class <https://en.wikipedia.org/wiki/Singleton_pattern>`__ in the sense that
|
||||
all instance of :obj:`AcceleratorState` share the same state, which is initialized on the first instantiation.
|
||||
This is a variation of a [singleton class](https://en.wikipedia.org/wiki/Singleton_pattern) in the sense that all
|
||||
instances of `AcceleratorState` share the same state, which is initialized on the first instantiation.
|
||||
|
||||
Attributes
|
||||
Attributes:
|
||||
|
||||
- **device** (:obj:`torch.device`) -- The device to use.
|
||||
- **distributed_type** (:obj:`~accelerate.state.DistributedType`) -- The type of distributed environment
|
||||
currently in use.
|
||||
- **num_processes** (:obj:`int`) -- The number of processes currently launched in parallel.
|
||||
- **process_index** (:obj:`int`) -- The index of the current process.
|
||||
- **local_process_index** (:obj:`int`) -- The index of the current process on the current server.
|
||||
- **use_fp16** (:obj:`bool`) -- Whether or not the current script will use mixed precision.
|
||||
- **device** (`torch.device`) -- The device to use.
|
||||
- **distributed_type** (`~accelerate.state.DistributedType`) -- The type of distributed environment currently
|
||||
in use.
|
||||
- **num_processes** (`int`) -- The number of processes currently launched in parallel.
|
||||
- **process_index** (`int`) -- The index of the current process.
|
||||
- **local_process_index** (`int`) -- The index of the current process on the current server.
|
||||
- **mixed_precision** (`str`) -- Whether or not the current script will use mixed precision. If you are using
|
||||
mixed precision, define if you want to use FP16 or BF16 (bfloat16) as the floating point.
|
||||
"""
|
||||
|
||||
_shared_state = {}
|
||||
|
||||
def __init__(self, fp16: bool = None, cpu: bool = False, deepspeed_plugin=None, _from_accelerator: bool = False):
|
||||
def __init__(
|
||||
self,
|
||||
mixed_precision: str = None,
|
||||
cpu: bool = False,
|
||||
deepspeed_plugin=None,
|
||||
fsdp_plugin=None,
|
||||
_from_accelerator: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.__dict__ = self._shared_state
|
||||
if parse_flag_from_env("USE_CPU"):
|
||||
cpu = True
|
||||
if not getattr(self, "initialized", False):
|
||||
self.backend = None
|
||||
self.deepspeed_plugin = None
|
||||
mixed_precision = mixed_precision.lower() if mixed_precision else None
|
||||
if not _from_accelerator:
|
||||
raise ValueError(
|
||||
"Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
|
||||
@ -154,35 +172,49 @@ class AcceleratorState:
|
||||
self.process_index = xm.get_ordinal()
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
self.device = xm.xla_device()
|
||||
self.use_fp16 = False
|
||||
self.mixed_precision = "no"
|
||||
elif os.environ.get("USE_DEEPSPEED", "false") == "true" and not cpu:
|
||||
assert (
|
||||
is_deepspeed_available()
|
||||
), "DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source"
|
||||
self.distributed_type = DistributedType.DEEPSPEED
|
||||
if not torch.distributed.is_initialized():
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
torch.distributed.init_process_group(backend="nccl", **kwargs)
|
||||
self.backend = "nccl"
|
||||
self.num_processes = torch.distributed.get_world_size()
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
self.use_fp16 = False # deepspeed handles fp16 using deepspeed_config
|
||||
fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
|
||||
deepspeed_plugin.deepspeed_config.update({"fp16": {"enabled": fp16}})
|
||||
self.mixed_precision = "no" # deepspeed handles mixed_precision using deepspeed_config
|
||||
mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
if mixed_precision == "fp16":
|
||||
deepspeed_plugin.deepspeed_config.update({"fp16": {"enabled": True}})
|
||||
elif mixed_precision == "bf16":
|
||||
deepspeed_plugin.deepspeed_config.update({"bfloat16": {"enabled": True}})
|
||||
self.deepspeed_plugin = deepspeed_plugin
|
||||
elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
|
||||
self.distributed_type = DistributedType.MULTI_GPU
|
||||
if not torch.distributed.is_initialized():
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
torch.distributed.init_process_group(backend="nccl", **kwargs)
|
||||
self.backend = "nccl"
|
||||
self.num_processes = torch.distributed.get_world_size()
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
|
||||
self.mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
if os.environ.get("USE_FSDP", "false") == "true":
|
||||
self.distributed_type = DistributedType.FSDP
|
||||
if self.mixed_precision != "no":
|
||||
raise ValueError(
|
||||
"Mixed precision is currently not supported for FSDP. Please set `mixed_precision` to `no`."
|
||||
)
|
||||
self.fsdp_plugin = fsdp_plugin
|
||||
elif get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1:
|
||||
self.distributed_type = DistributedType.MULTI_CPU
|
||||
if is_ccl_available() and get_int_from_env(["CCL_WORKER_COUNT"], 0) > 0:
|
||||
@ -213,31 +245,39 @@ class AcceleratorState:
|
||||
"please try exporting rank 0's hostname as MASTER_ADDR"
|
||||
)
|
||||
if not torch.distributed.is_initialized():
|
||||
torch.distributed.init_process_group(backend, rank=rank, world_size=size)
|
||||
torch.distributed.init_process_group(backend, rank=rank, world_size=size, **kwargs)
|
||||
self.backend = backend
|
||||
self.num_processes = torch.distributed.get_world_size()
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
self.local_process_index = local_rank
|
||||
self.device = torch.device("cpu")
|
||||
self.use_fp16 = False
|
||||
self.mixed_precision = "no"
|
||||
else:
|
||||
self.distributed_type = DistributedType.NO
|
||||
self.num_processes = 1
|
||||
self.process_index = self.local_process_index = 0
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() and not cpu else "cpu")
|
||||
self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
|
||||
self.mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
self.initialized = True
|
||||
|
||||
def __repr__(self):
|
||||
use_fp16 = self.deepspeed_plugin.fp16 if self.distributed_type == DistributedType.DEEPSPEED else self.use_fp16
|
||||
mixed_precision = self.mixed_precision
|
||||
|
||||
repr = (
|
||||
f"Distributed environment: {self.distributed_type}{(' Backend: ' + self.backend) if self.backend else ''}\n"
|
||||
f"Num processes: {self.num_processes}\n"
|
||||
f"Process index: {self.process_index}\n"
|
||||
f"Local process index: {self.local_process_index}\n"
|
||||
f"Device: {self.device}\n"
|
||||
f"Use FP16 precision: {use_fp16}\n"
|
||||
f"Mixed precision type: {mixed_precision}\n"
|
||||
)
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
repr += f"ds_config: {self.deepspeed_plugin.ds_config}\n"
|
||||
repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n"
|
||||
return repr
|
||||
|
||||
# For backward compatibility
|
||||
@property
|
||||
def use_fp16(self):
|
||||
return self.mixed_precision != "no"
|
||||
|
||||
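A hedged sketch of inspecting the shared state once an `Accelerator` exists, including the backward-compatible `use_fp16` property added above; the bf16 choice is arbitrary:

```python
from accelerate import Accelerator
from accelerate.state import AcceleratorState

accelerator = Accelerator(mixed_precision="bf16")

# AcceleratorState is a Borg-style singleton: every instance shares the state
# initialized by the Accelerator above.
state = AcceleratorState()
print(state.distributed_type, state.num_processes, state.process_index)
print(state.mixed_precision)  # "bf16"
print(state.use_fp16)         # True for any mixed precision setting other than "no"
```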
src/accelerate/test_utils/examples.py (new file, 139 lines)
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A collection of utilities for comparing `examples/complete_*_example.py` scripts with the capabilities inside of each
|
||||
`examples/by_feature` example. `compare_against_test` is the main function that should be used when testing, while the
|
||||
others are used either to extract the code that matters or to preprocess it (such as stripping comments).
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
|
||||
def get_function_contents_by_name(lines: List[str], name: str):
|
||||
"""
|
||||
Extracts a function from `lines` of segmented source code with the name `name`.
|
||||
|
||||
Args:
|
||||
lines (`List[str]`):
|
||||
Source code of a script separated by line.
|
||||
name (`str`):
|
||||
The name of the function to extract. Should be either `training_function` or `main`
|
||||
"""
|
||||
if name != "training_function" and name != "main":
|
||||
raise ValueError(f"Incorrect function name passed: {name}, choose either 'main' or 'training_function'")
|
||||
good_lines, found_start = [], False
|
||||
for line in lines:
|
||||
if not found_start and f"def {name}" in line:
|
||||
found_start = True
|
||||
good_lines.append(line)
|
||||
continue
|
||||
if found_start:
|
||||
if name == "training_function" and "def main" in line:
|
||||
return good_lines
|
||||
if name == "main" and "if __name__" in line:
|
||||
return good_lines
|
||||
good_lines.append(line)
|
||||
|
||||
|
||||
def clean_lines(lines: List[str]):
|
||||
"""
|
||||
Filters `lines` and removes any entries that start with a comment ('#') or are just a newline ('\n')
|
||||
|
||||
Args:
|
||||
lines (`List[str]`):
|
||||
Source code of a script separated by line.
|
||||
"""
|
||||
return [line for line in lines if not line.lstrip().startswith("#") and line != "\n"]
|
||||
|
||||
|
||||
def compare_against_test(base_filename: str, feature_filename: str, parser_only: bool, secondary_filename: str = None):
|
||||
"""
|
||||
Tests whether the additional code inside of `feature_filename` was implemented in `base_filename`. This should be
|
||||
used when testing to see if `complete_*_.py` examples have all of the implementations from each of the
|
||||
`examples/by_feature/*` scripts.
|
||||
|
||||
It utilizes `nlp_example.py` to extract out all of the repeated training code, so that only the new additional code
|
||||
is examined and checked. If something *other* than `nlp_example.py` should be used, such as `cv_example.py` for the
|
||||
`complete_cv_example.py` script, it should be passed in for the `secondary_filename` parameter.
|
||||
|
||||
Args:
|
||||
base_filename (`str` or `os.PathLike`):
|
||||
The filepath of a single "complete" example script to test, such as `examples/complete_cv_example.py`
|
||||
feature_filename (`str` or `os.PathLike`):
|
||||
The filepath of a single feature example script. The contents of this script are checked to see if they
|
||||
exist in `base_filename`
|
||||
parser_only (`bool`):
|
||||
Whether to compare only the `main()` sections in both files, or to compare the contents of
|
||||
`training_loop()`
|
||||
secondary_filename (`str`, *optional*):
|
||||
A potential secondary filepath that should be included in the check. This function extracts the base
|
||||
functionalities off of "examples/nlp_example.py", so if `base_filename` is a script other than
|
||||
`complete_nlp_example.py`, the template script should be included here. Such as `examples/cv_example.py`
|
||||
"""
|
||||
with open(base_filename, "r") as f:
|
||||
base_file_contents = f.readlines()
|
||||
with open(os.path.abspath(os.path.join("examples", "nlp_example.py")), "r") as f:
|
||||
full_file_contents = f.readlines()
|
||||
with open(feature_filename, "r") as f:
|
||||
feature_file_contents = f.readlines()
|
||||
if secondary_filename is not None:
|
||||
with open(secondary_filename, "r") as f:
|
||||
secondary_file_contents = f.readlines()
|
||||
|
||||
# This is our base, we remove all the code from here in our `full_filename` and `feature_filename` to find the new content
|
||||
if parser_only:
|
||||
base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "main"))
|
||||
full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "main"))
|
||||
feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "main"))
|
||||
if secondary_filename is not None:
|
||||
secondary_file_func = clean_lines(get_function_contents_by_name(secondary_file_contents, "main"))
|
||||
else:
|
||||
base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "training_function"))
|
||||
full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "training_function"))
|
||||
feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "training_function"))
|
||||
if secondary_filename is not None:
|
||||
secondary_file_func = clean_lines(
|
||||
get_function_contents_by_name(secondary_file_contents, "training_function")
|
||||
)
|
||||
|
||||
_dl_line = "train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)\n"
|
||||
|
||||
# Specific code in our script that differs from the full version, aka what is new
|
||||
new_feature_code = []
|
||||
passed_idxs = [] # We keep track of the idxs just in case it's a repeated statement
|
||||
for i, line in enumerate(feature_file_func):
|
||||
if i not in passed_idxs:
|
||||
if (line not in full_file_func) and (line.lstrip() != _dl_line):
|
||||
new_feature_code.append(line)
|
||||
passed_idxs.append(i)
|
||||
|
||||
# Extract out just the new parts from the full_file_training_func
|
||||
new_full_example_parts = []
|
||||
passed_idxs = [] # We keep track of the idxs just in case it's a repeated statement
|
||||
for i, line in enumerate(base_file_func):
|
||||
if i not in passed_idxs:
|
||||
if (line not in full_file_func) and (line.lstrip() != _dl_line):
|
||||
new_full_example_parts.append(line)
|
||||
passed_idxs.append(i)
|
||||
|
||||
# Finally, get the overall diff
|
||||
diff_from_example = [line for line in new_feature_code if line not in new_full_example_parts]
|
||||
if secondary_filename is not None:
|
||||
diff_from_two = [line for line in full_file_contents if line not in secondary_file_func]
|
||||
diff_from_example = [line for line in diff_from_example if line not in diff_from_two]
|
||||
|
||||
return diff_from_example
|
||||
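A hedged sketch of invoking the helper above from the repository root; the exact example filenames are illustrative and follow the layout the docstring describes:

```python
import os

from accelerate.test_utils.examples import compare_against_test

# Check that everything a `by_feature` script adds on top of `nlp_example.py`
# also shows up in the corresponding "complete" example. Must run from the repo
# root, since the helper reads examples/nlp_example.py relative to the cwd.
diff = compare_against_test(
    base_filename=os.path.join("examples", "complete_nlp_example.py"),
    feature_filename=os.path.join("examples", "by_feature", "tracking.py"),
    parser_only=False,
)
assert diff == [], f"Lines missing from the complete example: {diff}"
```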
@ -36,14 +36,14 @@ def init_state_check():
|
||||
def rng_sync_check():
|
||||
state = AcceleratorState()
|
||||
synchronize_rng_states(["torch"])
|
||||
assert are_the_same_tensors(torch.get_rng_state())
|
||||
assert are_the_same_tensors(torch.get_rng_state()), "RNG states improperly synchronized on CPU."
|
||||
if state.distributed_type == DistributedType.MULTI_GPU:
|
||||
synchronize_rng_states(["cuda"])
|
||||
assert are_the_same_tensors(torch.cuda.get_rng_state())
|
||||
assert are_the_same_tensors(torch.cuda.get_rng_state()), "RNG states improperly synchronized on GPU."
|
||||
if version.parse(torch.__version__) >= version.parse("1.6.0"):
|
||||
generator = torch.Generator()
|
||||
synchronize_rng_states(["generator"], generator=generator)
|
||||
assert are_the_same_tensors(generator.get_state())
|
||||
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("All rng are properly synched.")
|
||||
@ -59,7 +59,9 @@ def dl_preparation_check():
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result)
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long())
|
||||
|
||||
print(state.process_index, result, type(dl))
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8)
|
||||
dl = prepare_data_loader(
|
||||
@ -74,7 +76,7 @@ def dl_preparation_check():
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result)
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long())
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
|
||||
|
||||
if state.process_index == 0:
|
||||
print("Non-shuffled dataloader passing.")
|
||||
@ -86,7 +88,7 @@ def dl_preparation_check():
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result).tolist()
|
||||
result.sort()
|
||||
assert result == list(range(length))
|
||||
assert result == list(range(length)), "Wrong shuffled dataloader result."
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8, shuffle=True)
|
||||
dl = prepare_data_loader(
|
||||
@ -102,12 +104,77 @@ def dl_preparation_check():
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result).tolist()
|
||||
result.sort()
|
||||
assert result == list(range(length))
|
||||
assert result == list(range(length)), "Wrong shuffled dataloader result."
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("Shuffled dataloader passing.")
|
||||
|
||||
|
||||
def central_dl_preparation_check():
|
||||
state = AcceleratorState()
|
||||
length = 32 * state.num_processes
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8)
|
||||
dl = prepare_data_loader(
|
||||
dl, state.device, state.num_processes, state.process_index, put_on_device=True, dispatch_batches=True
|
||||
)
|
||||
result = []
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result)
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8)
|
||||
dl = prepare_data_loader(
|
||||
dl,
|
||||
state.device,
|
||||
state.num_processes,
|
||||
state.process_index,
|
||||
put_on_device=True,
|
||||
split_batches=True,
|
||||
dispatch_batches=True,
|
||||
)
|
||||
result = []
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result)
|
||||
assert torch.equal(result.cpu(), torch.arange(0, length).long()), "Wrong non-shuffled dataloader result."
|
||||
|
||||
if state.process_index == 0:
|
||||
print("Non-shuffled central dataloader passing.")
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8, shuffle=True)
|
||||
dl = prepare_data_loader(
|
||||
dl, state.device, state.num_processes, state.process_index, put_on_device=True, dispatch_batches=True
|
||||
)
|
||||
result = []
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result).tolist()
|
||||
result.sort()
|
||||
assert result == list(range(length)), "Wrong shuffled dataloader result."
|
||||
|
||||
dl = DataLoader(range(length), batch_size=8, shuffle=True)
|
||||
dl = prepare_data_loader(
|
||||
dl,
|
||||
state.device,
|
||||
state.num_processes,
|
||||
state.process_index,
|
||||
put_on_device=True,
|
||||
split_batches=True,
|
||||
dispatch_batches=True,
|
||||
)
|
||||
result = []
|
||||
for batch in dl:
|
||||
result.append(gather(batch))
|
||||
result = torch.cat(result).tolist()
|
||||
result.sort()
|
||||
assert result == list(range(length)), "Wrong shuffled dataloader result."
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("Shuffled central dataloader passing.")
|
||||
|
||||
|
||||
def mock_training(length, batch_size, generator):
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
@ -132,8 +199,8 @@ def training_check():
|
||||
length = batch_size * 4 * state.num_processes
|
||||
|
||||
train_set, old_model = mock_training(length, batch_size * state.num_processes, generator)
|
||||
assert are_the_same_tensors(old_model.a)
|
||||
assert are_the_same_tensors(old_model.b)
|
||||
assert are_the_same_tensors(old_model.a), "Did not obtain the same model on both processes."
|
||||
assert are_the_same_tensors(old_model.b), "Did not obtain the same model on both processes."
|
||||
|
||||
accelerator = Accelerator()
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
@ -152,8 +219,8 @@ def training_check():
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a)
|
||||
assert torch.allclose(old_model.b, model.b)
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
accelerator.print("Training yielded the same results on one CPU or distributed setup with no batch split.")
|
||||
|
||||
@ -174,12 +241,35 @@ def training_check():
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a)
|
||||
assert torch.allclose(old_model.b, model.b)
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
accelerator.print("Training yielded the same results on one CPU or distributes setup with batch split.")
|
||||
|
||||
# Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
|
||||
print("FP16 training check.")
|
||||
accelerator = Accelerator(mixed_precision="fp16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
# TEST that previous fp16 flag still works
|
||||
print("Legacy FP16 training check.")
|
||||
accelerator = Accelerator(fp16=True)
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
@ -197,8 +287,30 @@ def training_check():
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a)
|
||||
assert torch.allclose(old_model.b, model.b)
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
# Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
|
||||
print("BF16 training check.")
|
||||
accelerator = Accelerator(mixed_precision="bf16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
|
||||
def main():
|
||||
@ -215,6 +327,11 @@ def main():
|
||||
if state.local_process_index == 0:
|
||||
print("\n**DataLoader integration test**")
|
||||
dl_preparation_check()
|
||||
central_dl_preparation_check()
|
||||
|
||||
# Trainings are not exactly the same in DeepSpeed and CPU mode
|
||||
if state.distributed_type == DistributedType.DEEPSPEED:
|
||||
return
|
||||
|
||||
if state.local_process_index == 0:
|
||||
print("\n**Training integration test**")
|
||||
|
||||
@ -13,13 +13,116 @@
|
||||
# limitations under the License.
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from distutils.util import strtobool
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from unittest import mock
|
||||
|
||||
import torch
|
||||
|
||||
from ..state import AcceleratorState, is_tpu_available
|
||||
from ..utils import gather
|
||||
from ..utils import gather, is_tensorflow_available
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
try:
|
||||
value = os.environ[key]
|
||||
except KeyError:
|
||||
# KEY isn't set, default to `default`.
|
||||
_value = default
|
||||
else:
|
||||
# KEY is set, convert it to True or False.
|
||||
try:
|
||||
_value = strtobool(value)
|
||||
except ValueError:
|
||||
# More values are supported, but let's keep the message simple.
|
||||
raise ValueError(f"If set, {key} must be yes or no.")
|
||||
return _value
|
||||
|
||||
|
||||
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
|
||||
|
||||
|
||||
def slow(test_case):
|
||||
"""
|
||||
Decorator marking a test as slow. Slow tests are skipped by default. Set the RUN_SLOW environment variable to a
|
||||
truthy value to run them.
|
||||
"""
|
||||
if not _run_slow_tests:
|
||||
return unittest.skip("test is slow")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
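A hedged example of using the `slow` decorator above; the import path is an assumption based on where these helpers live in the package:

```python
import unittest

from accelerate.test_utils.testing import slow  # module path assumed


class ExampleTests(unittest.TestCase):
    def test_fast_path(self):
        self.assertEqual(1 + 1, 2)

    @slow
    def test_full_training_run(self):
        # Only executed when the suite is launched with e.g. `RUN_SLOW=1 pytest tests/`.
        self.assertTrue(True)
```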
class TempDirTestCase(unittest.TestCase):
|
||||
"""
|
||||
A TestCase class that keeps a single `tempfile.TemporaryDirectory` open for the duration of the class, wipes its
|
||||
data at the start of a test, and then destroys it at the end of the TestCase.
|
||||
|
||||
Useful when a class or API requires a single constant folder throughout its use, such as Weights and Biases.
|
||||
|
||||
The temporary directory location will be stored in `self.tmpdir`
|
||||
"""
|
||||
|
||||
clear_on_setup = True
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"Creates a `tempfile.TemporaryDirectory` and stores it in `cls.tmpdir`"
|
||||
cls.tmpdir = tempfile.mkdtemp()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
"Remove `cls.tmpdir` after test suite has finished"
|
||||
if os.path.exists(cls.tmpdir):
|
||||
shutil.rmtree(cls.tmpdir)
|
||||
|
||||
def setUp(self):
|
||||
"Destroy all contents in `self.tmpdir`, but not `self.tmpdir`"
|
||||
if self.clear_on_setup:
|
||||
for path in Path(self.tmpdir).glob("**/*"):
|
||||
if path.is_file():
|
||||
path.unlink()
|
||||
elif path.is_dir():
|
||||
shutil.rmtree(path)
|
||||
|
||||
|
||||
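A hedged sketch of building on `TempDirTestCase`: the class-level temporary directory persists for the whole test class while its contents are wiped before each test (the import path is assumed):

```python
import os

from accelerate.test_utils.testing import TempDirTestCase  # module path assumed


class TrackerLogTests(TempDirTestCase):
    def test_starts_from_clean_directory(self):
        # setUp removed anything written by previously run tests in this class.
        self.assertEqual(os.listdir(self.tmpdir), [])

    def test_writes_log_file(self):
        log_file = os.path.join(self.tmpdir, "run.log")
        with open(log_file, "w") as f:
            f.write("step=1 loss=0.5\n")
        self.assertTrue(os.path.exists(log_file))
```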
class MockingTestCase(unittest.TestCase):
|
||||
"""
|
||||
A TestCase class designed to dynamically add various mockers that should be used in every test, mimicking the
|
||||
behavior of a class-wide mock when defining one normally will not suffice.
|
||||
|
||||
Useful when a mock requires specific information that is only available after `TestCase.setUpClass` has run, such as
|
||||
setting an environment variable with that information.
|
||||
|
||||
The `add_mocks` function should be run at the end of a `TestCase`'s `setUp` function, after a call to
|
||||
`super().setUp()` such as:
|
||||
```python
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
mocks = mock.patch.dict(os.environ, {"SOME_ENV_VAR", "SOME_VALUE"})
|
||||
self.add_mocks(mocks)
|
||||
```
|
||||
"""
|
||||
|
||||
def add_mocks(self, mocks: Union[mock.Mock, List[mock.Mock]]):
|
||||
"""
|
||||
Add custom mocks for tests that should be repeated on each test. Should be called during
|
||||
`MockingTestCase.setUp`, after `super().setUp()`.
|
||||
|
||||
Args:
|
||||
mocks (`mock.Mock` or list of `mock.Mock`):
|
||||
Mocks that should be added to the `TestCase` after `TestCase.setUpClass` has been run
|
||||
"""
|
||||
self.mocks = mocks if isinstance(mocks, (tuple, list)) else [mocks]
|
||||
for m in self.mocks:
|
||||
m.start()
|
||||
self.addCleanup(m.stop)
|
||||
|
||||
|
||||
def are_the_same_tensors(tensor):
|
||||
@ -64,6 +167,17 @@ def require_multi_gpu(test_case):
|
||||
return test_case
|
||||
|
||||
|
||||
def require_tensorflow(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TensorFlow installed. These tests are skipped when TensorFlow isn't
|
||||
installed
|
||||
"""
|
||||
if not is_tensorflow_available():
|
||||
return unittest.skip("test requires TensorFlow")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||

class _RunOutput:
    def __init__(self, returncode, stdout, stderr):
        self.returncode = returncode

@@ -36,6 +36,10 @@ class RegressionModel(torch.nn.Module):
        super().__init__()
        self.a = torch.nn.Parameter(torch.tensor(a).float())
        self.b = torch.nn.Parameter(torch.tensor(b).float())
        self.first_batch = True

    def forward(self, x=None):
        if self.first_batch:
            print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
            self.first_batch = False
        return x * self.a + self.b

src/accelerate/tracking.py (new file, 321 lines)
@@ -0,0 +1,321 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Expectation:
|
||||
# Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}
|
||||
|
||||
import logging
|
||||
import os
|
||||
from abc import ABCMeta, abstractmethod, abstractproperty
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .utils import LoggerType, is_comet_ml_available, is_tensorboard_available, is_wandb_available
|
||||
|
||||
|
||||
_available_trackers = []
|
||||
|
||||
if is_tensorboard_available():
|
||||
from torch.utils import tensorboard
|
||||
|
||||
_available_trackers.append(LoggerType.TENSORBOARD)
|
||||
|
||||
if is_wandb_available():
|
||||
import wandb
|
||||
|
||||
_available_trackers.append(LoggerType.WANDB)
|
||||
|
||||
if is_comet_ml_available():
|
||||
from comet_ml import Experiment
|
||||
|
||||
_available_trackers.append(LoggerType.COMETML)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_available_trackers():
|
||||
"Returns a list of all supported available trackers in the system"
|
||||
return _available_trackers
|
||||
|
||||
|
||||
class GeneralTracker(object, metaclass=ABCMeta):
    """
    A base Tracker class to be used for all logging integration implementations.
    """

    @abstractproperty
    def requires_logging_directory(self):
        """
        Whether the logger requires a directory to store its logs. Should either return `True` or `False`.
        """
        pass

    @abstractmethod
    def store_init_configuration(self, values: dict):
        """
        Logs `values` as hyperparameters for the run. Implementations should use the experiment configuration
        functionality of a tracking API.

        Args:
            values (Dictionary `str` to `bool`, `str`, `float` or `int`):
                Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
                `str`, `float`, `int`, or `None`.
        """
        pass

    @abstractmethod
    def log(self, values: dict, step: Optional[int]):
        """
        Logs `values` to the current run. Base `log` implementations of a tracking API should go in here, along with
        special behavior for the `step` parameter.

        Args:
            values (Dictionary `str` to `str`, `float`, or `int`):
                Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
            step (`int`, *optional*):
                The run step. If included, the log will be affiliated with this step.
        """
        pass

    def finish(self):
        """
        Should run any finalizing functions within the tracking API. If the API doesn't have one, don't override this
        method.
        """
        pass

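Since `filter_trackers` below accepts custom implementations of `GeneralTracker`, here is a minimal sketch of one; the `PrintTracker` name is made up, and the class simply prints whatever it is asked to record:

```python
from typing import Optional

from accelerate.tracking import GeneralTracker


class PrintTracker(GeneralTracker):
    # No files are written, so no logging directory is needed
    requires_logging_directory = False

    def store_init_configuration(self, values: dict):
        print(f"config: {values}")

    def log(self, values: dict, step: Optional[int] = None):
        print(f"step {step}: {values}")
```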
class TensorBoardTracker(GeneralTracker):
|
||||
"""
|
||||
A `Tracker` class that supports `tensorboard`. Should be initialized at the start of your script.
|
||||
|
||||
Args:
|
||||
run_name (`str`):
|
||||
The name of the experiment run
|
||||
logging_dir (`str`, `os.PathLike`):
|
||||
Location for TensorBoard logs to be stored.
|
||||
"""
|
||||
|
||||
requires_logging_directory = True
|
||||
|
||||
def __init__(self, run_name: str, logging_dir: Optional[Union[str, os.PathLike]]):
|
||||
self.run_name = run_name
|
||||
self.logging_dir = os.path.join(logging_dir, run_name)
|
||||
self.writer = tensorboard.SummaryWriter(self.logging_dir)
|
||||
logger.info(f"Initialized TensorBoard project {self.run_name} logging to {self.logging_dir}")
|
||||
logger.info(
|
||||
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
|
||||
)
|
||||
|
||||
def store_init_configuration(self, values: dict):
|
||||
"""
|
||||
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `bool`, `str`, `float` or `int`):
|
||||
Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
|
||||
`str`, `float`, `int`, or `None`.
|
||||
"""
|
||||
self.writer.add_hparams(values, metric_dict={})
|
||||
self.writer.flush()
|
||||
logger.info("Stored initial configuration hyperparameters to TensorBoard")
|
||||
|
||||
def log(self, values: dict, step: Optional[int] = None):
|
||||
"""
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
for k, v in values.items():
|
||||
if isinstance(v, (int, float)):
|
||||
self.writer.add_scalar(k, v, global_step=step)
|
||||
elif isinstance(v, str):
|
||||
self.writer.add_text(k, v, global_step=step)
|
||||
self.writer.flush()
|
||||
logger.info("Successfully logged to TensorBoard")
|
||||
|
||||
def finish(self):
|
||||
"""
|
||||
Closes `TensorBoard` writer
|
||||
"""
|
||||
self.writer.close()
|
||||
logger.info("TensorBoard writer closed")
|
||||
|
||||
|
||||
class WandBTracker(GeneralTracker):
|
||||
"""
|
||||
A `Tracker` class that supports `wandb`. Should be initialized at the start of your script.
|
||||
|
||||
Args:
|
||||
run_name (`str`):
|
||||
The name of the experiment run.
|
||||
"""
|
||||
|
||||
requires_logging_directory = False
|
||||
|
||||
def __init__(self, run_name: str):
|
||||
self.run_name = run_name
|
||||
self.run = wandb.init(self.run_name)
|
||||
logger.info(f"Initialized WandB project {self.run_name}")
|
||||
logger.info(
|
||||
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
|
||||
)
|
||||
|
||||
def store_init_configuration(self, values: dict):
|
||||
"""
|
||||
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `bool`, `str`, `float` or `int`):
|
||||
Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
|
||||
`str`, `float`, `int`, or `None`.
|
||||
"""
|
||||
wandb.config.update(values)
|
||||
logger.info("Stored initial configuration hyperparameters to WandB")
|
||||
|
||||
def log(self, values: dict, step: Optional[int] = None):
|
||||
"""
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
self.run.log(values, step=step)
|
||||
logger.info("Successfully logged to WandB")
|
||||
|
||||
def finish(self):
|
||||
"""
|
||||
Closes `wandb` writer
|
||||
"""
|
||||
self.run.finish()
|
||||
logger.info("WandB run closed")
|
||||
|
||||
|
||||
class CometMLTracker(GeneralTracker):
|
||||
"""
|
||||
A `Tracker` class that supports `comet_ml`. Should be initialized at the start of your script.
|
||||
|
||||
API keys must be stored in a Comet config file.
|
||||
|
||||
Args:
|
||||
run_name (`str`):
|
||||
The name of the experiment run.
|
||||
"""
|
||||
|
||||
requires_logging_directory = False
|
||||
|
||||
def __init__(self, run_name: str):
|
||||
self.run_name = run_name
|
||||
self.writer = Experiment(project_name=run_name)
|
||||
logger.info(f"Initialized CometML project {self.run_name}")
|
||||
logger.info(
|
||||
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
|
||||
)
|
||||
|
||||
def store_init_configuration(self, values: dict):
|
||||
"""
|
||||
Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `bool`, `str`, `float` or `int`):
|
||||
Values to be stored as initial hyperparameters as key-value pairs. The values need to have type `bool`,
|
||||
`str`, `float`, `int`, or `None`.
|
||||
"""
|
||||
self.writer.log_parameters(values)
|
||||
logger.info("Stored initial configuration hyperparameters to CometML")
|
||||
|
||||
def log(self, values: dict, step: Optional[int] = None):
|
||||
"""
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
if step is not None:
|
||||
self.writer.set_step(step)
|
||||
self.writer.log_others(values)
|
||||
logger.info("Successfully logged to CometML")
|
||||
|
||||
def finish(self):
|
||||
"""
|
||||
Closes `comet-ml` writer
|
||||
"""
|
||||
self.writer.end()
|
||||
logger.info("CometML run closed")
|
||||
|
||||
|
||||
LOGGER_TYPE_TO_CLASS = {"tensorboard": TensorBoardTracker, "wandb": WandBTracker, "comet_ml": CometMLTracker}


def filter_trackers(
    log_with: List[Union[str, LoggerType, GeneralTracker]], logging_dir: Union[str, os.PathLike] = None
):
    """
    Takes in a list of potential tracker types and checks that:
        - The tracker wanted is available in that environment
        - Filters out repeats of tracker types
        - If `all` is in `log_with`, will return all trackers in the environment
        - If a tracker requires a `logging_dir`, ensures that `logging_dir` is not `None`

    Args:
        log_with (list of `str`, [`~utils.LoggerType`] or [`~tracking.GeneralTracker`], *optional*):
            A list of loggers to be set up for experiment tracking. Should be one or several of:

            - `"all"`
            - `"tensorboard"`
            - `"wandb"`
            - `"comet_ml"`
            If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
            also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
        logging_dir (`str`, `os.PathLike`, *optional*):
            A path to a directory for storing logs of locally-compatible loggers.
    """
    loggers = []
    if log_with is not None:
        if not isinstance(log_with, (list, tuple)):
            log_with = [log_with]
        logger.debug(f"{log_with}")
        if "all" in log_with or LoggerType.ALL in log_with:
            loggers = [o for o in log_with if issubclass(type(o), GeneralTracker)] + get_available_trackers()
        else:
            for log_type in log_with:
                if log_type not in LoggerType and not issubclass(type(log_type), GeneralTracker):
                    raise ValueError(f"Unsupported logging capability: {log_type}. Choose between {LoggerType.list()}")
                if issubclass(type(log_type), GeneralTracker):
                    loggers.append(log_type)
                else:
                    log_type = LoggerType(log_type)
                    if log_type not in loggers:
                        if log_type in get_available_trackers():
                            tracker_init = LOGGER_TYPE_TO_CLASS[str(log_type)]
                            if getattr(tracker_init, "requires_logging_directory"):
                                if logging_dir is None:
                                    raise ValueError(
                                        f"Logging with `{str(log_type)}` requires a `logging_dir` to be passed in."
                                    )
                            loggers.append(log_type)
                        else:
                            logger.info(f"Tried adding logger {log_type}, but package is unavailable in the system.")

    return loggers
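A small usage sketch of the function above (assuming it is importable as `accelerate.tracking.filter_trackers`; the directory name is arbitrary):

```python
from accelerate.tracking import filter_trackers

# Picks up every tracker whose package is importable in the current environment
trackers = filter_trackers(["all"], logging_dir="runs")
print(trackers)

# Raises ValueError if tensorboard is installed, since it requires a logging directory
filter_trackers(["tensorboard"])
```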
|
||||
@ -12,15 +12,23 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import importlib
|
||||
import os
|
||||
import random
|
||||
import typing
|
||||
from collections.abc import Mapping
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
from enum import Enum, EnumMeta
|
||||
from functools import update_wrapper
|
||||
from typing import Any, Callable, Iterable, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.distributed import ReduceOp
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState, DistributedType, is_deepspeed_available, is_tpu_available
|
||||
|
||||
@ -29,6 +37,22 @@ if is_tpu_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
def is_tensorflow_available():
|
||||
return importlib.util.find_spec("tensorflow") is not None
|
||||
|
||||
|
||||
def is_tensorboard_available():
|
||||
return importlib.util.find_spec("tensorboard") is not None or importlib.util.find_spec("tensorboardX") is not None
|
||||
|
||||
|
||||
def is_wandb_available():
|
||||
return importlib.util.find_spec("wandb") is not None
|
||||
|
||||
|
||||
def is_comet_ml_available():
|
||||
return importlib.util.find_spec("comet_ml") is not None
|
||||
|
||||
|
||||
def is_boto3_available():
|
||||
return importlib.util.find_spec("boto3") is not None
|
||||
|
||||
@ -40,21 +64,73 @@ def is_sagemaker_available():
|
||||
if is_deepspeed_available():
|
||||
from deepspeed import DeepSpeedEngine
|
||||
|
||||
SCALER_NAME = "scaler.pt"
|
||||
MODEL_NAME = "pytorch_model"
|
||||
RNG_STATE_NAME = "random_states"
|
||||
OPTIMIZER_NAME = "optimizer"
|
||||
SCHEDULER_NAME = "scheduler"
|
||||
|
||||
class RNGType(Enum):
|
||||
|
||||
class EnumWithContains(EnumMeta):
|
||||
"A metaclass that adds the ability to check if `self` contains an item with the `in` operator"
|
||||
|
||||
def __contains__(cls, item):
|
||||
try:
|
||||
cls(item)
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class BaseEnum(Enum, metaclass=EnumWithContains):
|
||||
"An enum class that can get the value of an item with `str(Enum.key)`"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
@classmethod
|
||||
def list(cls):
|
||||
"Method to list all the possible items in `cls`"
|
||||
return list(map(lambda item: str(item), cls))
|
||||
|
||||
|
||||
class LoggerType(BaseEnum):
|
||||
ALL = "all"
|
||||
TENSORBOARD = "tensorboard"
|
||||
WANDB = "wandb"
|
||||
COMETML = "comet_ml"
|
||||
|
||||
|
||||
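A quick illustration of what the `EnumWithContains` metaclass and `BaseEnum` base class above buy us, using the `LoggerType` enum just defined (assuming it is exposed as `accelerate.utils.LoggerType`):

```python
from accelerate.utils import LoggerType

print("wandb" in LoggerType)         # True, via EnumWithContains.__contains__
print("mlflow" in LoggerType)        # False
print(str(LoggerType.TENSORBOARD))   # "tensorboard"
print(LoggerType.list())             # ["all", "tensorboard", "wandb", "comet_ml"]
```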
class PrecisionType(BaseEnum):
|
||||
NO = "no"
|
||||
FP16 = "fp16"
|
||||
BF16 = "bf16"
|
||||
|
||||
|
||||
class RNGType(BaseEnum):
|
||||
TORCH = "torch"
|
||||
CUDA = "cuda"
|
||||
XLA = "xla"
|
||||
GENERATOR = "generator"
|
||||
|
||||
|
||||
def set_seed(seed: int):
|
||||
@dataclass
|
||||
class TensorInformation:
|
||||
shape: torch.Size
|
||||
dtype: torch.dtype
|
||||
|
||||
|
||||
def set_seed(seed: int, device_specific: bool = False):
|
||||
"""
|
||||
Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch``.
|
||||
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
|
||||
|
||||
Args:
|
||||
seed (:obj:`int`): The seed to set.
|
||||
seed (`int`): The seed to set.
|
||||
device_specific (`bool`, *optional*, defaults to `False`):
Whether to offset the seed slightly on each device, using the process index from `AcceleratorState()`.
|
||||
"""
|
||||
if device_specific:
|
||||
seed += AcceleratorState().process_index
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
@ -81,7 +157,7 @@ def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optiona
|
||||
state = AcceleratorState()
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
rng_state = xm.mesh_reduce("random_seed", rng_state, lambda x: x[0])
|
||||
elif state.distributed_type == DistributedType.MULTI_GPU:
|
||||
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
rng_state = rng_state.to(state.device)
|
||||
torch.distributed.broadcast(rng_state, 0)
|
||||
rng_state = rng_state.cpu()
|
||||
@ -108,11 +184,70 @@ def honor_type(obj, generator):
|
||||
"""
|
||||
Cast a generator to the same type as obj (list, tuple or namedtuple)
|
||||
"""
|
||||
# There is no direct check whether an object is of type namedtuple sadly, this is a workaround.
|
||||
if isinstance(obj, tuple) and hasattr(obj, "_fields"):
|
||||
# Can instantiate a namedtuple from a generator directly, contrary to a tuple/list.
|
||||
try:
|
||||
return type(obj)(generator)
|
||||
except TypeError:
|
||||
# Some objects may not be able to instantiate from a generator directly
|
||||
return type(obj)(*list(generator))
|
||||
return type(obj)(generator)
|
||||
|
||||
|
||||
def is_torch_tensor(tensor):
|
||||
return isinstance(tensor, torch.Tensor)
|
||||
|
||||
|
||||
def is_tensor_information(tensor_info):
|
||||
return isinstance(tensor_info, TensorInformation)
|
||||
|
||||
|
||||
def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs):
|
||||
"""
|
||||
Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.
|
||||
|
||||
Args:
|
||||
func (`callable`):
|
||||
The function to recursively apply.
|
||||
data (nested list/tuple/dictionary of `main_type`):
|
||||
The data on which to apply `func`
|
||||
*args:
|
||||
Positional arguments that will be passed to `func` when applied on the unpacked data.
|
||||
test_type (`callable`, *optional*, defaults to `is_torch_tensor`):
A function taking an object as input and returning whether `func` should be applied to that object.
error_on_other_type (`bool`, *optional*, defaults to `False`):
Whether to raise an error if, after unpacking `data`, we encounter an object that does not pass
`test_type`. If `False`, the function will leave objects that fail `test_type` unchanged.
**kwargs:
Keyword arguments that will be passed to `func` when applied on the unpacked data.

Returns:
The same data structure as `data` with `func` applied to every object that passes `test_type`.
|
||||
"""
|
||||
if isinstance(data, (tuple, list)):
|
||||
return honor_type(
|
||||
data,
|
||||
(
|
||||
recursively_apply(
|
||||
func, o, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
|
||||
)
|
||||
for o in data
|
||||
),
|
||||
)
|
||||
elif isinstance(data, Mapping):
|
||||
return type(data)(
|
||||
{
|
||||
k: recursively_apply(
|
||||
func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
|
||||
)
|
||||
for k, v in data.items()
|
||||
}
|
||||
)
|
||||
elif test_type(data):
|
||||
return func(data, *args, **kwargs)
|
||||
elif error_on_other_type:
|
||||
raise TypeError(
|
||||
f"Can't apply {func.__name__} on object of type {type(data)}, only of nested list/tuple/dicts of objects "
|
||||
f"that satisfy {test_type.__name__}."
|
||||
)
|
||||
return data
|
||||
|
||||
|
||||
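A short sketch of `recursively_apply` on a nested structure, which is the pattern the send/gather/pad helpers below are built on (the `_double` function is made up for illustration):

```python
import torch

from accelerate.utils import recursively_apply


def _double(t):
    return t * 2


batch = {"x": torch.ones(2), "labels": [torch.zeros(1), torch.ones(1)]}
doubled = recursively_apply(_double, batch)
# {"x": tensor([2., 2.]), "labels": [tensor([0.]), tensor([2.])]}
print(doubled)
```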
def send_to_device(tensor, device):
|
||||
@ -120,61 +255,104 @@ def send_to_device(tensor, device):
|
||||
Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of :obj:`torch.Tensor`):
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to send to a given device.
|
||||
device (:obj:`torch.device`):
|
||||
The device to send the data to
|
||||
device (`torch.device`):
|
||||
The device to send the data to.
|
||||
|
||||
Returns:
|
||||
The same data structure as :obj:`tensor` with all tensors sent to the proper device.
|
||||
The same data structure as `tensor` with all tensors sent to the proper device.
|
||||
"""
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (send_to_device(t, device) for t in tensor))
|
||||
elif isinstance(tensor, dict):
|
||||
return type(tensor)({k: send_to_device(v, device) for k, v in tensor.items()})
|
||||
elif not hasattr(tensor, "to"):
|
||||
return tensor
|
||||
return tensor.to(device)
|
||||
|
||||
def _send_to_device(t, device):
|
||||
return t.to(device)
|
||||
|
||||
def _has_to_method(t):
|
||||
return hasattr(t, "to")
|
||||
|
||||
return recursively_apply(_send_to_device, tensor, device, test_type=_has_to_method)
|
||||
|
||||
|
||||
def get_data_structure(data):
|
||||
"""
|
||||
Recursively gathers the information needed to rebuild a nested list/tuple/dictionary of tensors.
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to send to analyze.
|
||||
|
||||
Returns:
|
||||
The same data structure as `data` with [`~utils.TensorInformation`] instead of tensors.
|
||||
"""
|
||||
|
||||
def _get_data_structure(tensor):
|
||||
return TensorInformation(shape=tensor.shape, dtype=tensor.dtype)
|
||||
|
||||
return recursively_apply(_get_data_structure, data)
|
||||
|
||||
|
||||
def initialize_tensors(data_structure):
|
||||
"""
|
||||
Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`].
|
||||
|
||||
Returns:
|
||||
The same data structure as `data` with tensors instead of [`~utils.TensorInformation`].
|
||||
"""
|
||||
|
||||
def _initialize_tensor(tensor_info):
|
||||
return torch.empty(*tensor_info.shape, dtype=tensor_info.dtype)
|
||||
|
||||
return recursively_apply(_initialize_tensor, data_structure, test_type=is_tensor_information)
|
||||
|
||||
|
||||
def convert_to_fp32(tensor):
|
||||
"""
|
||||
Recursively converts the lements nested list/tuple/dictionary of tensors in FP16 precision to FP32.
|
||||
Recursively converts the elements nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of :obj:`torch.Tensor`):
|
||||
The data to convert from FP16 to FP32.
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to convert from FP16/BF16 to FP32.
|
||||
|
||||
Returns:
|
||||
The same data structure as :obj:`tensor` with all tensors that were in FP16 precision converted to FP32.
|
||||
The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
|
||||
"""
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (convert_to_fp32(t) for t in tensor))
|
||||
elif isinstance(tensor, dict):
|
||||
return type(tensor)({k: convert_to_fp32(v) for k, v in tensor.items()})
|
||||
elif not hasattr(tensor, "dtype") or tensor.dtype != torch.float16:
|
||||
return tensor
|
||||
return tensor.float()
|
||||
|
||||
def _convert_to_fp32(tensor):
|
||||
return tensor.float()
|
||||
|
||||
def _is_fp16_bf16_tensor(tensor):
|
||||
return hasattr(tensor, "dtype") and (
|
||||
tensor.dtype == torch.float16
|
||||
or (version.parse(torch.__version__) >= version.parse("1.10") and tensor.dtype == torch.bfloat16)
|
||||
)
|
||||
|
||||
return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)
|
||||
|
||||
|
||||
def convert_outputs_to_fp32(model_forward):
|
||||
class ConvertOutputsToFp32:
|
||||
"""
|
||||
Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
precision will be converted back to FP32.
|
||||
|
||||
Use a class instead of a decorator because otherwise, the prepared model can no longer be pickled (issue #273).
|
||||
|
||||
Args:
|
||||
model_forward (:obj:`Callable`):
|
||||
model_forward (`Callable`):
|
||||
The function which outputs we want to treat.
|
||||
|
||||
Returns:
|
||||
The same function as :obj:`model_forward` but with converted outputs.
|
||||
The same function as `model_forward` but with converted outputs.
|
||||
"""
|
||||
|
||||
def convert_outputs(*args, **kwargs):
|
||||
outputs = model_forward(*args, **kwargs)
|
||||
return convert_to_fp32(outputs)
|
||||
def __init__(self, model_forward):
|
||||
self.model_forward = model_forward
|
||||
update_wrapper(self, model_forward)
|
||||
|
||||
return convert_outputs
|
||||
def __call__(self, *args, **kwargs):
|
||||
return convert_to_fp32(self.model_forward(*args, **kwargs))
|
||||
|
||||
|
||||
convert_outputs_to_fp32 = ConvertOutputsToFp32
|
||||
|
||||
|
||||
def extract_model_from_parallel(model):
|
||||
@ -182,10 +360,10 @@ def extract_model_from_parallel(model):
|
||||
Extract a model from its distributed containers.
|
||||
|
||||
Args:
|
||||
model (:obj:`torch.nn.Module`): The model to extract.
|
||||
model (`torch.nn.Module`): The model to extract.
|
||||
|
||||
Returns:
|
||||
:obj:`torch.nn.Module`: The extracted model.
|
||||
`torch.nn.Module`: The extracted model.
|
||||
"""
|
||||
options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel)
|
||||
if is_deepspeed_available():
|
||||
@ -196,26 +374,27 @@ def extract_model_from_parallel(model):
|
||||
return model
|
||||
|
||||
|
||||
def _tpu_gather(tensor, name="tensor"):
|
||||
def _tpu_gather(tensor, name="gather tensor"):
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (_tpu_gather(t, name=f"{name}_{i}") for i, t in enumerate(tensor)))
|
||||
elif isinstance(tensor, dict):
|
||||
elif isinstance(tensor, Mapping):
|
||||
return type(tensor)({k: _tpu_gather(v, name=f"{name}_{k}") for k, v in tensor.items()})
|
||||
elif not isinstance(tensor, torch.Tensor):
|
||||
raise TypeError(f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors.")
|
||||
if tensor.ndim == 0:
|
||||
tensor = tensor.clone()[None]
|
||||
return xm.mesh_reduce(name, tensor, torch.cat)
|
||||
|
||||
|
||||
def _gpu_gather(tensor):
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (_gpu_gather(t) for t in tensor))
|
||||
elif isinstance(tensor, dict):
|
||||
return type(tensor)({k: _gpu_gather(v) for k, v in tensor.items()})
|
||||
elif not isinstance(tensor, torch.Tensor):
|
||||
raise TypeError(f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors.")
|
||||
output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())]
|
||||
torch.distributed.all_gather(output_tensors, tensor)
|
||||
return torch.cat(output_tensors, dim=0)
|
||||
def _gpu_gather_one(tensor):
|
||||
if tensor.ndim == 0:
|
||||
tensor = tensor.clone()[None]
|
||||
output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())]
|
||||
torch.distributed.all_gather(output_tensors, tensor)
|
||||
return torch.cat(output_tensors, dim=0)
|
||||
|
||||
return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
|
||||
|
||||
|
||||
_cpu_gather = _gpu_gather
|
||||
@ -226,15 +405,15 @@ def gather(tensor):
|
||||
Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of :obj:`torch.Tensor`):
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to gather.
|
||||
|
||||
Returns:
|
||||
The same data structure as :obj:`tensor` with all tensors sent to the proper device.
|
||||
The same data structure as `tensor` with all tensors sent to the proper device.
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
return _tpu_gather(tensor, name="accelerate.utils.gather")
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_GPU:
|
||||
elif AcceleratorState().distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
return _gpu_gather(tensor)
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_CPU:
|
||||
return _cpu_gather(tensor)
|
||||
@ -242,60 +421,251 @@ def gather(tensor):
|
||||
return tensor
|
||||
|
||||
|
||||
def _gpu_gather_object(object: Any):
|
||||
def _gpu_gather_object_one(object: Any):
|
||||
output_objects = [None for _ in range(AcceleratorState().num_processes)]
|
||||
torch.distributed.all_gather_object(output_objects, object)
|
||||
return output_objects
|
||||
|
||||
return recursively_apply(_gpu_gather_object_one, object)
|
||||
|
||||
|
||||
_cpu_gather_object = _gpu_gather_object
|
||||
|
||||
|
||||
def gather_object(object: Any):
|
||||
"""
|
||||
Recursively gather object in a nested list/tuple/dictionary of objects from all devices.
|
||||
|
||||
Args:
|
||||
object (nested list/tuple/dictionary of picklable object):
|
||||
The data to gather.
|
||||
|
||||
Returns:
|
||||
The same data structure as `object` with all the objects sent to every device.
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
raise NotImplementedError("gather objects in TPU is not supported")
|
||||
elif AcceleratorState().distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
return _gpu_gather_object(object)
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_CPU:
|
||||
return _cpu_gather_object(object)
|
||||
else:
|
||||
return object
|
||||
|
||||
|
||||
def _gpu_broadcast(data, src=0):
|
||||
def _gpu_broadcast_one(tensor, src=0):
|
||||
torch.distributed.broadcast(tensor, src=src)
|
||||
return tensor
|
||||
|
||||
return recursively_apply(_gpu_broadcast_one, data, error_on_other_type=True, src=src)
|
||||
|
||||
|
||||
def _tpu_broadcast(tensor, src=0, name="broadcast tensor"):
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (_tpu_broadcast(t, name=f"{name}_{i}") for i, t in enumerate(tensor)))
|
||||
elif isinstance(tensor, Mapping):
|
||||
return type(tensor)({k: _tpu_broadcast(v, name=f"{name}_{k}") for k, v in tensor.items()})
|
||||
return xm.mesh_reduce(name, tensor, lambda x: x[src])
|
||||
|
||||
|
||||
def broadcast(tensor, from_process: int = 0):
|
||||
"""
|
||||
Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to broadcast.
|
||||
from_process (`int`, *optional*, defaults to 0):
|
||||
The process from which to send the data
|
||||
|
||||
Returns:
|
||||
The same data structure as `tensor` with all tensors broadcast to the proper device.
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
return _tpu_broadcast(tensor, src=from_process, name="accelerate.utils.broadcast")
|
||||
elif AcceleratorState().distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
return _gpu_broadcast(tensor, src=from_process)
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_CPU:
|
||||
return _gpu_broadcast(tensor, src=from_process)
|
||||
else:
|
||||
return tensor
|
||||
|
||||
|
||||
def broadcast_object_list(object_list, from_process: int = 0):
|
||||
"""
|
||||
Broadcast a list of picklable objects from one process to the others.
|
||||
|
||||
Args:
|
||||
object_list (list of picklable objects):
|
||||
The list of objects to broadcast. This list will be modified in place.
|
||||
from_process (`int`, *optional*, defaults to 0):
|
||||
The process from which to send the data.
|
||||
|
||||
Returns:
|
||||
The same list containing the objects from process 0.
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
for i, obj in enumerate(object_list):
|
||||
object_list[i] = xm.mesh_reduce("accelerate.utils.broadcast_object_list", obj, lambda x: x[from_process])
|
||||
elif AcceleratorState().distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
torch.distributed.broadcast_object_list(object_list, src=from_process)
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_CPU:
|
||||
torch.distributed.broadcast_object_list(object_list, src=from_process)
|
||||
return object_list
|
||||
|
||||
|
||||
def slice_tensors(data, tensor_slice):
|
||||
"""
|
||||
Recursively takes a slice in a nested list/tuple/dictionary of tensors.
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to slice.
|
||||
tensor_slice (`slice`):
|
||||
The slice to take.
|
||||
|
||||
Returns:
|
||||
The same data structure as `data` with all the tensors sliced.
|
||||
"""
|
||||
|
||||
def _slice_tensor(tensor, tensor_slice):
|
||||
return tensor[tensor_slice]
|
||||
|
||||
return recursively_apply(_slice_tensor, data, tensor_slice)
|
||||
|
||||
|
||||
def find_batch_size(data):
|
||||
"""
|
||||
Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.
|
||||
|
||||
Returns:
|
||||
`int`: The batch size.
|
||||
"""
|
||||
if isinstance(data, (tuple, list)):
|
||||
return find_batch_size(data[0])
|
||||
elif isinstance(data, Mapping):
|
||||
for k in data.keys():
|
||||
return find_batch_size(data[k])
|
||||
elif not isinstance(data, torch.Tensor):
|
||||
raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
|
||||
return data.shape[0]
|
||||
|
||||
|
||||
def concatenate(data, dim=0):
|
||||
"""
|
||||
Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
|
||||
The data to concatenate.
|
||||
dim (`int`, *optional*, defaults to 0):
|
||||
The dimension on which to concatenate.
|
||||
|
||||
Returns:
|
||||
The same data structure as `data` with all the tensors concatenated.
|
||||
"""
|
||||
if isinstance(data[0], (tuple, list)):
|
||||
return honor_type(data[0], (concatenate([d[i] for d in data], dim=dim) for i in range(len(data[0]))))
|
||||
elif isinstance(data[0], Mapping):
|
||||
return type(data[0])({k: concatenate([d[k] for d in data], dim=dim) for k in data[0].keys()})
|
||||
elif not isinstance(data[0], torch.Tensor):
|
||||
raise TypeError(f"Can only concatenate tensors but got {type(data[0])}")
|
||||
return torch.cat(data, dim=dim)
|
||||
|
||||
|
||||
def pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
|
||||
"""
|
||||
Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
|
||||
can safely be gathered.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of :obj:`torch.Tensor`):
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to gather.
|
||||
dim (:obj:`int`, `optional`, defaults to 0):
|
||||
dim (`int`, *optional*, defaults to 0):
|
||||
The dimension on which to pad.
|
||||
pad_index (:obj:`int`, `optional`, defaults to 0):
|
||||
pad_index (`int`, *optional*, defaults to 0):
|
||||
The value with which to pad.
|
||||
pad_first (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
pad_first (`bool`, *optional*, defaults to `False`):
|
||||
Whether to pad at the beginning or the end.
|
||||
"""
|
||||
if isinstance(tensor, (list, tuple)):
|
||||
return honor_type(tensor, (pad_across_processes(t, dim=dim, pad_index=pad_index) for t in tensor))
|
||||
elif isinstance(tensor, dict):
|
||||
return type(tensor)({k: pad_across_processes(v, dim=dim, pad_index=pad_index) for k, v in tensor.items()})
|
||||
elif not isinstance(tensor, torch.Tensor):
|
||||
raise TypeError(f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors.")
|
||||
|
||||
if dim >= len(tensor.shape):
|
||||
return tensor
|
||||
def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
|
||||
if dim >= len(tensor.shape):
|
||||
return tensor
|
||||
|
||||
# Gather all sizes
|
||||
size = torch.tensor(tensor.shape, device=tensor.device)[None]
|
||||
sizes = gather(size).cpu()
|
||||
# Then pad to the maximum size
|
||||
max_size = max(s[dim] for s in sizes)
|
||||
if max_size == tensor.shape[dim]:
|
||||
return tensor
|
||||
# Gather all sizes
|
||||
size = torch.tensor(tensor.shape, device=tensor.device)[None]
|
||||
sizes = gather(size).cpu()
|
||||
# Then pad to the maximum size
|
||||
max_size = max(s[dim] for s in sizes)
|
||||
if max_size == tensor.shape[dim]:
|
||||
return tensor
|
||||
|
||||
old_size = tensor.shape
|
||||
new_size = list(old_size)
|
||||
new_size[dim] = max_size
|
||||
new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index
|
||||
if pad_first:
|
||||
indices = tuple(
|
||||
slice(max_size - old_size[dim], max_size) if i == dim else slice(None) for i in range(len(new_size))
|
||||
)
|
||||
else:
|
||||
indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
|
||||
new_tensor[indices] = tensor
|
||||
return new_tensor
|
||||
old_size = tensor.shape
|
||||
new_size = list(old_size)
|
||||
new_size[dim] = max_size
|
||||
new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index
|
||||
if pad_first:
|
||||
indices = tuple(
|
||||
slice(max_size - old_size[dim], max_size) if i == dim else slice(None) for i in range(len(new_size))
|
||||
)
|
||||
else:
|
||||
indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
|
||||
new_tensor[indices] = tensor
|
||||
return new_tensor
|
||||
|
||||
return recursively_apply(
|
||||
_pad_across_processes, tensor, error_on_other_type=True, dim=dim, pad_index=pad_index, pad_first=pad_first
|
||||
)
|
||||
|
||||
|
||||
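A hedged sketch of how `pad_across_processes` and `gather` are meant to be combined inside a script launched with `accelerate launch`, when each process produces a different number of rows:

```python
# Runs inside a distributed script; outside of one, both calls fall through unchanged.
import torch

from accelerate.utils import gather, pad_across_processes

# Each process may end up with a different number of predictions for a step
predictions = torch.randint(0, 10, (torch.randint(1, 5, ()).item(),))

# Pad the ragged dimension so every process has the same shape, then gather
padded = pad_across_processes(predictions, dim=0, pad_index=-100)
all_predictions = gather(padded)
```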
def reduce(tensor, reduction="mean"):
|
||||
"""
|
||||
Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by
applying a given reduction operation.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to reduce.
|
||||
reduction (`str`, *optional*, defaults to `"mean"`):
A reduction method. Can be one of "mean", "sum", or "none".
|
||||
|
||||
Returns:
|
||||
The same data structure as `data` with all the tensors reduced.
|
||||
"""
|
||||
|
||||
def _reduce_across_processes(tensor, reduction="mean"):
|
||||
state = AcceleratorState()
|
||||
cloned_tensor = tensor.clone()
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
xm.all_reduce("sum", cloned_tensor)
|
||||
return cloned_tensor
|
||||
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
torch.distributed.reduce(cloned_tensor, ReduceOp.SUM)
|
||||
return cloned_tensor
|
||||
else:
|
||||
if reduction == "sum":
|
||||
return cloned_tensor.sum()
|
||||
else:
|
||||
return cloned_tensor.mean()
|
||||
|
||||
return recursively_apply(_reduce_across_processes, tensor, error_on_other_type=True, reduction=reduction)
|
||||
|
||||
|
||||
def wait_for_everyone():
|
||||
"""
|
||||
Introduces a blocking point in the script, making sure all processes have reached this point before continuing.
|
||||
|
||||
Warning::
|
||||
<Tip warning={true}>
|
||||
|
||||
Make sure all processes will reach this instruction otherwise one of your processes will hang forever.
|
||||
Make sure all processes will reach this instruction otherwise one of your processes will hang forever.
|
||||
|
||||
</Tip>
|
||||
"""
|
||||
if (
|
||||
AcceleratorState().distributed_type == DistributedType.MULTI_GPU
|
||||
@ -309,7 +679,7 @@ def wait_for_everyone():
|
||||
|
||||
def save(obj, f):
|
||||
"""
|
||||
Save the data to disk. Use in place of :obj:`torch.save()`.
|
||||
Save the data to disk. Use in place of `torch.save()`.
|
||||
|
||||
Args:
|
||||
obj: The data to save
|
||||
@ -326,18 +696,30 @@ class PrepareForLaunch:
|
||||
Prepare a function that will be launched in a distributed setup.
|
||||
|
||||
Args:
|
||||
launcher (:obj:`Callable`):
|
||||
launcher (`Callable`):
|
||||
The function to launch.
|
||||
distributed_type (:class:`~accelerate.state.DistributedType`):
|
||||
distributed_type ([`~state.DistributedType`]):
|
||||
The distributed type to prepare for.
|
||||
debug (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not this is a debug launch.
|
||||
"""
|
||||
|
||||
def __init__(self, launcher, distributed_type="NO"):
|
||||
def __init__(self, launcher, distributed_type="NO", debug=False):
|
||||
self.launcher = launcher
|
||||
self.distributed_type = DistributedType(distributed_type)
|
||||
self.debug = debug
|
||||
|
||||
def __call__(self, index, *args):
|
||||
if self.distributed_type == DistributedType.MULTI_GPU or self.distributed_type == DistributedType.MULTI_CPU:
|
||||
if self.debug:
|
||||
world_size = int(os.environ.get("WORLD_SIZE"))
|
||||
rdv_file = os.environ.get("ACCELERATE_DEBUG_RDV_FILE")
|
||||
torch.distributed.init_process_group(
|
||||
"gloo",
|
||||
rank=index,
|
||||
store=torch.distributed.FileStore(rdv_file, world_size),
|
||||
world_size=world_size,
|
||||
)
|
||||
elif self.distributed_type == DistributedType.MULTI_GPU or self.distributed_type == DistributedType.MULTI_CPU:
|
||||
# Prepare the environment for torch.distributed
|
||||
os.environ["LOCAL_RANK"] = str(index)
|
||||
os.environ["RANK"] = str(index)
|
||||
@ -388,4 +770,85 @@ class DeepSpeedPlugin:
|
||||
},
|
||||
},
|
||||
"steps_per_print": float("inf"), # this will stop deepspeed from logging @ stdout
|
||||
"zero_allow_untested_optimizer": True,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class FullyShardedDataParallelPlugin:
|
||||
"""
|
||||
This plugin is used to enable fully sharded data parallelism.
|
||||
"""
|
||||
|
||||
sharding_strategy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are [1] FULL_SHARD, [2] SHARD_GRAD_OP"},
|
||||
)
|
||||
backward_prefetch: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are [1] BACKWARD_PRE, [2] BACKWARD_POST"},
|
||||
)
|
||||
auto_wrap_policy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "A callable specifying a policy to recursively wrap layers with FSDP"},
|
||||
)
|
||||
cpu_offload: Optional[Callable] = field(
|
||||
default=None,
|
||||
metadata={"help": "Decides Whether to offload parameters and gradients to CPU."},
|
||||
)
|
||||
min_num_params: int = field(
|
||||
default=None, metadata={"help": "FSDP's minimum number of parameters for Default Auto Wrapping."}
|
||||
)
|
||||
ignored_modules: Optional[Iterable[torch.nn.Module]] = field(
|
||||
default=None,
|
||||
metadata={"help": "A list of modules to ignore for FSDP."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, ShardingStrategy
|
||||
from torch.distributed.fsdp.wrap import default_auto_wrap_policy
|
||||
|
||||
if self.sharding_strategy is None:
|
||||
self.sharding_strategy = ShardingStrategy(int(os.environ.get("FSDP_SHARDING_STRATEGY", 1)))
|
||||
|
||||
if self.cpu_offload is None:
|
||||
if os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true":
|
||||
self.cpu_offload = CPUOffload(offload_params=True)
|
||||
else:
|
||||
self.cpu_offload = CPUOffload(offload_params=False)
|
||||
|
||||
if self.min_num_params is None:
|
||||
self.min_num_params = int(os.environ.get("FSDP_MIN_NUM_PARAMS", 0))
|
||||
|
||||
if self.auto_wrap_policy is None:
|
||||
if self.min_num_params > 0:
|
||||
self.auto_wrap_policy = functools.partial(default_auto_wrap_policy, min_num_params=self.min_num_params)
|
||||
|
||||
|
||||
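A sketch of configuring the plugin in code rather than through the `FSDP_*` environment variables read in `__post_init__` above; passing it via `Accelerator(fsdp_plugin=...)` is an assumption about how the plugin is consumed, and the `min_num_params` value is arbitrary:

```python
# Assumes PyTorch with torch.distributed.fsdp available and a script started
# via `accelerate launch` with FSDP enabled.
from accelerate import Accelerator
from accelerate.utils import FullyShardedDataParallelPlugin

# Wrap every submodule with at least 2000 parameters (arbitrary threshold)
fsdp_plugin = FullyShardedDataParallelPlugin(min_num_params=2000)

# Assumption: the Accelerator accepts the plugin through an `fsdp_plugin` argument
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
```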
@contextmanager
|
||||
def patch_environment(**kwargs):
|
||||
"""
|
||||
A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.
|
||||
|
||||
Will convert the values in `kwargs` to strings and upper-case all the keys.
|
||||
"""
|
||||
for key, value in kwargs.items():
|
||||
os.environ[key.upper()] = str(value)
|
||||
|
||||
yield
|
||||
|
||||
for key in kwargs:
|
||||
del os.environ[key.upper()]
|
||||
|
||||
|
||||
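A small usage sketch of `patch_environment` (the keys and values are arbitrary):

```python
import os

from accelerate.utils import patch_environment

with patch_environment(fsdp_min_num_params=200, use_fp16=True):
    # Keys are upper-cased and values stringified while inside the block
    assert os.environ["FSDP_MIN_NUM_PARAMS"] == "200"
    assert os.environ["USE_FP16"] == "True"

# Removed again on exit (assuming the key was not set beforehand)
assert "FSDP_MIN_NUM_PARAMS" not in os.environ
```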
def get_pretty_name(obj):
|
||||
"""
|
||||
Gets a pretty name from `obj`.
|
||||
"""
|
||||
if not hasattr(obj, "__qualname__") and not hasattr(obj, "__name__"):
|
||||
obj = getattr(obj, "__class__", obj)
|
||||
if hasattr(obj, "__qualname__"):
|
||||
return obj.__qualname__
|
||||
if hasattr(obj, "__name__"):
|
||||
return obj.__name__
|
||||
return str(obj)
|
||||
|
||||
tests/test_cpu.py (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
from accelerate import debug_launcher
|
||||
from accelerate.test_utils import test_script
|
||||
|
||||
|
||||
class MultiTPUTester(unittest.TestCase):
|
||||
def test_cpu(self):
|
||||
debug_launcher(test_script.main)
|
||||
tests/test_examples.py (new file, 245 lines)
@@ -0,0 +1,245 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import DistributedType
|
||||
from accelerate.test_utils.examples import compare_against_test
|
||||
from accelerate.test_utils.testing import TempDirTestCase, slow
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
SRC_DIRS = [os.path.abspath(os.path.join("examples", "by_feature"))]
|
||||
sys.path.extend(SRC_DIRS)
|
||||
|
||||
if SRC_DIRS is not None:
|
||||
import checkpointing
|
||||
import cross_validation
|
||||
import multi_process_metrics
|
||||
import tracking
|
||||
|
||||
# DataLoaders built from `test_samples/MRPC` for quick testing
|
||||
# Should mock `{script_name}.get_dataloaders` via:
|
||||
# @mock.patch("{script_name}.get_dataloaders", mocked_dataloaders)
|
||||
|
||||
EXCLUDE_EXAMPLES = ["cross_validation.py", "multi_process_metrics.py", "memory.py"]
|
||||
|
||||
|
||||
def mocked_dataloaders(accelerator, batch_size: int = 16):
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
|
||||
datasets = load_dataset("csv", data_files=data_files)
|
||||
label_list = datasets["train"].unique("label")
|
||||
|
||||
label_to_id = {v: i for i, v in enumerate(label_list)}
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(
|
||||
examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
|
||||
)
|
||||
if "label" in examples:
|
||||
outputs["labels"] = [label_to_id[l] for l in examples["label"]]
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["sentence1", "sentence2", "label"],
|
||||
)
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
|
||||
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
class ExampleDifferenceTests(unittest.TestCase):
|
||||
"""
|
||||
This TestCase checks that all of the `complete_*` scripts contain all of the
|
||||
information found in the `by_feature` scripts, line for line. If one fails,
|
||||
then a complete example does not contain all of the features in the features
|
||||
scripts, and should be updated.
|
||||
|
||||
Each example script should be a single test (such as `test_nlp_example`),
|
||||
and should run `one_complete_example` twice: once with `parser_only=True`,
|
||||
and the other with `parser_only=False`. This is so that when the test
|
||||
failures are returned to the user, they understand if the discrepancy lies in
|
||||
the `main` function, or the `training_loop` function. Otherwise it will be
|
||||
unclear.
|
||||
|
||||
Also, if there are any expected differences between the base script used and
|
||||
`complete_nlp_example.py` (the canonical base script), these should be included in
|
||||
`special_strings`. These would be differences in how something is logged, print statements,
|
||||
etc. (such as calls to `Accelerator.log()`).
|
||||
"""
|
||||
|
||||
def one_complete_example(
|
||||
self, complete_file_name: str, parser_only: bool, secondary_filename: str = None, special_strings: list = None
|
||||
):
|
||||
"""
|
||||
Tests a single `complete` example against all of the implemented `by_feature` scripts
|
||||
|
||||
Args:
|
||||
complete_file_name (`str`):
|
||||
The filename of a complete example
|
||||
parser_only (`bool`):
|
||||
Whether to look at the main training function, or the argument parser
|
||||
secondary_filename (`str`, *optional*):
|
||||
A potential secondary base file to strip all script information not relevant for checking,
|
||||
such as "cv_example.py" when testing "complete_cv_example.py"
|
||||
special_strings (`list`, *optional*):
|
||||
A list of strings to potentially remove before checking no differences are left. These should be
|
||||
diffs that are file specific, such as different logging variations between files.
|
||||
"""
|
||||
self.maxDiff = None
|
||||
by_feature_path = os.path.abspath(os.path.join("examples", "by_feature"))
|
||||
examples_path = os.path.abspath("examples")
|
||||
for item in os.listdir(by_feature_path):
|
||||
if item not in EXCLUDE_EXAMPLES:
|
||||
item_path = os.path.join(by_feature_path, item)
|
||||
if os.path.isfile(item_path) and ".py" in item_path:
|
||||
with self.subTest(
|
||||
tested_script=complete_file_name,
|
||||
feature_script=item,
|
||||
tested_section="main()" if parser_only else "training_function()",
|
||||
):
|
||||
diff = compare_against_test(
|
||||
os.path.join(examples_path, complete_file_name), item_path, parser_only, secondary_filename
|
||||
)
|
||||
diff = "\n".join(diff)
|
||||
if special_strings is not None:
|
||||
for string in special_strings:
|
||||
diff = diff.replace(string, "")
|
||||
self.assertEqual(diff, "")
|
||||
|
||||
def test_nlp_examples(self):
|
||||
self.one_complete_example("complete_nlp_example.py", True)
|
||||
self.one_complete_example("complete_nlp_example.py", False)
|
||||
|
||||
def test_cv_examples(self):
|
||||
cv_path = os.path.abspath(os.path.join("examples", "cv_example.py"))
|
||||
special_strings = [
|
||||
" " * 16 + "{\n\n",
|
||||
" " * 18 + '"accuracy": eval_metric["accuracy"],\n\n',
|
||||
" " * 18 + '"f1": eval_metric["f1"],\n\n',
|
||||
" " * 18 + '"train_loss": total_loss,\n\n',
|
||||
" " * 18 + '"epoch": epoch,\n\n',
|
||||
" " * 16 + "}\n",
|
||||
" " * 8,
|
||||
]
|
||||
self.one_complete_example("complete_cv_example.py", True, cv_path, special_strings)
|
||||
self.one_complete_example("complete_cv_example.py", False, cv_path, special_strings)
|
||||
|
||||
|
||||
class FeatureExamplesTests(TempDirTestCase):
|
||||
clear_on_setup = False
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_checkpointing_by_epoch(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--checkpointing_steps epoch
|
||||
--output_dir {self.tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "epoch_1")))
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_checkpointing_by_steps(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--checkpointing_steps 2
|
||||
--output_dir {self.tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "step_4")))
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_load_states_by_epoch(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--resume_from_checkpoint {os.path.join(self.tmpdir, "epoch_1")}
|
||||
""".split()
|
||||
dummy_results = {"accuracy": mock.ANY, "f1": mock.ANY}
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 0:", dummy_results)
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 1:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 2:", dummy_results)
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_load_states_by_steps(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--resume_from_checkpoint {os.path.join(self.tmpdir, "step_4")}
|
||||
""".split()
|
||||
dummy_results = {"accuracy": mock.ANY, "f1": mock.ANY}
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 0:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 1:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 2:", dummy_results)
|
||||
|
||||
@slow
|
||||
def test_cross_validation(self):
|
||||
testargs = """
|
||||
cross_validation.py
|
||||
--num_folds 2
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
cross_validation.main()
|
||||
call = mocked_print.mock_calls[-1]
|
||||
self.assertGreaterEqual(call.args[1]["accuracy"], 0.75)
|
||||
|
||||
@mock.patch("multi_process_metrics.get_dataloaders", mocked_dataloaders)
|
||||
def test_multi_process_metrics(self):
|
||||
testargs = ["multi_process_metrics.py"]
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
multi_process_metrics.main()
|
||||
|
||||
@mock.patch("tracking.get_dataloaders", mocked_dataloaders)
|
||||
def test_tracking(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
testargs = f"""
|
||||
tracking.py
|
||||
--with_tracking
|
||||
--logging_dir {tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
tracking.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(tmpdir, "tracking")))
|
||||
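Tying the tracking pieces together, a hedged sketch of what a script like the `tracking.py` example exercised above presumably does; the `Accelerator` method and argument names here are assumptions of this sketch, not confirmed by the diff:

```python
# Assumed API: Accelerator(log_with=..., logging_dir=...), init_trackers, log, end_training
from accelerate import Accelerator

accelerator = Accelerator(log_with="all", logging_dir="runs")
accelerator.init_trackers("my_project", config={"lr": 3e-4, "epochs": 3})

for step in range(10):
    loss = 0.1 * step  # placeholder metric
    accelerator.log({"train_loss": loss}, step=step)

accelerator.end_training()
```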
tests/test_memory_utils.py (new file, 91 lines)
@@ -0,0 +1,91 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from accelerate.memory_utils import find_executable_batch_size


def raise_fake_out_of_memory():
    raise RuntimeError("CUDA out of memory.")


class MemoryTest(unittest.TestCase):
    def test_memory_implicit(self):
        batch_sizes = []

        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size):
            nonlocal batch_sizes
            batch_sizes.append(batch_size)
            if batch_size != 8:
                raise_fake_out_of_memory()

        mock_training_loop_function()
        self.assertListEqual(batch_sizes, [128, 64, 32, 16, 8])

    def test_memory_explicit(self):
        batch_sizes = []

        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size, arg1):
            nonlocal batch_sizes
            batch_sizes.append(batch_size)
            if batch_size != 8:
                raise_fake_out_of_memory()
            return batch_size, arg1

        bs, arg1 = mock_training_loop_function("hello")
        self.assertListEqual(batch_sizes, [128, 64, 32, 16, 8])
        self.assertListEqual([bs, arg1], [8, "hello"])

    def test_start_zero(self):
        @find_executable_batch_size(starting_batch_size=0)
        def mock_training_loop_function(batch_size):
            pass

        with self.assertRaises(RuntimeError) as cm:
            mock_training_loop_function()
        self.assertIn("No executable batch size found, reached zero.", cm.exception.args[0])

    def test_approach_zero(self):
        @find_executable_batch_size(starting_batch_size=16)
        def mock_training_loop_function(batch_size):
            if batch_size > 0:
                raise_fake_out_of_memory()
            pass

        with self.assertRaises(RuntimeError) as cm:
            mock_training_loop_function()
        self.assertIn("No executable batch size found, reached zero.", cm.exception.args[0])

    def test_verbose_guard(self):
        @find_executable_batch_size(starting_batch_size=128)
        def mock_training_loop_function(batch_size, arg1, arg2):
            if batch_size != 8:
                raise raise_fake_out_of_memory()

        with self.assertRaises(TypeError) as cm:
            mock_training_loop_function(128, "hello", "world")
        self.assertIn("Batch size was passed into `f`", cm.exception.args[0])
        self.assertIn("`f(arg1='hello', arg2='world')", cm.exception.args[0])

    def test_any_other_error(self):
        @find_executable_batch_size(starting_batch_size=16)
        def mock_training_loop_function(batch_size):
            raise ValueError("Oops, we had an error!")

        with self.assertRaises(ValueError) as cm:
            mock_training_loop_function()
        self.assertIn("Oops, we had an error!", cm.exception.args[0])
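As a usage note, the decorator tested above wraps a function whose first argument is the batch size; it retries the whole body with a halved batch size whenever a CUDA out-of-memory `RuntimeError` escapes, and gives up with its own `RuntimeError` once the size reaches zero. A minimal sketch of applying it to a toy training loop, assuming the same `accelerate.memory_utils` import path used in the test:

import torch

from accelerate.memory_utils import find_executable_batch_size


@find_executable_batch_size(starting_batch_size=64)
def train(batch_size, model, optimizer):
    # `batch_size` is injected by the decorator; callers only pass `model`
    # and `optimizer`. An OOM error restarts this body with batch_size // 2.
    for _ in range(10):
        optimizer.zero_grad()
        loss = model(torch.randn(batch_size, 2)).sum()
        loss.backward()
        optimizer.step()
    return batch_size


model = torch.nn.Linear(2, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
used_batch_size = train(model, optimizer)  # 64 on hardware that never runs out of memory here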
7  tests/test_samples/MRPC/dev.csv  Normal file
@ -0,0 +1,7 @@
label,sentence1,sentence2
equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
7  tests/test_samples/MRPC/train.csv  Normal file
@ -0,0 +1,7 @@
label,sentence1,sentence2
equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
62  tests/test_scheduler.py  Normal file
@ -0,0 +1,62 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from functools import partial

import torch

from accelerate import Accelerator, debug_launcher


def scheduler_test(num_processes=2, step_scheduler_with_optimizer=True, split_batches=False):
    accelerator = Accelerator(step_scheduler_with_optimizer=step_scheduler_with_optimizer, split_batches=split_batches)
    model = torch.nn.Linear(2, 4)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda n: 1 - n / 10)

    model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)

    # Optimizer has stepped
    optimizer._is_overflow = False
    scheduler.step()
    expected_lr = 1 - (num_processes if (step_scheduler_with_optimizer and not split_batches) else 1) / 10
    assert (
        scheduler.get_last_lr()[0] == expected_lr
    ), f"Wrong lr found at first step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"

    # Optimizer has not stepped
    optimizer._is_overflow = True
    scheduler.step()
    if not step_scheduler_with_optimizer:
        expected_lr = 1 - 2 / 10
    assert (
        scheduler.get_last_lr()[0] == expected_lr
    ), f"Wrong lr found at second step, expected {expected_lr}, got {scheduler.get_last_lr()[0]}"


class SchedulerTester(unittest.TestCase):
    def test_scheduler_steps_with_optimizer_single_process(self):
        debug_launcher(partial(scheduler_test, num_processes=1), num_processes=1)
        debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)

    def test_scheduler_not_step_with_optimizer_single_process(self):
        debug_launcher(partial(scheduler_test, num_processes=1, step_scheduler_with_optimizer=False), num_processes=1)

    def test_scheduler_steps_with_optimizer_multiprocess(self):
        debug_launcher(scheduler_test)
        debug_launcher(partial(scheduler_test, num_processes=1, split_batches=True), num_processes=1)

    def test_scheduler_not_step_with_optimizer_multiprocess(self):
        debug_launcher(partial(scheduler_test, step_scheduler_with_optimizer=False))
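The helper above is run through `debug_launcher`, which simulates a multi-process launch. What it verifies is that a scheduler passed through `accelerator.prepare` advances the underlying schedule by `num_processes` per `scheduler.step()` call when `step_scheduler_with_optimizer=True` and `split_batches=False`, and that it skips stepping when the optimizer step was skipped due to gradient overflow. A minimal user-side sketch under those same assumptions:

import torch

from accelerate import Accelerator

accelerator = Accelerator()  # step_scheduler_with_optimizer defaults to True

model = torch.nn.Linear(2, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda n: 1 - n / 10)

# The wrapped scheduler keeps the schedule in sync with the total number of
# optimizer steps taken across all processes.
model, optimizer, scheduler = accelerator.prepare(model, optimizer, scheduler)

loss = model(torch.randn(8, 2)).sum()
accelerator.backward(loss)
optimizer.step()
scheduler.step()
optimizer.zero_grad()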
165  tests/test_state_checkpointing.py  Normal file
@ -0,0 +1,165 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
|
||||
from accelerate import Accelerator
|
||||
from accelerate.utils import set_seed
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def dummy_dataloaders(a=2, b=3, batch_size=16, n_train_batches: int = 10, n_valid_batches: int = 2):
|
||||
"Generates a tuple of dummy DataLoaders to test with"
|
||||
|
||||
def get_dataset(n_batches):
|
||||
x = torch.randn(batch_size * n_batches, 1)
|
||||
return TensorDataset(x, a * x + b + 0.1 * torch.randn(batch_size * n_batches, 1))
|
||||
|
||||
train_dataset = get_dataset(n_train_batches)
|
||||
valid_dataset = get_dataset(n_valid_batches)
|
||||
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
|
||||
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size, num_workers=4)
|
||||
return (train_dataloader, valid_dataloader)
|
||||
|
||||
|
||||
def train(num_epochs, model, dataloader, optimizer, accelerator, scheduler=None):
|
||||
"Trains for `num_epochs`"
|
||||
rands = []
|
||||
for epoch in range(num_epochs):
|
||||
# Train quickly
|
||||
model.train()
|
||||
for batch in dataloader:
|
||||
x, y = batch
|
||||
outputs = model(x)
|
||||
loss = torch.nn.functional.mse_loss(outputs, y)
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
rands.append(random.random()) # Introduce some randomness
|
||||
if scheduler is not None:
|
||||
scheduler.step()
|
||||
return rands
|
||||
|
||||
|
||||
class DummyModel(nn.Module):
|
||||
"Simple model to do y=mx+b"
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.a = nn.Parameter(torch.randn(1))
|
||||
self.b = nn.Parameter(torch.randn(1))
|
||||
|
||||
def forward(self, x):
|
||||
return x * self.a + self.b
|
||||
|
||||
|
||||
class CheckpointTest(unittest.TestCase):
|
||||
def test_can_resume_training(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
set_seed(42)
|
||||
model = DummyModel()
|
||||
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
|
||||
train_dataloader, valid_dataloader = dummy_dataloaders()
|
||||
# Train baseline
|
||||
accelerator = Accelerator()
|
||||
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, valid_dataloader
|
||||
)
|
||||
# Save initial
|
||||
initial = os.path.join(tmpdir, "initial")
|
||||
accelerator.save_state(initial)
|
||||
(a, b) = model.a.item(), model.b.item()
|
||||
opt_state = optimizer.state_dict()
|
||||
ground_truth_rands = train(3, model, train_dataloader, optimizer, accelerator)
|
||||
(a1, b1) = model.a.item(), model.b.item()
|
||||
opt_state1 = optimizer.state_dict()
|
||||
|
||||
# Train partially
|
||||
set_seed(42)
|
||||
model = DummyModel()
|
||||
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
|
||||
train_dataloader, valid_dataloader = dummy_dataloaders()
|
||||
accelerator = Accelerator()
|
||||
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, valid_dataloader
|
||||
)
|
||||
accelerator.load_state(initial)
|
||||
(a2, b2) = model.a.item(), model.b.item()
|
||||
opt_state2 = optimizer.state_dict()
|
||||
self.assertEqual(a, a2)
|
||||
self.assertEqual(b, b2)
|
||||
self.assertEqual(opt_state, opt_state2)
|
||||
|
||||
test_rands = train(2, model, train_dataloader, optimizer, accelerator)
|
||||
# Save everything
|
||||
checkpoint = os.path.join(tmpdir, "checkpoint")
|
||||
accelerator.save_state(checkpoint)
|
||||
|
||||
# Load everything back in and make sure all states work
|
||||
accelerator.load_state(checkpoint)
|
||||
test_rands += train(1, model, train_dataloader, optimizer, accelerator)
|
||||
(a3, b3) = model.a.item(), model.b.item()
|
||||
opt_state3 = optimizer.state_dict()
|
||||
self.assertEqual(a1, a3)
|
||||
self.assertEqual(b1, b3)
|
||||
self.assertEqual(opt_state1, opt_state3)
|
||||
self.assertEqual(ground_truth_rands, test_rands)
|
||||
|
||||
def test_invalid_registration(self):
|
||||
t = torch.tensor([1, 2, 3])
|
||||
t1 = torch.tensor([2, 3, 4])
|
||||
net = DummyModel()
|
||||
opt = torch.optim.Adam(net.parameters())
|
||||
accelerator = Accelerator()
|
||||
with self.assertRaises(ValueError) as ve:
|
||||
accelerator.register_for_checkpointing(t, t1, net, opt)
|
||||
message = str(ve.exception)
|
||||
self.assertTrue("Item at index 0" in message)
|
||||
self.assertTrue("Item at index 1" in message)
|
||||
self.assertFalse("Item at index 2" in message)
|
||||
self.assertFalse("Item at index 3" in message)
|
||||
|
||||
def test_with_scheduler(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
set_seed(42)
|
||||
model = DummyModel()
|
||||
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
|
||||
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
|
||||
train_dataloader, valid_dataloader = dummy_dataloaders()
|
||||
# Train baseline
|
||||
accelerator = Accelerator()
|
||||
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, valid_dataloader
|
||||
)
|
||||
accelerator.register_for_checkpointing(scheduler)
|
||||
# Save initial
|
||||
initial = os.path.join(tmpdir, "initial")
|
||||
accelerator.save_state(initial)
|
||||
scheduler_state = scheduler.state_dict()
|
||||
train(3, model, train_dataloader, optimizer, accelerator, scheduler)
|
||||
self.assertNotEqual(scheduler_state, scheduler.state_dict())
|
||||
|
||||
# Load everything back in and make sure all states work
|
||||
accelerator.load_state(initial)
|
||||
self.assertEqual(scheduler_state, scheduler.state_dict())
|
||||
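Three `Accelerator` methods carry this test: `save_state` and `load_state` for everything returned by `prepare`, plus `register_for_checkpointing` for stateful objects (such as the scheduler here) that are not prepared but still expose `state_dict`/`load_state_dict`. A compact sketch of that flow, kept close to the test above:

import os
import tempfile

import torch

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)

model, optimizer = accelerator.prepare(model, optimizer)
accelerator.register_for_checkpointing(scheduler)  # not prepared, checkpointed anyway

with tempfile.TemporaryDirectory() as tmpdir:
    checkpoint = os.path.join(tmpdir, "checkpoint")
    accelerator.save_state(checkpoint)  # writes model, optimizer and scheduler states
    scheduler.step()                    # mutate some state ...
    accelerator.load_state(checkpoint)  # ... and restore it exactly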
307  tests/test_tracking.py  Normal file
@ -0,0 +1,307 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import unittest
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from unittest import mock
|
||||
|
||||
# We use TF to parse the logs
|
||||
from accelerate import Accelerator
|
||||
from accelerate.test_utils.testing import MockingTestCase, TempDirTestCase, require_tensorflow
|
||||
from accelerate.tracking import CometMLTracker, GeneralTracker
|
||||
from accelerate.utils import is_tensorflow_available
|
||||
from comet_ml import OfflineExperiment
|
||||
|
||||
|
||||
if is_tensorflow_available():
|
||||
import tensorflow as tf
|
||||
from tensorboard.plugins.hparams import plugin_data_pb2
|
||||
from tensorflow.core.util import event_pb2
|
||||
from tensorflow.python.summary.summary_iterator import summary_iterator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TensorBoardTrackingTest(unittest.TestCase):
|
||||
@require_tensorflow
|
||||
def test_init_trackers(self):
|
||||
hps = None
|
||||
project_name = "test_project_with_config"
|
||||
with tempfile.TemporaryDirectory() as dirpath:
|
||||
accelerator = Accelerator(log_with="tensorboard", logging_dir=dirpath)
|
||||
config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
|
||||
accelerator.init_trackers(project_name, config)
|
||||
accelerator.end_training()
|
||||
for child in Path(f"{dirpath}/{project_name}").glob("*/**"):
|
||||
log = list(filter(lambda x: x.is_file(), child.iterdir()))[0]
|
||||
# The config log is stored one layer deeper in the logged directory
|
||||
# And names are randomly generated each time
|
||||
si = summary_iterator(str(log))
|
||||
# Pull HPS through careful parsing
|
||||
for event in si:
|
||||
for value in event.summary.value:
|
||||
proto_bytes = value.metadata.plugin_data.content
|
||||
plugin_data = plugin_data_pb2.HParamsPluginData.FromString(proto_bytes)
|
||||
if plugin_data.HasField("session_start_info"):
|
||||
hps = dict(plugin_data.session_start_info.hparams)
|
||||
|
||||
self.assertTrue(isinstance(hps, dict))
|
||||
keys = list(hps.keys())
|
||||
keys.sort()
|
||||
self.assertEqual(keys, ["learning_rate", "num_iterations", "some_boolean", "some_string"])
|
||||
self.assertEqual(hps["num_iterations"].number_value, 12)
|
||||
self.assertEqual(hps["learning_rate"].number_value, 0.01)
|
||||
self.assertEqual(hps["some_boolean"].bool_value, False)
|
||||
self.assertEqual(hps["some_string"].string_value, "some_value")
|
||||
|
||||
@require_tensorflow
|
||||
def test_log(self):
|
||||
step = None
|
||||
project_name = "test_project_with_log"
|
||||
with tempfile.TemporaryDirectory() as dirpath:
|
||||
accelerator = Accelerator(log_with="tensorboard", logging_dir=dirpath)
|
||||
accelerator.init_trackers(project_name)
|
||||
values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
|
||||
accelerator.log(values, step=0)
|
||||
accelerator.end_training()
|
||||
# Logged values are stored in the outermost-tfevents file and can be read in as a TFRecord
|
||||
# Names are randomly generated each time
|
||||
log = list(filter(lambda x: x.is_file(), Path(f"{dirpath}/{project_name}").iterdir()))[0]
|
||||
serialized_examples = tf.data.TFRecordDataset(log)
|
||||
for e in serialized_examples:
|
||||
event = event_pb2.Event.FromString(e.numpy())
|
||||
if step is None:
|
||||
step = event.step
|
||||
for value in event.summary.value:
|
||||
if value.tag == "total_loss":
|
||||
total_loss = value.simple_value
|
||||
elif value.tag == "iteration":
|
||||
iteration = value.simple_value
|
||||
elif value.tag == "my_text/text_summary": # Append /text_summary to the key
|
||||
my_text = value.tensor.string_val[0].decode()
|
||||
self.assertAlmostEqual(total_loss, values["total_loss"])
|
||||
self.assertEqual(iteration, values["iteration"])
|
||||
self.assertEqual(my_text, values["my_text"])
|
||||
|
||||
def test_logging_dir(self):
|
||||
with self.assertRaisesRegex(ValueError, "Logging with `tensorboard` requires a `logging_dir`"):
|
||||
_ = Accelerator(log_with="tensorboard")
|
||||
with tempfile.TemporaryDirectory() as dirpath:
|
||||
_ = Accelerator(log_with="tensorboard", logging_dir=dirpath)
|
||||
|
||||
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
|
||||
class WandBTrackingTest(TempDirTestCase, MockingTestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
# wandb lets us override where logs are stored via the WANDB_DIR env var
|
||||
self.add_mocks(mock.patch.dict(os.environ, {"WANDB_DIR": self.tmpdir}))
|
||||
|
||||
@staticmethod
|
||||
def get_value_from_log(key: str, log: str, key_occurance: int = 0):
|
||||
"""
|
||||
Parses wandb log for `key` and returns the value.
|
||||
If parsing through multiple calls to .log, pass in a `key_occurance`
|
||||
"""
|
||||
res = re.findall(rf"(?<={key} )[^\s]+", log)[key_occurance]
|
||||
if '"' in res:
|
||||
return re.findall(r'"([^"]*)"', res)[0]
|
||||
else:
|
||||
return res
|
||||
|
||||
def test_init_trackers(self):
|
||||
project_name = "test_project_with_config"
|
||||
accelerator = Accelerator(log_with="wandb")
|
||||
config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
|
||||
accelerator.init_trackers(project_name, config)
|
||||
accelerator.end_training()
|
||||
# The latest offline log is stored at wandb/latest-run/*.wandb
|
||||
for child in Path(f"{self.tmpdir}/wandb/latest-run").glob("*"):
|
||||
logger.info(child)
|
||||
if child.is_file() and child.suffix == ".wandb":
|
||||
with open(child, "rb") as f:
|
||||
content = f.read()
|
||||
break
|
||||
|
||||
# Check HPS through careful parsing and cleaning
|
||||
cleaned_log = re.sub(r"[\x00-\x1f]+", " ", content.decode("utf8", "ignore"))
|
||||
self.assertEqual(self.get_value_from_log("num_iterations", cleaned_log), "12")
|
||||
self.assertEqual(self.get_value_from_log("learning_rate", cleaned_log), "0.01")
|
||||
self.assertEqual(self.get_value_from_log("some_boolean", cleaned_log), "false")
|
||||
self.assertEqual(self.get_value_from_log("some_string", cleaned_log), "some_value")
|
||||
|
||||
def test_log(self):
|
||||
project_name = "test_project_with_log"
|
||||
accelerator = Accelerator(log_with="wandb")
|
||||
accelerator.init_trackers(project_name)
|
||||
values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
|
||||
accelerator.log(values, step=0)
|
||||
accelerator.end_training()
|
||||
# The latest offline log is stored at wandb/latest-run/*.wandb
|
||||
for child in Path(f"{self.tmpdir}/wandb/latest-run").glob("*"):
|
||||
if child.is_file() and child.suffix == ".wandb":
|
||||
with open(child, "rb") as f:
|
||||
content = f.read()
|
||||
break
|
||||
# Check HPS through careful parsing and cleaning
|
||||
cleaned_log = re.sub(r"[\x00-\x1f]+", " ", content.decode("utf8", "ignore"))
|
||||
self.assertTrue("0.1" in self.get_value_from_log("total_loss", cleaned_log))
|
||||
self.assertTrue("1" in self.get_value_from_log("iteration", cleaned_log))
|
||||
self.assertTrue("some_value" in self.get_value_from_log("my_text", cleaned_log))
|
||||
self.assertTrue("0" in self.get_value_from_log("_step", cleaned_log))
|
||||
|
||||
|
||||
# Comet has a special `OfflineExperiment` we need to use for testing
|
||||
def offline_init(self, run_name: str, tmpdir: str):
|
||||
self.run_name = run_name
|
||||
self.writer = OfflineExperiment(project_name=run_name, offline_directory=tmpdir)
|
||||
logger.info(f"Initialized offline CometML project {self.run_name}")
|
||||
logger.info("Make sure to log any initial configurations with `self.store_init_configuration` before training!")
|
||||
|
||||
|
||||
@mock.patch.object(CometMLTracker, "__init__", offline_init)
|
||||
class CometMLTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def get_value_from_key(log_list, key: str, is_param: bool = False):
|
||||
"Extracts `key` from Comet `log`"
|
||||
for log in log_list:
|
||||
j = json.loads(log)["payload"]
|
||||
if is_param and "param" in j.keys():
|
||||
if j["param"]["paramName"] == key:
|
||||
return j["param"]["paramValue"]
|
||||
if "log_other" in j.keys():
|
||||
if j["log_other"]["key"] == key:
|
||||
return j["log_other"]["val"]
|
||||
|
||||
def test_init_trackers(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
tracker = CometMLTracker("test_project_with_config", d)
|
||||
accelerator = Accelerator(log_with=tracker)
|
||||
config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
|
||||
accelerator.init_trackers(None, config)
|
||||
accelerator.end_training()
|
||||
log = os.listdir(d)[0] # Comet is nice, it's just a zip file here
|
||||
# We parse the raw logs
|
||||
p = os.path.join(d, log)
|
||||
archive = zipfile.ZipFile(p, "r")
|
||||
log = archive.open("messages.json").read().decode("utf-8")
|
||||
list_of_json = log.split("\n")[:-1]
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "num_iterations", True), 12)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "learning_rate", True), 0.01)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "some_boolean", True), False)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "some_string", True), "some_value")
|
||||
|
||||
def test_log(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
tracker = CometMLTracker("test_project_with_config", d)
|
||||
accelerator = Accelerator(log_with=tracker)
|
||||
accelerator.init_trackers(None)
|
||||
values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
|
||||
accelerator.log(values, step=0)
|
||||
accelerator.end_training()
|
||||
log = os.listdir(d)[0] # Comet is nice, it's just a zip file here
|
||||
# We parse the raw logs
|
||||
p = os.path.join(d, log)
|
||||
archive = zipfile.ZipFile(p, "r")
|
||||
log = archive.open("messages.json").read().decode("utf-8")
|
||||
list_of_json = log.split("\n")[:-1]
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "curr_step", True), 0)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "total_loss"), 0.1)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "iteration"), 1)
|
||||
self.assertEqual(self.get_value_from_key(list_of_json, "my_text"), "some_value")
|
||||
|
||||
|
||||
class MyCustomTracker(GeneralTracker):
|
||||
"Basic tracker that writes to a csv for testing"
|
||||
_col_names = [
|
||||
"total_loss",
|
||||
"iteration",
|
||||
"my_text",
|
||||
"learning_rate",
|
||||
"num_iterations",
|
||||
"some_boolean",
|
||||
"some_string",
|
||||
]
|
||||
|
||||
requires_logging_directory = False
|
||||
|
||||
def __init__(self, dir: str):
|
||||
self.f = open(f"{dir}/log.csv", "w+")
|
||||
self.writer = csv.DictWriter(self.f, fieldnames=self._col_names)
|
||||
self.writer.writeheader()
|
||||
|
||||
def store_init_configuration(self, values: dict):
|
||||
logger.info("Call init")
|
||||
self.writer.writerow(values)
|
||||
|
||||
def log(self, values: dict, step: Optional[int]):
|
||||
logger.info("Call log")
|
||||
self.writer.writerow(values)
|
||||
|
||||
def finish(self):
|
||||
self.f.close()
|
||||
|
||||
|
||||
class CustomTrackerTestCase(unittest.TestCase):
|
||||
def test_init_trackers(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
tracker = MyCustomTracker(d)
|
||||
accelerator = Accelerator(log_with=tracker)
|
||||
config = {"num_iterations": 12, "learning_rate": 1e-2, "some_boolean": False, "some_string": "some_value"}
|
||||
accelerator.init_trackers("Some name", config)
|
||||
accelerator.end_training()
|
||||
with open(f"{d}/log.csv", "r") as f:
|
||||
data = csv.DictReader(f)
|
||||
data = next(data)
|
||||
truth = {
|
||||
"total_loss": "",
|
||||
"iteration": "",
|
||||
"my_text": "",
|
||||
"learning_rate": "0.01",
|
||||
"num_iterations": "12",
|
||||
"some_boolean": "False",
|
||||
"some_string": "some_value",
|
||||
}
|
||||
self.assertDictEqual(data, truth)
|
||||
|
||||
def test_log(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
tracker = MyCustomTracker(d)
|
||||
accelerator = Accelerator(log_with=tracker)
|
||||
accelerator.init_trackers("Some name")
|
||||
values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
|
||||
accelerator.log(values, step=0)
|
||||
accelerator.end_training()
|
||||
with open(f"{d}/log.csv", "r") as f:
|
||||
data = csv.DictReader(f)
|
||||
data = next(data)
|
||||
truth = {
|
||||
"total_loss": "0.1",
|
||||
"iteration": "1",
|
||||
"my_text": "some_value",
|
||||
"learning_rate": "",
|
||||
"num_iterations": "",
|
||||
"some_boolean": "",
|
||||
"some_string": "",
|
||||
}
|
||||
self.assertDictEqual(data, truth)
|
||||
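Every tracker exercised in this file, built-in or custom, is driven through the same four calls on the `Accelerator`. A minimal sketch with the TensorBoard tracker, using a temporary logging directory just as the tests do:

import tempfile

from accelerate import Accelerator

with tempfile.TemporaryDirectory() as logging_dir:
    accelerator = Accelerator(log_with="tensorboard", logging_dir=logging_dir)
    # Hyperparameters are stored once at initialization ...
    accelerator.init_trackers("my_project", config={"learning_rate": 1e-2, "num_iterations": 12})
    # ... per-step values are logged as training runs ...
    accelerator.log({"total_loss": 0.1, "iteration": 1}, step=0)
    # ... and end_training() flushes and closes every tracker.
    accelerator.end_training()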
tests/test_utils.py
@ -12,15 +12,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import unittest
|
||||
from collections import namedtuple
|
||||
from collections import UserDict, namedtuple
|
||||
|
||||
import torch
|
||||
|
||||
from accelerate.utils import send_to_device
|
||||
from accelerate.test_utils.training import RegressionModel
|
||||
from accelerate.utils import convert_outputs_to_fp32, patch_environment, send_to_device
|
||||
|
||||
|
||||
TestNamedTuple = namedtuple("TestNamedTuple", "a b")
|
||||
TestNamedTuple = namedtuple("TestNamedTuple", "a b c")
|
||||
|
||||
|
||||
class UtilsTester(unittest.TestCase):
|
||||
@ -31,23 +34,47 @@ class UtilsTester(unittest.TestCase):
|
||||
result1 = send_to_device(tensor, device)
|
||||
self.assertTrue(torch.equal(result1.cpu(), tensor))
|
||||
|
||||
result2 = send_to_device((tensor, [tensor, tensor]), device)
|
||||
result2 = send_to_device((tensor, [tensor, tensor], 1), device)
|
||||
self.assertIsInstance(result2, tuple)
|
||||
self.assertTrue(torch.equal(result2[0].cpu(), tensor))
|
||||
self.assertIsInstance(result2[1], list)
|
||||
self.assertTrue(torch.equal(result2[1][0].cpu(), tensor))
|
||||
self.assertTrue(torch.equal(result2[1][1].cpu(), tensor))
|
||||
self.assertEqual(result2[2], 1)
|
||||
|
||||
result2 = send_to_device({"a": tensor, "b": [tensor, tensor]}, device)
|
||||
result2 = send_to_device({"a": tensor, "b": [tensor, tensor], "c": 1}, device)
|
||||
self.assertIsInstance(result2, dict)
|
||||
self.assertTrue(torch.equal(result2["a"].cpu(), tensor))
|
||||
self.assertIsInstance(result2["b"], list)
|
||||
self.assertTrue(torch.equal(result2["b"][0].cpu(), tensor))
|
||||
self.assertTrue(torch.equal(result2["b"][1].cpu(), tensor))
|
||||
self.assertEqual(result2["c"], 1)
|
||||
|
||||
result3 = send_to_device(TestNamedTuple(a=tensor, b=[tensor, tensor]), device)
|
||||
result3 = send_to_device(TestNamedTuple(a=tensor, b=[tensor, tensor], c=1), device)
|
||||
self.assertIsInstance(result3, TestNamedTuple)
|
||||
self.assertTrue(torch.equal(result3.a.cpu(), tensor))
|
||||
self.assertIsInstance(result3.b, list)
|
||||
self.assertTrue(torch.equal(result3.b[0].cpu(), tensor))
|
||||
self.assertTrue(torch.equal(result3.b[1].cpu(), tensor))
|
||||
self.assertEqual(result3.c, 1)
|
||||
|
||||
result4 = send_to_device(UserDict({"a": tensor, "b": [tensor, tensor], "c": 1}), device)
|
||||
self.assertIsInstance(result4, UserDict)
|
||||
self.assertTrue(torch.equal(result4["a"].cpu(), tensor))
|
||||
self.assertIsInstance(result4["b"], list)
|
||||
self.assertTrue(torch.equal(result4["b"][0].cpu(), tensor))
|
||||
self.assertTrue(torch.equal(result4["b"][1].cpu(), tensor))
|
||||
self.assertEqual(result4["c"], 1)
|
||||
|
||||
def test_patch_environment(self):
|
||||
with patch_environment(aa=1, BB=2):
|
||||
self.assertEqual(os.environ.get("AA"), "1")
|
||||
self.assertEqual(os.environ.get("BB"), "2")
|
||||
|
||||
self.assertNotIn("AA", os.environ)
|
||||
self.assertNotIn("BB", os.environ)
|
||||
|
||||
def test_convert_to_32_lets_model_pickle(self):
|
||||
model = RegressionModel()
|
||||
model.forward = convert_outputs_to_fp32(model.forward)
|
||||
_ = pickle.dumps(model)
|
||||
|
||||
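The new assertions in this hunk check that `send_to_device` recurses through tuples, lists, dicts, `UserDict` subclasses and named tuples, moving tensors to the target device while passing non-tensor leaves (the `1` above) through unchanged. A short sketch of that behavior:

import torch

from accelerate.utils import send_to_device

device = "cuda" if torch.cuda.is_available() else "cpu"
batch = {
    "input_ids": torch.ones(2, 4, dtype=torch.long),
    "labels": [torch.zeros(2), torch.zeros(2)],
    "step": 1,  # non-tensor leaf, returned as-is
}

moved = send_to_device(batch, device)
print(moved["input_ids"].device, moved["labels"][0].device, moved["step"])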
utils/style_doc.py
@ -1,6 +1,5 @@
|
||||
# coding=utf-8
|
||||
|
||||
# Copyright 2021 The HuggingFace Inc. team.
|
||||
# Copyright 2020 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -13,78 +12,209 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Style utils for the .rst and the docstrings."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
from enum import Enum
|
||||
|
||||
import black
|
||||
|
||||
|
||||
# Special blocks where the inside should be formatted.
|
||||
TEXTUAL_BLOCKS = ["note", "warning"]
|
||||
# List of acceptable characters for titles and sections underline.
|
||||
TITLE_SPECIAL_CHARS = """= - ` : ' " ~ ^ _ * + # < >""".split(" ")
|
||||
# Special words for docstrings (s? means the s is optional)
|
||||
DOC_SPECIAL_WORD = [
|
||||
"Args?",
|
||||
"Params?",
|
||||
"Parameters?",
|
||||
"Arguments?",
|
||||
"Examples?",
|
||||
"Usage",
|
||||
"Returns?",
|
||||
"Raises?",
|
||||
"Attributes?",
|
||||
]
|
||||
BLACK_AVOID_PATTERNS = {}
|
||||
|
||||
|
||||
# Regexes
|
||||
# Matches any declaration of textual block, like `.. note::`. (ignore case to avoid writing all versions in the list)
|
||||
_re_textual_blocks = re.compile(r"^\s*\.\.\s+(" + "|".join(TEXTUAL_BLOCKS) + r")\s*::\s*$", re.IGNORECASE)
|
||||
# Matches list introduction in rst.
|
||||
# Re pattern that catches list introduction (with potential indent)
|
||||
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
|
||||
# Matches the indent in a line.
|
||||
_re_indent = re.compile(r"^(\s*)\S")
|
||||
# Matches a table declaration in rst.
|
||||
_re_table = re.compile(r"(\+-+)+\+\s*$")
|
||||
# Matches a code block in rst `:: `.
|
||||
_re_code_block = re.compile(r"^\s*::\s*$")
|
||||
# Matches any block of the form `.. something::` or `.. something:: bla`.
|
||||
_re_ignore = re.compile(r"^\s*\.\.\s+(.*?)\s*::\s*\S*\s*$")
|
||||
# Matches comment introduction in rst.
|
||||
_re_comment = re.compile(r"\s*\.\.\s*$")
|
||||
# Re pattern that catches code block introduction (with potential indent)
|
||||
_re_code = re.compile(r"^(\s*)```(.*)$")
|
||||
# Re pattern that catches rst args blocks of the form `Parameters:`.
|
||||
_re_args = re.compile("^\s*(Args?|Arguments?|Params?|Parameters?):\s*$")
|
||||
# Re pattern that catches return blocks of the form `Return:`.
|
||||
_re_returns = re.compile("^\s*Returns?:\s*$")
|
||||
# Matches the special tag to ignore some paragraphs.
|
||||
_re_doc_ignore = re.compile(r"(\.\.|#)\s*docstyle-ignore")
|
||||
# Matches the example introduction in docstrings.
|
||||
_re_example = re.compile(r"::\s*$")
|
||||
# Matches the parameters introduction in docstrings.
|
||||
_re_arg_def = re.compile(r"^\s*(Args?|Parameters?|Params|Arguments?|Environment|Attributes?)\s*:\s*$")
|
||||
# Matches the return introduction in docstrings.
|
||||
_re_return = re.compile(r"^\s*(Returns?|Raises?|Note)\s*:\s*$")
|
||||
# Matches any doc special word.
|
||||
_re_any_doc_special_word = re.compile(r"^\s*(" + "|".join(DOC_SPECIAL_WORD) + r")::?\s*$")
|
||||
# Re pattern that matches <Tip>, </Tip> and <Tip warning={true}> blocks.
|
||||
_re_tip = re.compile("^\s*</?Tip(>|\s+warning={true}>)\s*$")
|
||||
|
||||
DOCTEST_PROMPTS = [">>>", "..."]
|
||||
|
||||
|
||||
class SpecialBlock(Enum):
|
||||
NOT_SPECIAL = 0
|
||||
NO_STYLE = 1
|
||||
ARG_LIST = 2
|
||||
def is_empty_line(line):
|
||||
return len(line) == 0 or line.isspace()
|
||||
|
||||
|
||||
def split_text_in_lines(text, max_len, prefix="", min_indent=None):
|
||||
def find_indent(line):
|
||||
"""
|
||||
Split `text` in the biggest lines possible with the constraint of `max_len` using `prefix` on the first line and
|
||||
then indenting with the same length as `prefix`.
|
||||
Returns the number of spaces that start a line indent.
|
||||
"""
|
||||
search = re.search("^(\s*)(?:\S|$)", line)
|
||||
if search is None:
|
||||
return 0
|
||||
return len(search.groups()[0])
|
||||
|
||||
|
||||
def parse_code_example(code_lines):
|
||||
"""
|
||||
Parses a code example
|
||||
|
||||
Args:
|
||||
code_lines (`List[str]`): The code lines to parse.
|
||||
max_len (`int`): The maximum length per line.
|
||||
|
||||
Returns:
|
||||
(List[`str`], List[`str`]): The list of code samples and the list of outputs.
|
||||
"""
|
||||
has_doctest = code_lines[0][:3] in DOCTEST_PROMPTS
|
||||
|
||||
code_samples = []
|
||||
outputs = []
|
||||
in_code = True
|
||||
current_bit = []
|
||||
|
||||
for line in code_lines:
|
||||
if in_code and has_doctest and not is_empty_line(line) and line[:3] not in DOCTEST_PROMPTS:
|
||||
code_sample = "\n".join(current_bit)
|
||||
code_samples.append(code_sample.strip())
|
||||
in_code = False
|
||||
current_bit = []
|
||||
elif not in_code and line[:3] in DOCTEST_PROMPTS:
|
||||
output = "\n".join(current_bit)
|
||||
outputs.append(output.strip())
|
||||
in_code = True
|
||||
current_bit = []
|
||||
|
||||
# Add the line without doctest prompt
|
||||
if line[:3] in DOCTEST_PROMPTS:
|
||||
line = line[4:]
|
||||
current_bit.append(line)
|
||||
|
||||
# Add last sample
|
||||
if in_code:
|
||||
code_sample = "\n".join(current_bit)
|
||||
code_samples.append(code_sample.strip())
|
||||
else:
|
||||
output = "\n".join(current_bit)
|
||||
outputs.append(output.strip())
|
||||
|
||||
return code_samples, outputs
|
||||
|
||||
|
||||
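To make the contract of `parse_code_example` concrete: it strips the `>>> `/`... ` prompts, groups consecutive prompt lines into code samples, and collects the unprompted lines in between as expected outputs. A small illustration, assuming the function exactly as defined above:

lines = [
    ">>> x = 1 + 1",
    ">>> print(x)",
    "2",
    ">>> x += 1",
]
code_samples, outputs = parse_code_example(lines)
# code_samples == ["x = 1 + 1\nprint(x)", "x += 1"]
# outputs == ["2"]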
def format_code_example(code: str, max_len: int, in_docstring: bool = False):
|
||||
"""
|
||||
Format a code example using black. Will take into account the doctest syntax as well as any initial indentation in
|
||||
the code provided.
|
||||
|
||||
Args:
|
||||
code (`str`): The code example to format.
|
||||
max_len (`int`): The maximum length per line.
|
||||
in_docstring (`bool`, *optional*, defaults to `False`): Whether or not the code example is inside a docstring.
|
||||
|
||||
Returns:
|
||||
`str`: The formatted code.
|
||||
"""
|
||||
code_lines = code.split("\n")
|
||||
|
||||
# Find initial indent
|
||||
idx = 0
|
||||
while idx < len(code_lines) and is_empty_line(code_lines[idx]):
|
||||
idx += 1
|
||||
if idx >= len(code_lines):
|
||||
return "", ""
|
||||
indent = find_indent(code_lines[idx])
|
||||
|
||||
# Remove the initial indent for now, we will add it back after styling.
|
||||
# Note that l[indent:] works for empty lines
|
||||
code_lines = [l[indent:] for l in code_lines[idx:]]
|
||||
has_doctest = code_lines[0][:3] in DOCTEST_PROMPTS
|
||||
|
||||
code_samples, outputs = parse_code_example(code_lines)
|
||||
|
||||
# Let's blackify the code! We put everything in one big text to go faster.
|
||||
delimiter = "\n\n### New code sample ###\n"
|
||||
full_code = delimiter.join(code_samples)
|
||||
line_length = max_len - indent
|
||||
if has_doctest:
|
||||
line_length -= 4
|
||||
|
||||
for k, v in BLACK_AVOID_PATTERNS.items():
|
||||
full_code = full_code.replace(k, v)
|
||||
try:
|
||||
mode = black.Mode(target_versions={black.TargetVersion.PY37}, line_length=line_length)
|
||||
formatted_code = black.format_str(full_code, mode=mode)
|
||||
error = ""
|
||||
except Exception as e:
|
||||
formatted_code = full_code
|
||||
error = f"Code sample:\n{full_code}\n\nError message:\n{e}"
|
||||
|
||||
# Let's get back the formatted code samples
|
||||
for k, v in BLACK_AVOID_PATTERNS.items():
|
||||
formatted_code = formatted_code.replace(v, k)
|
||||
# Triple quotes will mess up docstrings.
|
||||
if in_docstring:
|
||||
formatted_code = formatted_code.replace('"""', "'''")
|
||||
|
||||
code_samples = formatted_code.split(delimiter)
|
||||
# We can have one output less than code samples
|
||||
if len(outputs) == len(code_samples) - 1:
|
||||
outputs.append("")
|
||||
|
||||
formatted_lines = []
|
||||
for code_sample, output in zip(code_samples, outputs):
|
||||
# black may have added some new lines, we remove them
|
||||
code_sample = code_sample.strip()
|
||||
in_triple_quotes = False
|
||||
in_decorator = False
|
||||
for line in code_sample.strip().split("\n"):
|
||||
if has_doctest and not is_empty_line(line):
|
||||
prefix = (
|
||||
"... "
|
||||
if line.startswith(" ") or line in [")", "]", "}"] or in_triple_quotes or in_decorator
|
||||
else ">>> "
|
||||
)
|
||||
else:
|
||||
prefix = ""
|
||||
indent_str = "" if is_empty_line(line) else (" " * indent)
|
||||
formatted_lines.append(indent_str + prefix + line)
|
||||
|
||||
if '"""' in line:
|
||||
in_triple_quotes = not in_triple_quotes
|
||||
if line.startswith(" "):
|
||||
in_decorator = False
|
||||
if line.startswith("@"):
|
||||
in_decorator = True
|
||||
|
||||
formatted_lines.extend([" " * indent + line for line in output.split("\n")])
|
||||
if not output.endswith("===PT-TF-SPLIT==="):
|
||||
formatted_lines.append("")
|
||||
|
||||
result = "\n".join(formatted_lines)
|
||||
return result.rstrip(), error
|
||||
|
||||
|
||||
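Putting `format_code_example` to work on a doctest snippet shows the round trip: prompts are stripped, the bare code is run through black at the reduced line length, and the prompts are re-added afterwards. A sketch, assuming `black` is installed as the script requires:

code = ">>> x=1\n>>> print( x )"
formatted, error = format_code_example(code, max_len=119)
# formatted == ">>> x = 1\n>>> print(x)" and error == "" when black succeeds.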
def format_text(text, max_len, prefix="", min_indent=None):
|
||||
"""
|
||||
Format a text in the biggest lines possible with the constraint of a maximum length and an indentation.
|
||||
|
||||
Args:
|
||||
text (`str`): The text to format
|
||||
max_len (`int`): The maximum length per line to use
|
||||
prefix (`str`, *optional*, defaults to `""`): A prefix that will be added to the text.
|
||||
The prefix doesn't count toward the indent (like a - introducing a list).
|
||||
min_indent (`int`, *optional*): The minimum indent of the text.
|
||||
If not set, will default to the length of the `prefix`.
|
||||
|
||||
Returns:
|
||||
`str`: The formatted text.
|
||||
"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
indent = " " * len(prefix)
|
||||
if min_indent is not None:
|
||||
if len(indent) < len(min_indent):
|
||||
indent = min_indent
|
||||
if len(prefix) < len(min_indent):
|
||||
prefix = " " * (len(min_indent) - len(prefix)) + prefix
|
||||
if len(prefix) < min_indent:
|
||||
prefix = " " * (min_indent - len(prefix)) + prefix
|
||||
|
||||
indent = " " * len(prefix)
|
||||
new_lines = []
|
||||
words = text.split(" ")
|
||||
current_line = f"{prefix}{words[0]}"
|
||||
@ -99,379 +229,189 @@ def split_text_in_lines(text, max_len, prefix="", min_indent=None):
|
||||
return "\n".join(new_lines)
|
||||
|
||||
|
||||
def get_indent(line):
|
||||
"""Get the indentation of `line`."""
|
||||
indent_search = _re_indent.search(line)
|
||||
return indent_search.groups()[0] if indent_search is not None else ""
|
||||
def split_line_on_first_colon(line):
|
||||
splits = line.split(":")
|
||||
return splits[0], ":".join(splits[1:])
|
||||
|
||||
|
||||
class CodeStyler:
|
||||
"""A generic class to style .rst files."""
|
||||
def style_docstring(docstring, max_len):
|
||||
"""
|
||||
Style a docstring by making sure there is no useless whitespace and the maximum horizontal space is used.
|
||||
|
||||
def is_no_style_block(self, line):
|
||||
"""Whether or not `line` introduces a block where styling should be ignore"""
|
||||
if _re_code_block.search(line) is not None:
|
||||
return True
|
||||
if _re_textual_blocks.search(line) is not None:
|
||||
return False
|
||||
return _re_ignore.search(line) is not None
|
||||
Args:
|
||||
docstring (`str`): The docstring to style.
|
||||
max_len (`int`): The maximum length of each line.
|
||||
|
||||
def is_comment_or_textual_block(self, line):
|
||||
"""Whether or not `line` introduces a block where styling should not be ignored (note, warnings...)"""
|
||||
if _re_comment.search(line):
|
||||
return True
|
||||
return _re_textual_blocks.search(line) is not None
|
||||
Returns:
|
||||
`str`: The styled docstring
|
||||
"""
|
||||
lines = docstring.split("\n")
|
||||
new_lines = []
|
||||
|
||||
def is_special_block(self, line):
|
||||
"""Whether or not `line` introduces a special block."""
|
||||
if self.is_no_style_block(line):
|
||||
self.in_block = SpecialBlock.NO_STYLE
|
||||
return True
|
||||
return False
|
||||
# Initialization
|
||||
current_paragraph = None
|
||||
current_indent = -1
|
||||
in_code = False
|
||||
param_indent = -1
|
||||
prefix = ""
|
||||
black_errors = []
|
||||
|
||||
def init_in_block(self, text):
|
||||
"""
|
||||
Returns the initial value for `self.in_block`.
|
||||
# Special case for docstrings that begin with continuation of Args with no Args block.
|
||||
idx = 0
|
||||
while idx < len(lines) and is_empty_line(lines[idx]):
|
||||
idx += 1
|
||||
if (
|
||||
len(lines[idx]) > 1
|
||||
and lines[idx].rstrip().endswith(":")
|
||||
and find_indent(lines[idx + 1]) > find_indent(lines[idx])
|
||||
):
|
||||
param_indent = find_indent(lines[idx])
|
||||
|
||||
Useful for some docstrings beginning inside an argument declaration block (all models).
|
||||
"""
|
||||
return SpecialBlock.NOT_SPECIAL
|
||||
for idx, line in enumerate(lines):
|
||||
# Doing all re searches once for the one we need to repeat.
|
||||
list_search = _re_list.search(line)
|
||||
code_search = _re_code.search(line)
|
||||
|
||||
def end_of_special_style(self, line):
|
||||
"""
|
||||
Sets back the `in_block` attribute to `NOT_SPECIAL`.
|
||||
# Are we starting a new paragraph?
|
||||
# New indentation or new line:
|
||||
new_paragraph = find_indent(line) != current_indent or is_empty_line(line)
|
||||
# List item
|
||||
new_paragraph = new_paragraph or list_search is not None
|
||||
# Code block beginning
|
||||
new_paragraph = new_paragraph or code_search is not None
|
||||
# Beginning/end of tip
|
||||
new_paragraph = new_paragraph or _re_tip.search(line)
|
||||
|
||||
Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
|
||||
"""
|
||||
self.in_block = SpecialBlock.NOT_SPECIAL
|
||||
# In this case, we treat the current paragraph
|
||||
if not in_code and new_paragraph and current_paragraph is not None and len(current_paragraph) > 0:
|
||||
paragraph = " ".join(current_paragraph)
|
||||
new_lines.append(format_text(paragraph, max_len, prefix=prefix, min_indent=current_indent))
|
||||
current_paragraph = None
|
||||
|
||||
def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
|
||||
"""
|
||||
Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
|
||||
is passed.
|
||||
"""
|
||||
if len(paragraph) == 0:
|
||||
return ""
|
||||
if no_style or self.in_block == SpecialBlock.NO_STYLE:
|
||||
return "\n".join(paragraph)
|
||||
if _re_list.search(paragraph[0]) is not None:
|
||||
# Great, we're in a list. So we need to split our paragraphs in smaller parts, one for each item.
|
||||
result = ""
|
||||
remainder = ""
|
||||
prefix = _re_list.search(paragraph[0]).groups()[0]
|
||||
prefix_indent = get_indent(paragraph[0])
|
||||
current_item = [paragraph[0][len(prefix) :]]
|
||||
for i, line in enumerate(paragraph[1:]):
|
||||
new_item_search = _re_list.search(line)
|
||||
indent = get_indent(line)
|
||||
if len(indent) < len(prefix_indent) or (len(indent) == len(prefix_indent) and new_item_search is None):
|
||||
# There might not be an empty line after the list, formatting the remainder recursively.
|
||||
remainder = "\n" + self.style_paragraph(
|
||||
paragraph[i + 1 :], max_len, no_style=no_style, min_indent=min_indent
|
||||
)
|
||||
break
|
||||
elif new_item_search is not None:
|
||||
text = " ".join([l.strip() for l in current_item])
|
||||
result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent) + "\n"
|
||||
prefix = new_item_search.groups()[0]
|
||||
prefix_indent = indent
|
||||
current_item = [line[len(prefix) :]]
|
||||
if code_search is not None:
|
||||
if not in_code:
|
||||
current_paragraph = []
|
||||
current_indent = len(code_search.groups()[0])
|
||||
current_code = code_search.groups()[1]
|
||||
prefix = ""
|
||||
if current_indent < param_indent:
|
||||
param_indent = -1
|
||||
else:
|
||||
current_indent = -1
|
||||
code = "\n".join(current_paragraph)
|
||||
if current_code in ["py", "python"]:
|
||||
formatted_code, error = format_code_example(code, max_len, in_docstring=True)
|
||||
new_lines.append(formatted_code)
|
||||
if len(error) > 0:
|
||||
black_errors.append(error)
|
||||
else:
|
||||
current_item.append(line)
|
||||
# Treat the last item
|
||||
text = " ".join([l.strip() for l in current_item])
|
||||
result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent)
|
||||
# Add the potential remainder
|
||||
return result + remainder
|
||||
new_lines.append(code)
|
||||
current_paragraph = None
|
||||
new_lines.append(line)
|
||||
in_code = not in_code
|
||||
|
||||
if len(paragraph) > 1 and self.is_comment_or_textual_block(paragraph[0]):
|
||||
# Comments/notes in rst should be restyled with indentation, ignoring the first line.
|
||||
indent = get_indent(paragraph[1])
|
||||
text = " ".join([l.strip() for l in paragraph[1:]])
|
||||
return paragraph[0] + "\n" + split_text_in_lines(text, max_len, indent, min_indent=min_indent)
|
||||
|
||||
if self.in_block == SpecialBlock.ARG_LIST:
|
||||
# Arg lists are special: we need to ignore the lines that are at the first indentation level beneath the
|
||||
# Args/Parameters (parameter description), then we can style the indentation level beneath.
|
||||
result = ""
|
||||
# The args/parameters could be in that paragraph and should be ignored
|
||||
if _re_arg_def.search(paragraph[0]) is not None:
|
||||
if len(paragraph) == 1:
|
||||
return paragraph[0]
|
||||
result += paragraph[0] + "\n"
|
||||
paragraph = paragraph[1:]
|
||||
|
||||
if self.current_indent is None:
|
||||
self.current_indent = get_indent(paragraph[1])
|
||||
|
||||
current_item = []
|
||||
for line in paragraph:
|
||||
if get_indent(line) == self.current_indent:
|
||||
if len(current_item) > 0:
|
||||
item_indent = get_indent(current_item[0])
|
||||
text = " ".join([l.strip() for l in current_item])
|
||||
result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
|
||||
result += line + "\n"
|
||||
current_item = []
|
||||
elif in_code:
|
||||
current_paragraph.append(line)
|
||||
elif is_empty_line(line):
|
||||
current_paragraph = None
|
||||
current_indent = -1
|
||||
prefix = ""
|
||||
new_lines.append(line)
|
||||
elif list_search is not None:
|
||||
prefix = list_search.groups()[0]
|
||||
current_indent = len(prefix)
|
||||
current_paragraph = [line[current_indent:]]
|
||||
elif _re_args.search(line):
|
||||
new_lines.append(line)
|
||||
param_indent = find_indent(lines[idx + 1])
|
||||
elif _re_tip.search(line):
|
||||
# Add a new line before if not present
|
||||
if not is_empty_line(new_lines[-1]):
|
||||
new_lines.append("")
|
||||
new_lines.append(line)
|
||||
# Add a new line after if not present
|
||||
if idx < len(lines) - 1 and not is_empty_line(lines[idx + 1]):
|
||||
new_lines.append("")
|
||||
elif current_paragraph is None or find_indent(line) != current_indent:
|
||||
indent = find_indent(line)
|
||||
# Special behavior for parameters intros.
|
||||
if indent == param_indent:
|
||||
# Special rules for some docstring where the Returns blocks has the same indent as the parameters.
|
||||
if _re_returns.search(line) is not None:
|
||||
param_indent = -1
|
||||
new_lines.append(line)
|
||||
elif len(line) < max_len:
|
||||
new_lines.append(line)
|
||||
else:
|
||||
current_item.append(line)
|
||||
if len(current_item) > 0:
|
||||
item_indent = get_indent(current_item[0])
|
||||
text = " ".join([l.strip() for l in current_item])
|
||||
result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
|
||||
return result[:-1]
|
||||
|
||||
indent = get_indent(paragraph[0])
|
||||
text = " ".join([l.strip() for l in paragraph])
|
||||
        return split_text_in_lines(text, max_len, indent, min_indent=min_indent)

    def style(self, text, max_len=119, min_indent=None):
        """Style `text` to `max_len`."""
        new_lines = []
        paragraph = []
        self.current_indent = ""
        self.previous_indent = None
        # If one of those is True, the paragraph should not be touched (code samples, lists...)
        no_style = False
        no_style_next = False
        self.in_block = self.init_in_block(text)
        # If this is True, we force-break a paragraph, even if there is no new empty line.
        break_paragraph = False

        lines = text.split("\n")
        last_line = None
        for line in lines:
            # New paragraph
            line_is_empty = len(line.strip()) == 0
            list_begins = (
                _re_list.search(line) is not None
                and last_line is not None
                and len(get_indent(line)) > len(get_indent(last_line))
            )
            if line_is_empty or break_paragraph or list_begins:
                if len(paragraph) > 0:
                    if self.in_block != SpecialBlock.NOT_SPECIAL:
                        indent = get_indent(paragraph[0])
                        # Are we still in a no-style block?
                        if self.current_indent is None:
                            # If current_indent is None, we haven't begun the interior of the block so the answer is
                            # yes, unless we have an indent of 0 in which case the special block took one line only.
                            if len(indent) == 0:
                                self.in_block = SpecialBlock.NOT_SPECIAL
                            else:
                                self.current_indent = indent
                        elif not indent.startswith(self.current_indent):
                            # If not, we are leaving the block when we unindent.
                            self.end_of_special_style(paragraph[0])

                    if self.is_special_block(paragraph[0]):
                        # Maybe we are starting a special block.
                        if len(paragraph) > 1:
                            # If we have the interior of the block in the paragraph, we grab the indent.
                            self.current_indent = get_indent(paragraph[1])
            intro, description = split_line_on_first_colon(line)
            new_lines.append(intro + ":")
            if len(description) != 0:
                if find_indent(lines[idx + 1]) > indent:
                    current_indent = find_indent(lines[idx + 1])
                        else:
                            # We will determine the indent with the next paragraph
                            self.current_indent = None
                    styled_paragraph = self.style_paragraph(
                        paragraph, max_len, no_style=no_style, min_indent=min_indent
                    )
                    new_lines.append(styled_paragraph + "\n")
                else:
                    new_lines.append("")
                else:
                    current_indent = indent + 4
                current_paragraph = [description.strip()]
                prefix = ""
            else:
                # Check if we have exited the parameter block
                if indent < param_indent:
                    param_indent = -1

                paragraph = []
                no_style = no_style_next
                no_style_next = False
                last_line = None
                if (not break_paragraph and not list_begins) or line_is_empty:
                    break_paragraph = False
                    continue
                break_paragraph = False
            current_paragraph = [line.strip()]
            current_indent = find_indent(line)
            prefix = ""
        elif current_paragraph is not None:
            current_paragraph.append(line.lstrip())

            # Title and section lines should go to the max + add a new paragraph.
            if (
                len(set(line)) == 1
                and line[0] in TITLE_SPECIAL_CHARS
                and last_line is not None
                and len(line) >= len(last_line)
            ):
                line = line[0] * max_len
                break_paragraph = True
            # proper doc comment indicates the next paragraph should be no-style.
            if _re_doc_ignore.search(line) is not None:
                no_style_next = True
            # Table are in just one paragraph and should be no-style.
            if _re_table.search(line) is not None:
                no_style = True
            paragraph.append(line)
            last_line = line
    if current_paragraph is not None and len(current_paragraph) > 0:
        paragraph = " ".join(current_paragraph)
        new_lines.append(format_text(paragraph, max_len, prefix=prefix, min_indent=current_indent))

        # Just have to treat the last paragraph. It could still be in a no-style block (or not)
        if len(paragraph) > 0:
            # Are we still in a special block
            # (if current_indent is None, we are but no need to set it since we are the end.)
            if self.in_block != SpecialBlock.NO_STYLE and self.current_indent is not None:
                indent = get_indent(paragraph[0])
                if not indent.startswith(self.current_indent):
                    self.in_block = SpecialBlock.NOT_SPECIAL
            _ = self.is_special_block(paragraph[0])
        new_lines.append(self.style_paragraph(paragraph, max_len, no_style=no_style, min_indent=min_indent) + "\n")
        return "\n".join(new_lines)
    return "\n".join(new_lines), "\n\n".join(black_errors)
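For orientation, here is a minimal usage sketch (not part of the upstream file) of how the pre-mdx `CodeStyler.style` entry point behaves, assuming that older version of this module is importable as `style_doc`; the sample text is invented:

    from style_doc import CodeStyler

    text = (
        "This is a single long paragraph that would overflow a 40 character limit "
        "and therefore gets re-wrapped by the styler."
    )
    # Each free-form paragraph is re-wrapped so that no line exceeds `max_len`.
    print(CodeStyler().style(text, max_len=40))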


class DocstringStyler(CodeStyler):
    """Class to style docstrings that take the main method from `CodeStyler`."""

def style_docstrings_in_code(code, max_len=119):
    """
    Style all docstrings in some code.

    def is_no_style_block(self, line):
        if _re_textual_blocks.search(line) is not None:
            return False
        if _re_example.search(line) is not None:
            return True
        return _re_code_block.search(line) is not None

    Args:
        code (`str`): The code in which we want to style the docstrings.
        max_len (`int`): The maximum number of characters per line.

    def is_comment_or_textual_block(self, line):
        if _re_return.search(line) is not None:
            self.in_block = SpecialBlock.NOT_SPECIAL
            return True
        return super().is_comment_or_textual_block(line)

    def is_special_block(self, line):
        if self.is_no_style_block(line):
            if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST:
                self.previous_indent = self.current_indent
            self.in_block = SpecialBlock.NO_STYLE
            return True
        if _re_arg_def.search(line) is not None:
            self.in_block = SpecialBlock.ARG_LIST
            return True
        return False

    def end_of_special_style(self, line):
        if self.previous_indent is not None and line.startswith(self.previous_indent):
            self.in_block = SpecialBlock.ARG_LIST
            self.current_indent = self.previous_indent
        else:
            self.in_block = SpecialBlock.NOT_SPECIAL
        self.previous_indent = None

    def init_in_block(self, text):
        lines = text.split("\n")
        while len(lines) > 0 and len(lines[0]) == 0:
            lines = lines[1:]
        if len(lines) == 0:
            return SpecialBlock.NOT_SPECIAL
        if re.search(r":\s*$", lines[0]):
            indent = get_indent(lines[0])
            if (
                len(lines) == 1
                or len(get_indent(lines[1])) > len(indent)
                or (len(get_indent(lines[1])) == len(indent) and re.search(r":\s*$", lines[1]))
            ):
                self.current_indent = indent
                return SpecialBlock.ARG_LIST
        return SpecialBlock.NOT_SPECIAL


rst_styler = CodeStyler()
doc_styler = DocstringStyler()


def _add_new_lines_before_list(text):
    """Add a new empty line before a list begins."""
    lines = text.split("\n")
    new_lines = []
    in_list = False
    for idx, line in enumerate(lines):
        # Detect if the line is the start of a new list.
        if _re_list.search(line) is not None and not in_list:
            current_indent = get_indent(line)
            in_list = True
            # If the line before is non empty, add an extra new line.
            if idx > 0 and len(lines[idx - 1]) != 0:
                new_lines.append("")
        # Detect if we're out of the current list.
        if in_list and not line.startswith(current_indent) and _re_list.search(line) is None:
            in_list = False
        new_lines.append(line)
    return "\n".join(new_lines)
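A small sketch (not from the upstream file) of what `_add_new_lines_before_list` does, assuming the module is importable as `style_doc`; the sample text is invented:

    from style_doc import _add_new_lines_before_list

    text = "Some introductory sentence:\n- first item\n- second item"
    # A blank line is inserted between the sentence and the list, so the styler
    # later treats the list as its own paragraph.
    print(_add_new_lines_before_list(text))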


def _add_new_lines_before_doc_special_words(text):
    lines = text.split("\n")
    new_lines = []
    for idx, line in enumerate(lines):
        # Detect if the line starts a special doc section (Args, Returns, Example...).
        if _re_any_doc_special_word.search(line) is not None:
            # If the line before is non empty, add an extra new line.
            if idx > 0 and len(lines[idx - 1]) != 0:
                new_lines.append("")
        new_lines.append(line)
    return "\n".join(new_lines)
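Similarly, a sketch (not from the upstream file) for `_add_new_lines_before_doc_special_words`, which targets section headers such as `Args:` or `Returns:`; same import assumption, invented sample text:

    from style_doc import _add_new_lines_before_doc_special_words

    docstring = "Does something useful.\nArgs:\n    x (`int`): Some input."
    # A blank line is added before `Args:` because the preceding line is not empty.
    print(_add_new_lines_before_doc_special_words(docstring))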


def style_rst_file(doc_file, max_len=119, check_only=False):
    """ Style one rst file `doc_file` to `max_len`."""
    with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
        doc = f.read()

    # Add missing new lines before lists
    clean_doc = _add_new_lines_before_list(doc)
    # Style
    clean_doc = rst_styler.style(clean_doc, max_len=max_len)

    diff = clean_doc != doc
    if not check_only and diff:
        print(f"Overwriting content of {doc_file}.")
        with open(doc_file, "w", encoding="utf-8", newline="\n") as f:
            f.write(clean_doc)

    return diff
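A hedged usage sketch (not from the upstream file) for the rst-era helper `style_rst_file`; the path is illustrative only:

    from style_doc import style_rst_file

    # With check_only=True nothing is written; the return value only says whether
    # the file would be restyled.
    needs_restyling = style_rst_file("docs/source/quicktour.rst", max_len=119, check_only=True)
    print(needs_restyling)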


def style_docstring(docstring, max_len=119):
    """Style `docstring` to `max_len`."""
    # One-line docstring that are not too long are left as is.
    if len(docstring) < max_len and "\n" not in docstring:
        return docstring

    # Grab the indent from the last line
    last_line = docstring.split("\n")[-1]
    # Is it empty except for the last triple-quotes (not-included in `docstring`)?
    indent_search = re.search(r"^(\s*)$", last_line)
    if indent_search is not None:
        indent = indent_search.groups()[0]
        if len(indent) > 0:
            docstring = docstring[: -len(indent)]
    # Or are the triple quotes next to text (we will fix that).
    else:
        indent_search = _re_indent.search(last_line)
        indent = indent_search.groups()[0] if indent_search is not None else ""

    # Add missing new lines before Args/Returns etc.
    docstring = _add_new_lines_before_doc_special_words(docstring)
    # Add missing new lines before lists
    docstring = _add_new_lines_before_list(docstring)
    # Style
    styled_doc = doc_styler.style(docstring, max_len=max_len, min_indent=indent)

    # Add new lines if necessary
    if not styled_doc.startswith("\n"):
        styled_doc = "\n" + styled_doc
    if not styled_doc.endswith("\n"):
        styled_doc += "\n"
    return styled_doc + indent
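A small sketch (not from the upstream file) of the `style_docstring` variant defined just above, which takes the raw docstring body and returns it re-wrapped; same import assumption, invented content:

    from style_doc import style_docstring

    raw = """
        Turns a very long one-sentence description of what this function does into nicely wrapped lines of text.
    """
    # Paragraphs are re-wrapped to at most 60 characters and the indentation of the
    # closing triple quotes is preserved.
    print(style_docstring(raw, max_len=60))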


def style_file_docstrings(code_file, max_len=119, check_only=False):
    """Style all docstrings in `code_file` to `max_len`."""
    with open(code_file, "r", encoding="utf-8", newline="\n") as f:
        code = f.read()
    splits = code.split('"""')

    Returns:
        `Tuple[str, str]`: A tuple with the clean code and the black errors (if any)
    """
    # fmt: off
    splits = code.split('\"\"\"')
    splits = [
        (s if i % 2 == 0 or _re_doc_ignore.search(splits[i - 1]) is not None else style_docstring(s, max_len=max_len))
        for i, s in enumerate(splits)
    ]
    clean_code = '"""'.join(splits)
    black_errors = "\n\n".join([s[1] for s in splits if isinstance(s, tuple) and len(s[1]) > 0])
    splits = [s[0] if isinstance(s, tuple) else s for s in splits]
    clean_code = '\"\"\"'.join(splits)
    # fmt: on

    return clean_code, black_errors
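A sketch (not from the upstream file) of the new `style_docstrings_in_code` helper, which returns both the restyled code and any black errors found in code examples; same import assumption, invented source string:

    from style_doc import style_docstrings_in_code

    source = (
        "def add(a, b):\n"
        '    """Adds `a` and `b` together and returns their sum, described at some unnecessary length here."""\n'
        "    return a + b\n"
    )
    clean_code, black_errors = style_docstrings_in_code(source, max_len=60)
    print(clean_code)    # the same code with the docstring re-wrapped
    print(black_errors)  # empty here: the docstring contains no code example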


def style_file_docstrings(code_file, max_len=119, check_only=False):
    """
    Style all docstrings in a given file.

    Args:
        code_file (`str` or `os.PathLike`): The file in which we want to style the docstring.
        max_len (`int`): The maximum number of characters per line.
        check_only (`bool`, *optional*, defaults to `False`):
            Whether to restyle file or just check if they should be restyled.

    Returns:
        `bool`: Whether or not the file was or should be restyled.
    """
    with open(code_file, "r", encoding="utf-8", newline="\n") as f:
        code = f.read()

    clean_code, black_errors = style_docstrings_in_code(code, max_len=max_len)

    diff = clean_code != code
    if not check_only and diff:
@@ -479,31 +419,122 @@ def style_file_docstrings(code_file, max_len=119, check_only=False):
        with open(code_file, "w", encoding="utf-8", newline="\n") as f:
            f.write(clean_code)

    return diff
    return diff, black_errors
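A usage sketch (not from the upstream file) for the file-level wrapper; the newer variant shown here returns a `(diff, black_errors)` pair, and the path is illustrative only:

    from style_doc import style_file_docstrings

    diff, black_errors = style_file_docstrings("src/accelerate/accelerator.py", max_len=119, check_only=True)
    if diff:
        print("This file would be restyled.")
    if black_errors:
        print(black_errors)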


def style_mdx_file(mdx_file, max_len=119, check_only=False):
    """
    Style a MDX file by formatting all Python code samples.

    Args:
        mdx_file (`str` or `os.PathLike`): The file in which we want to style the examples.
        max_len (`int`): The maximum number of characters per line.
        check_only (`bool`, *optional*, defaults to `False`):
            Whether to restyle file or just check if they should be restyled.

    Returns:
        `bool`: Whether or not the file was or should be restyled.
    """
    with open(mdx_file, "r", encoding="utf-8", newline="\n") as f:
        content = f.read()

    lines = content.split("\n")
    current_code = []
    current_language = ""
    in_code = False
    new_lines = []
    black_errors = []

    for line in lines:
        if _re_code.search(line) is not None:
            in_code = not in_code
            if in_code:
                current_language = _re_code.search(line).groups()[1]
                current_code = []
            else:
                code = "\n".join(current_code)
                if current_language in ["py", "python"]:
                    code, error = format_code_example(code, max_len)
                    if len(error) > 0:
                        black_errors.append(error)
                new_lines.append(code)

            new_lines.append(line)
        elif in_code:
            current_code.append(line)
        else:
            new_lines.append(line)

    if in_code:
        raise ValueError(f"There was a problem when styling {mdx_file}. A code block is opened without being closed.")

    clean_content = "\n".join(new_lines)
    diff = clean_content != content
    if not check_only and diff:
        print(f"Overwriting content of {mdx_file}.")
        with open(mdx_file, "w", encoding="utf-8", newline="\n") as f:
            f.write(clean_content)

    return diff, "\n\n".join(black_errors)
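A sketch (not from the upstream file) for `style_mdx_file`; the path is illustrative, and with `check_only=True` the file is left untouched:

    from style_doc import style_mdx_file

    diff, black_errors = style_mdx_file("docs/source/quicktour.mdx", max_len=119, check_only=True)
    print(diff)          # True if the Python code samples in the file would be reformatted
    print(black_errors)  # black's error output for samples it could not parse, if any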


def style_doc_files(*files, max_len=119, check_only=False):
    """
    Style all `files` to `max_len` and fixes mistakes if not `check_only`, otherwise raises an error if styling should
    be done.
    Applies doc styling or checks everything is correct in a list of files.

    Args:
        files (several `str` or `os.PathLike`): The files to treat.
        max_len (`int`): The maximum number of characters per line.
        check_only (`bool`, *optional*, defaults to `False`):
            Whether to restyle file or just check if they should be restyled.

    Returns:
        List[`str`]: The list of files changed or that should be restyled.
    """
    changed = []
    black_errors = []
    for file in files:
        # Treat folders
        if os.path.isdir(file):
            files = [os.path.join(file, f) for f in os.listdir(file)]
            files = [f for f in files if os.path.isdir(f) or f.endswith(".rst") or f.endswith(".py")]
            files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")]
            changed += style_doc_files(*files, max_len=max_len, check_only=check_only)
        # Treat rst
        elif file.endswith(".rst"):
            if style_rst_file(file, max_len=max_len, check_only=check_only):
                changed.append(file)
        # Treat mdx
        elif file.endswith(".mdx"):
            try:
                diff, black_error = style_mdx_file(file, max_len=max_len, check_only=check_only)
                if diff:
                    changed.append(file)
                if len(black_error) > 0:
                    black_errors.append(
                        f"There was a problem while formatting an example in {file} with black:\n{black_error}"
                    )
            except Exception:
                print(f"There is a problem in {file}.")
                raise
        # Treat python files
        elif file.endswith(".py"):
            if style_file_docstrings(file, max_len=max_len, check_only=check_only):
                changed.append(file)
            try:
                diff, black_error = style_file_docstrings(file, max_len=max_len, check_only=check_only)
                if diff:
                    changed.append(file)
                if len(black_error) > 0:
                    black_errors.append(
                        f"There was a problem while formatting an example in {file} with black:\n{black_error}"
                    )
            except Exception:
                print(f"There is a problem in {file}.")
                raise
        else:
            warnings.warn(f"Ignoring {file} because it's not a py or an rst file or a folder.")
            warnings.warn(f"Ignoring {file} because it's not a py or an mdx file or a folder.")
    if len(black_errors) > 0:
        black_message = "\n\n".join(black_errors)
        raise ValueError(
            "Some code examples can't be interpreted by black, which means they aren't regular python:\n\n"
            + black_message
            + "\n\nMake sure to fix the corresponding docstring or doc file, or remove the py/python after ``` if it "
            + "was not supposed to be a Python code sample."
        )
    return changed
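Finally, a sketch (not from the upstream file) of the top-level entry point, which is typically driven by the repo's style and quality checks (the exact make targets are an assumption; the paths are illustrative):

    from style_doc import style_doc_files

    # check_only=True mirrors the quality check: files are not modified, and a
    # ValueError is raised if black cannot parse one of the code examples.
    changed = style_doc_files("docs/source", "src/accelerate", max_len=119, check_only=True)
    print(changed)  # the list of files that would be restyled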