Mirror of https://github.com/huggingface/accelerate.git, synced 2025-11-17 16:04:35 +08:00

Compare commits (116 commits)
| SHA1 | Author | Date |
|---|---|---|
| eebeb59a36 | |||
| be4b74f42f | |||
| c93b3eb5d7 | |||
| 3eea8ceee0 | |||
| 7abc708be2 | |||
| bb78b04cce | |||
| 7e6593756f | |||
| 960fd9d86a | |||
| 70ca65a9a1 | |||
| ea0d5368bd | |||
| 78357f44b3 | |||
| c7526e9483 | |||
| f5ef120e77 | |||
| 3c1f97c386 | |||
| a0514dd809 | |||
| b20f90ab17 | |||
| cfb2a3e239 | |||
| 86ce737d7f | |||
| deffaba8d6 | |||
| 6ebddcd5e0 | |||
| 4a7bc3bcb7 | |||
| 1f96f3cf85 | |||
| bbca2700c7 | |||
| a8eca60d57 | |||
| 329209871f | |||
| 619ef04f09 | |||
| 9d8ed50f7b | |||
| 196856f357 | |||
| 3a5490b066 | |||
| 24be733d84 | |||
| 775bc790e7 | |||
| 799fa935e9 | |||
| 3ccbd9f7a0 | |||
| f13c59f91e | |||
| d39c57c11f | |||
| e2a968c66d | |||
| dc243c0db1 | |||
| 97f4c9de61 | |||
| 73a596593e | |||
| eeaba598f4 | |||
| 3d92caa241 | |||
| fa17f207b5 | |||
| 873dcc63a4 | |||
| 40b6fe1784 | |||
| 29eef234c9 | |||
| 3f0876ac03 | |||
| 450d51ce01 | |||
| 1b2da6c6a5 | |||
| 1424a8e00d | |||
| b2afd4e8da | |||
| 2130205626 | |||
| 1703b79a79 | |||
| 05c641bc0c | |||
| da78e296ba | |||
| 9e0fff9291 | |||
| 938b8f358d | |||
| d04e8e2baa | |||
| 8db128498c | |||
| 114707449b | |||
| 3b51d6e9ad | |||
| 174eb3af1d | |||
| d176b552c9 | |||
| 95d1edbf8d | |||
| a91575f1bb | |||
| 146ce3df48 | |||
| 94d88fb50d | |||
| b515800947 | |||
| d1f7f99684 | |||
| 00ee34d9a6 | |||
| f6ec2660f0 | |||
| b3e21686de | |||
| f12ef1416e | |||
| 18085fa250 | |||
| 6be221f15e | |||
| 3c4308e8cd | |||
| 17046bfaf8 | |||
| 07ed7e92b5 | |||
| 5a679d08d3 | |||
| 5a00ece500 | |||
| f62ae86cfb | |||
| f9de557037 | |||
| 517cbf408b | |||
| f626d87eb7 | |||
| 8b8c5345cd | |||
| 41427c594a | |||
| 3c45b6f760 | |||
| b922c63322 | |||
| 23c0341262 | |||
| 6163e20b14 | |||
| d33dc39a32 | |||
| 043d2ec52d | |||
| 64e41a4995 | |||
| 4736c754bf | |||
| 28edac2c4c | |||
| 1700716760 | |||
| aa9b614967 | |||
| 2943172b8f | |||
| f56f4441b3 | |||
| 45359a73ff | |||
| b5b68fbb4d | |||
| d190ed7e41 | |||
| b923e134e7 | |||
| b2956acbe9 | |||
| be0f7ce44f | |||
| 603a53f056 | |||
| 02e2ed567b | |||
| 8abd274a7f | |||
| b05d483944 | |||
| a74c7c9538 | |||
| a60640d7e2 | |||
| 611546f12d | |||
| 7d2a259e3d | |||
| e5c17f36a8 | |||
| 20de3fc959 | |||
| f84cb0c1fa | |||
| 136437e3e8 |
.github/ISSUE_TEMPLATE/bug-report.yml (vendored, new file, 59 lines)

@@ -0,0 +1,59 @@
```yaml
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve Accelerate
labels: [ "bug" ]
body:
  - type: textarea
    id: system-info
    attributes:
      label: System Info
      description: Please share your accelerate configuration with us. You can run the command `accelerate env` and copy-paste its outputs below
      render: Shell
      placeholder: accelerate version, OS, python version, numpy version, torch version, and accelerate's configuration
    validations:
      required: true

  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information
      description: 'The problem arises when using:'
      options:
        - label: "The official example scripts"
        - label: "My own modified scripts"

  - type: checkboxes
    id: information-tasks
    attributes:
      label: Tasks
      description: "The tasks I am working on are:"
      options:
        - label: "One of the scripts in the examples/ folder of Accelerate or an officially supported `no_trainer` script in the `examples` folder of the `transformers` repo (such as `run_no_trainer_glue.py`)"
        - label: "My own task or dataset (give details below)"

  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction
      description: |
        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
        If you have code snippets, error messages, stack traces please provide them here as well.
        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.

      placeholder: |
        Steps to reproduce the behavior:

          1.
          2.
          3.

  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior
      description: "A clear and concise description of what you would expect to happen."
      render: Shell
```
.github/workflows/build-docker-images.yml (vendored, new file, 54 lines)

@@ -0,0 +1,54 @@
```yaml
name: Build Docker images (scheduled)

on:
  workflow_dispatch:
  workflow_call:
  schedule:
    - cron: "0 1 * * *"

concurrency:
  group: docker-image-builds
  cancel-in-progress: false

jobs:
  latest-cpu:
    name: "Latest Accelerate CPU [dev]"
    runs-on: ubuntu-latest
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Check out code
        uses: actions/checkout@v2
      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push CPU
        uses: docker/build-push-action@v2
        with:
          context: ./docker/accelerate-cpu
          push: true
          tags: huggingface/accelerate-cpu

  latest-cuda:
    name: "Latest Accelerate GPU [dev]"
    runs-on: ubuntu-latest
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      - name: Check out code
        uses: actions/checkout@v2
      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and Push GPU
        uses: docker/build-push-action@v2
        with:
          context: ./docker/accelerate-gpu
          push: true
          tags: huggingface/accelerate-gpu
```
.github/workflows/check_dependencies.yml (vendored, new file, 45 lines)

@@ -0,0 +1,45 @@
```yaml
name: Trigger docker images and run slow tests

on:
  push:
    branches:
      - main
  workflow_dispatch:

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
  check-for-setup:
    runs-on: ubuntu-latest
    name: Check if setup was changed
    outputs:
      changed: ${{ steps.was_changed.outputs.changed }}
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: "2"

      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@v22.2

      - name: Was setup changed
        id: was_changed
        run: |
          for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
            if [ `basename "${file}"` = "setup.py" ]; then
              echo ::set-output name=changed::"1"
            fi
          done

  build-docker-containers:
    needs: check-for-setup
    if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
    uses: ./.github/workflows/build-docker-images.yml
    secrets: inherit

  run-tests:
    needs: build-docker-containers
    if: always()
    uses: ./.github/workflows/on-merge.yml
```
.github/workflows/nightly.yml (vendored, new file, 69 lines)

@@ -0,0 +1,69 @@
```yaml
name: Self-hosted runner (scheduled)

on:
  workflow_dispatch:
  schedule:
    - cron: "0 2 * * *"

env:
  RUN_SLOW: "yes"

jobs:
  run_all_tests_single_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    env:
      CUDA_VISIBLE_DEVICES: "0"
    container:
      image: huggingface/accelerate-gpu:latest
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        working-directory: accelerate/
        shell: bash
    steps:
      - name: Update clone & pip install
        run: |
          source activate accelerate
          git config --global --add safe.directory '*'
          git fetch && git checkout ${{ github.sha }}
          pip install -e . --no-deps

      - name: Run test on GPUs
        run: |
          source activate accelerate
          make test
      - name: Run examples on GPUs
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

  run_all_tests_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    env:
      CUDA_VISIBLE_DEVICES: "0,1"
    container:
      image: huggingface/accelerate-gpu:latest
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        working-directory: accelerate/
        shell: bash
    steps:
      - name: Update clone
        run: |
          source activate accelerate
          git config --global --add safe.directory '*'
          git fetch && git checkout ${{ github.sha }}
          pip install -e . --no-deps

      - name: Run test on GPUs
        run: |
          source activate accelerate
          make test

      - name: Run examples on GPUs
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples
```
.github/workflows/on-merge.yml (vendored, new file, 66 lines)

@@ -0,0 +1,66 @@
```yaml
name: Self-hosted runner tests (push to "main")

on:
  workflow_call:
  workflow_dispatch:

env:
  TESTING_MOCKED_DATALOADERS: "1"

jobs:
  run_all_tests_single_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    env:
      CUDA_VISIBLE_DEVICES: "0"
    container:
      image: huggingface/accelerate-gpu:latest
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        working-directory: accelerate/
        shell: bash
    steps:
      - name: Update clone & pip install
        run: |
          source activate accelerate
          git config --global --add safe.directory '*'
          git fetch && git checkout ${{ github.sha }}
          pip install -e .[test,test_trackers]

      - name: Run test on GPUs
        run: |
          source activate accelerate
          make test
      - name: Run examples on GPUs
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples

  run_all_tests_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: huggingface/accelerate-gpu:latest
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
        working-directory: accelerate/
        shell: bash
    steps:
      - name: Update clone
        run: |
          source activate accelerate
          git config --global --add safe.directory '*'
          git fetch && git checkout ${{ github.sha }}
          pip install -e .[test,test_trackers]

      - name: Run test on GPUs
        run: |
          source activate accelerate
          make test

      - name: Run examples on GPUs
        run: |
          source activate accelerate
          pip uninstall comet_ml -y
          make test_examples
```
.github/workflows/quality.yml (vendored, 6 changed lines)

```diff
@@ -7,10 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.6
-      uses: actions/setup-python@v2
+    - name: Set up Python 3.7
+      uses: actions/setup-python@v3
       with:
-        python-version: 3.6
+        python-version: 3.7
     - name: Install Python dependencies
       run: pip install -e .[quality]
     - name: Run Quality check
```
.github/workflows/stale.yml (vendored, new file, 28 lines)

@@ -0,0 +1,28 @@
```yaml
name: Stale Bot

on:
  schedule:
    - cron: "0 15 * * *"
  workflow_dispatch:

jobs:
  close_stale_issues:
    name: Close Stale Issues
    if: github.repository == 'huggingface/accelerate'
    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - uses: actions/checkout@v2

      - name: Setup Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.7

      - name: Install requirements
        run: |
          pip install PyGithub
      - name: Close stale issues
        run: |
          python utils/stale.py
```
.github/workflows/test.yml (vendored, 55 changed lines)

```diff
@@ -2,29 +2,44 @@ name: Run Tests
 
 on: [pull_request]
 
+env:
+  HF_HOME: ~/hf_cache
+  TESTING_MOCKED_DATALOADERS: "1"
+
 jobs:
-  test:
+  run-tests:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        test-kind: [
+            test,
+            test_deepspeed,
+            test_example_differences,
+            test_checkpoint_step,
+            test_checkpoint_epoch,
+            test_rest
+        ]
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.6
-      uses: actions/setup-python@v2
+    - uses: actions/checkout@v3
+    - name: Set up python 3.7
+      uses: actions/setup-python@v3
       with:
-        python-version: 3.6
-    - name: Install Python dependencies
-      run: pip install setuptools==59.5.0; pip install -e .[test,test_trackers]
-    - name: Run Tests
-      run: make test
-
-  test_examples:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.6
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.6
-    - name: Install Python dependencies
-      run: pip install setuptools==59.5.0; pip install -e .[test] tensorboard
+        python-version: 3.7
+
+    - name: Activate python cache
+      uses: actions/cache@v3
+      with:
+        path: |
+          ${{ env.pythonLocation }}
+          ${{ env.HF_HOME }}
+        key: ${{ env.pythonLocation }}-${{ matrix.test-kind }}-${{ hashFiles('setup.py') }}
+
+    - name: Install the library
+      run: |
+        pip install --upgrade pip
+        pip install -e .[test,test_trackers]
+        if [ ${{ matrix.test-kind }} = test_rest ]; then pip uninstall comet_ml -y; fi
+
     - name: Run Tests
-      run: make test_examples
+      run: |
+        make ${{ matrix.test-kind }}
```
Makefile (22 changed lines)

```diff
@@ -1,6 +1,6 @@
 .PHONY: quality style test docs
 
-check_dirs := tests src examples
+check_dirs := tests src examples benchmarks
 
 # Check that source code meets quality standards
 
@@ -25,7 +25,23 @@ style:
 
 # Run tests for the library
 test:
-	python -m pytest -n auto --dist=loadfile -s -v ./tests/ --ignore=./tests/test_examples.py
+	python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py
+
+test_deepspeed:
+	python -m pytest -s -v ./tests/deepspeed
 
 test_examples:
-	python -m pytest -n auto --dist=loadfile -s -v ./tests/test_examples.py
+	python -m pytest -s -v ./tests/test_examples.py
+
+# Broken down example tests for the CI runners
+test_example_differences:
+	python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests
+
+test_checkpoint_epoch:
+	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_epoch"
+
+test_checkpoint_step:
+	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "by_step"
+
+test_rest:
+	python -m pytest -s -v ./tests/test_examples.py::FeatureExamplesTests -k "not by_step and not by_epoch"
```
@@ -212,6 +212,7 @@ If you like the simplicity of 🤗 Accelerate but would prefer a higher-level ab

* [Animus](https://github.com/Scitator/animus) is a minimalistic framework to run machine learning experiments. Animus highlights common "breakpoints" in ML experiments and provides a unified interface for them within [IExperiment](https://github.com/Scitator/animus/blob/main/animus/core.py#L76).
* [Catalyst](https://github.com/catalyst-team/catalyst#getting-started) is a PyTorch framework for Deep Learning Research and Development. It focuses on reproducibility, rapid experimentation, and codebase reuse so you can create something new rather than write yet another train loop. Catalyst provides a [Runner](https://catalyst-team.github.io/catalyst/api/core.html#runner) to connect all parts of the experiment: hardware backend, data transformations, model train, and inference logic.
* [fastai](https://github.com/fastai/fastai#installing) is a PyTorch framework for Deep Learning that simplifies training fast and accurate neural nets using modern best practices. fastai provides a [Learner](https://docs.fast.ai/learner.html#Learner) to handle the training, fine-tuning, and inference of deep learning algorithms.
* [Kornia](https://kornia.readthedocs.io/en/latest/get-started/introduction.html) is a differentiable library that allows classical computer vision to be integrated into deep learning models. Kornia provides a [Trainer](https://kornia.readthedocs.io/en/latest/x.html#kornia.x.Trainer) with the specific purpose to train and fine-tune the supported deep learning algorithms within the library.
* [pytorch-accelerated](https://github.com/Chris-hughes10/pytorch-accelerated) is a lightweight training library, with a streamlined feature set centred around a general-purpose [Trainer](https://pytorch-accelerated.readthedocs.io/en/latest/trainer.html), that places a huge emphasis on simplicity and transparency; enabling users to understand exactly what is going on under the hood, but without having to write and maintain the boilerplate themselves!

@@ -240,4 +241,5 @@ pip install accelerate

- multi-GPU on several nodes (machines)
- TPU
- FP16 with native AMP (apex on the roadmap)
- DeepSpeed support (experimental) [removed]
+ DeepSpeed support (Experimental)
+ PyTorch Fully Sharded Data Parallel (FSDP) support (Experimental)
benchmarks/README.md (new file, 46 lines)

@@ -0,0 +1,46 @@
# Big model inference benchmarks

Running inference with Accelerate on big models.

## Setup

These benchmarks use the `transformers` library:

```bash
pip install transformers
```

To reproduce or test a new setup, run

```bash
python inference_acc.py model_name
```

This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.

To force a different `torch_dtype` than the one in the config: `--torch_dtype xxx`.

If you get an error linked to disk offload, you need to add the option `--disk-offload`.

## Results

On a setup with two Titan RTX GPUs (24GB of RAM each) and 32GB of CPU RAM, we get the following benchmarks (T0pp does not run in float16, which is why it's not included).

| Model | Model load time | Generation time | dtype | GPU 0 use | GPU 1 use | CPU use | Disk offload |
|:-----:|:---------------:|:---------------:|:-----:|:---------:|:---------:|:-------:|:------------:|
| GPT-J-6B | 8.7s | 0.05s per token | float16 | 11.7GB | 0GB | 0GB | no |
| GPT-J-6B | 12.4s | 0.06s per token | float32 | 21.9GB | 1.5GB | 0GB | no |
| GPT-Neo-X-20B | 30.9s | 0.08s per token | float16 | 21.5GB | 18GB | 0GB | no |
| GPT-Neo-X-20B | 78.2s | 10.72s per token | float32 | 20.3GB | 22.7GB | 24.4GB | yes |
| T0pp (11B) | 29.4s | 0.05s per token | float32 | 21.1GB | 21.3GB | 0GB | no |
| OPT-30B | 34.5s | 2.37s per token | float16 | 20.7GB | 22.3GB | 14.1GB | no |
| OPT-30B | 112.3s | 33.9s per token | float32 | 20.2GB | 21.2GB | 23.5GB | yes |

Note on the results:
- using two GPUs instead of one does not slow down generation
- using CPU offload slows down a bit (see OPT-30b)
- using disk offload slows down a lot (need to implement prefetching)

You will also note that Accelerate does not use any more GPU and CPU RAM than necessary:
- peak GPU memory is exactly the size of the model put on a given GPU
- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
benchmarks/big_model_inference.py (new file, 143 lines)

@@ -0,0 +1,143 @@
```python
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time

import torch

import transformers
from accelerate.utils import compute_module_sizes
from measures_util import end_measure, log_measures, start_measure
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer


DEFAULT_MODELS = {
    "gpt-j-6b": {"is_causal": True, "model": "sgugger/sharded-gpt-j-6B", "tokenizer": "EleutherAI/gpt-j-6B"},
    "gpt-neox": {"is_causal": True, "model": "EleutherAI/gpt-neox-20b"},
    "opt": {"is_causal": True, "model": "facebook/opt-30b"},
    "T0pp": {"is_causal": False, "model": "bigscience/T0pp", "model_revision": "sharded"},
}

PROMPTS = [
    "Hello, my name is",
    "Are unicorns real? Unicorns are",
    "For the first time in several years,",
    "My name is Julien and I am",
    "The goal of life is",
    "Whenever I'm sad, I like to",
]


def parse_args():
    parser = argparse.ArgumentParser(description="Run and time generations on a big model using Accelerate.")
    parser.add_argument("model_name", type=str, default=None, help="The name of the model to try.")
    parser.add_argument(
        "--tokenizer_name", type=str, default=None, help="The name of the tokenizer (if different from the model)."
    )
    parser.add_argument("--is_causal", type=bool, default=None, help="Whether or not the model is causal.")
    parser.add_argument(
        "--model_revision", type=str, default=None, help="The revision to use for the model checkpoint."
    )
    parser.add_argument("--torch_dtype", type=str, default=None, help="The dtype for the model.")
    parser.add_argument("--disk_offload", action="store_true")

    args = parser.parse_args()

    # Sanitize args
    if args.model_name in DEFAULT_MODELS:
        defaults = DEFAULT_MODELS[args.model_name]
        args.model_name = defaults["model"]
        if args.tokenizer_name is None:
            args.tokenizer_name = defaults.get("tokenizer", args.model_name)
        if args.is_causal is None:
            args.is_causal = defaults["is_causal"]
        if args.model_revision is None:
            args.model_revision = defaults.get("model_revision", "main")

    if args.is_causal is None:
        raise ValueError("Could not infer the default for `--is_causal`, pass either True or False for it.")
    if args.tokenizer_name is None:
        args.tokenizer_name = args.model_name
    if args.model_revision is None:
        args.model_revision = "main"

    return args


def main():
    transformers.utils.logging.set_verbosity_error()
    args = parse_args()

    if args.torch_dtype is None:
        config = AutoConfig.from_pretrained(args.model_name)
        torch_dtype = getattr(config, "torch_dtype", torch.float32)
    else:
        torch_dtype = getattr(torch, args.torch_dtype)
    model_cls = AutoModelForCausalLM if args.is_causal else AutoModelForSeq2SeqLM
    kwargs = {
        "torch_dtype": torch_dtype,
        "revision": args.model_revision,
    }
    if args.disk_offload:
        kwargs["offload_folder"] = "tmp_offload"
        kwargs["offload_state_dict"] = True

    start_measures = start_measure()
    model = model_cls.from_pretrained(args.model_name, device_map="auto", **kwargs)
    end_measures = end_measure(start_measures)
    log_measures(end_measures, "Model loading")

    module_sizes = compute_module_sizes(model)
    device_size = {v: 0 for v in model.hf_device_map.values()}
    for module, device in model.hf_device_map.items():
        device_size[device] += module_sizes[module]
    message = "\n".join([f"- {device}: {size // 2**20}MiB" for device, size in device_size.items()])
    print(f"\nTheoretical use:\n{message}")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    start_measures = start_measure()
    generation_times = []
    gen_tokens = []
    texts_outs = []
    for prompt in PROMPTS:
        inputs = tokenizer(prompt, return_tensors="pt").to(0)
        tokens = inputs["input_ids"][0].tolist()
        before_generate = time.time()
        outputs = model.generate(inputs["input_ids"])
        after_generate = time.time()
        outputs = outputs[0].tolist()
        num_gen_tokens = len(outputs) if outputs[: len(tokens)] != tokens else len(outputs) - len(tokens)
        generation_time = after_generate - before_generate

        text_out = tokenizer.decode(outputs, skip_special_tokens=True)
        texts_outs.append(text_out)
        generation_times.append(generation_time)
        gen_tokens.append(num_gen_tokens)
        print(f"Prompt: {prompt}\nGeneration {text_out}\nIn {generation_time:.2f}s for {num_gen_tokens} tokens\n")

    end_measures = end_measure(start_measures)
    log_measures(end_measures, "Model generation")

    generation_times_per_token = [gen / tok for gen, tok in zip(generation_times, gen_tokens)]
    avg_gen = sum(generation_times_per_token) / len(generation_times)
    print(f"Average time of generation per token: {avg_gen:.2f}s")
    print(f"First generation (avg time per token): {generation_times_per_token[0]:.2f}s")
    avg_gen = sum(generation_times_per_token[1:]) / (len(generation_times_per_token) - 1)
    print(f"Average time of generation per token (excluding the first): {avg_gen:.2f}s")


if __name__ == "__main__":
    main()
```
benchmarks/measures_util.py (new file, 86 lines)

@@ -0,0 +1,86 @@
```python
import gc
import threading
import time

import torch

import psutil


class PeakCPUMemory:
    def __init__(self):
        self.process = psutil.Process()
        self.peak_monitoring = False

    def peak_monitor(self):
        self.cpu_memory_peak = -1

        while True:
            self.cpu_memory_peak = max(self.process.memory_info().rss, self.cpu_memory_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            if not self.peak_monitoring:
                break

    def start(self):
        self.peak_monitoring = True
        self.thread = threading.Thread(target=self.peak_monitor)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        self.peak_monitoring = False
        self.thread.join()
        return self.cpu_memory_peak


cpu_peak_tracker = PeakCPUMemory()


def start_measure():
    # Time
    measures = {"time": time.time()}

    gc.collect()
    torch.cuda.empty_cache()

    # CPU mem
    measures["cpu"] = psutil.Process().memory_info().rss
    cpu_peak_tracker.start()

    # GPU mem
    for i in range(torch.cuda.device_count()):
        measures[str(i)] = torch.cuda.memory_allocated(i)
    torch.cuda.reset_peak_memory_stats()

    return measures


def end_measure(start_measures):
    # Time
    measures = {"time": time.time() - start_measures["time"]}

    gc.collect()
    torch.cuda.empty_cache()

    # CPU mem
    measures["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / 2**20
    measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20

    # GPU mem
    for i in range(torch.cuda.device_count()):
        measures[str(i)] = (torch.cuda.memory_allocated(i) - start_measures[str(i)]) / 2**20
        measures[f"{i}-peak"] = (torch.cuda.max_memory_allocated(i) - start_measures[str(i)]) / 2**20

    return measures


def log_measures(measures, description):
    print(f"{description}:")
    print(f"- Time: {measures['time']:.2f}s")
    for i in range(torch.cuda.device_count()):
        print(f"- GPU {i} allocated: {measures[str(i)]:.2f}MiB")
        peak = measures[f"{i}-peak"]
        print(f"- GPU {i} peak: {peak:.2f}MiB")
    print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB")
    print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB")
```
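A minimal usage sketch for these helpers (the `run_inference` function is a hypothetical stand-in for whatever workload you want to profile):

```python
# Bracket any block of work with start_measure/end_measure and print the deltas.
from measures_util import end_measure, log_measures, start_measure


def run_inference():
    ...  # hypothetical workload, e.g. model.generate(...)


start_measures = start_measure()        # snapshot time, CPU RSS and per-GPU allocated memory
run_inference()
measures = end_measure(start_measures)  # deltas in MiB, plus CPU and GPU peaks
log_measures(measures, "My workload")
```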
docker/accelerate-cpu/Dockerfile (new file, 35 lines)

@@ -0,0 +1,35 @@
```dockerfile
# Builds CPU-only Docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
FROM python:3.7-slim as compile-image

ARG DEBIAN_FRONTEND=noninteractive

RUN apt update
RUN apt-get install -y --no-install-recommends \
    build-essential \
    git \
    gcc

# Setup virtual environment for Docker
ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv ${VIRTUAL_ENV}
# Make sure we use the virtualenv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
WORKDIR /workspace
# Install specific CPU torch wheel to save on space
RUN python3 -m pip install --upgrade --no-cache-dir pip
RUN python3 -m pip install --no-cache-dir \
    jupyter \
    git+https://github.com/huggingface/accelerate#egg=accelerate[test,test_trackers] \
    --extra-index-url https://download.pytorch.org/whl/cpu

# Stage 2
FROM python:3.7-slim AS build-image
COPY --from=compile-image /opt/venv /opt/venv
RUN useradd -ms /bin/bash user
USER user

# Make sure we use the virtualenv
ENV PATH="/opt/venv/bin:$PATH"
CMD ["/bin/bash"]
```
docker/accelerate-gpu/Dockerfile (new file, 42 lines)

@@ -0,0 +1,42 @@
```dockerfile
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.7.3
# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

# Create our conda env
RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/accelerate/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Activate the conda env and install torch + accelerate
RUN source activate accelerate && \
    python3 -m pip install --no-cache-dir \
    git+https://github.com/huggingface/accelerate#egg=accelerate[test,test_trackers] \
    --extra-index-url https://download.pytorch.org/whl/cu113

# Stage 2
FROM nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

# Install apt libs
RUN apt-get update && \
    apt-get install -y curl git wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists*

RUN echo "source activate accelerate" >> ~/.profile

# Activate the virtualenv
CMD ["/bin/bash"]
```
```diff
@@ -7,6 +7,10 @@
     title: Installation
   title: Get started
 - sections:
+  - local: big_modeling
+    title: Handling big models
+  - local: gradient_accumulation
+    title: Gradient accumulation
   - local: sagemaker
     title: Amazon SageMaker
   title: Guides
@@ -17,14 +21,18 @@
     title: Notebook Launcher
   - local: kwargs
     title: Kwargs Handlers
-  - local: internal
-    title: Internals
   - local: checkpoint
     title: Checkpointing
+  - local: internal
+    title: Internals
   - local: tracking
     title: Experiment Tracking
   - local: fsdp
     title: Fully Sharded Data Parallel
+  - local: memory
+    title: Memory Utilities
   - local: deepspeed
     title: DeepSpeed
+  - local: utilities
+    title: General Utilities
   title: API Reference
```
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.

 # Accelerator

 The [`Accelerator`] is the main class provided by 🤗 Accelerate. It serves at the main entrypoint for
-the API. To quickly adapt your script to work on any kind of setup with 🤗 Accelerate juste:
+the API. To quickly adapt your script to work on any kind of setup with 🤗 Accelerate just:

 1. Initialize an [`Accelerator`] object (that we will call `accelerator` in the rest of this
    page) as early as possible in your script.

@@ -21,10 +21,10 @@ the API. To quickly adapt your script to work on any kind of setup with 🤗 Acc

 3. (Optional but best practice) Remove all the `.cuda()` or `.to(device)` in your code and let the
    `accelerator` handle device placement for you.
 4. Replace the `loss.backward()` in your code by `accelerator.backward(loss)`.
-5. (Optional, when using distributed evaluation) Gather your predictions and labelsbefore storing them or using them
-   for metric computation using [`~Accelerator.gather`].
+5. (Optional, when using distributed evaluation) Gather your predictions and labels before storing them or using
+   them for metric computation using [`~Accelerator.gather`].

-This is all what is needed in most cases. For more advanced case or a nicer experience here are the functions you
+This is all that is needed in most cases. For more advanced cases or a nicer experience here are the functions you
 should search for and replace by the corresponding methods of your `accelerator`:

 - `print` statements should be replaced by [`~Accelerator.print`] to be only printed once per

@@ -38,4 +38,27 @@ should search for and replace by the corresponding methods of your `accelerator`:

 - Use [`~Accelerator.clip_grad_norm_`] instead of `torch.nn.utils.clip_grad_norm_` and
   [`~Accelerator.clip_grad_value_`] instead of `torch.nn.utils.clip_grad_value_`.

+To perform gradient accumulation use [`~Accelerator.accumulate`] and specify a `gradient_accumulation_steps`.
+This will also automatically ensure the gradients are synced or unsynced when on multi-device training, check if the step should
+actually be performed, and auto-scale the loss:
+
+```python
+accelerator = Accelerator(gradient_accumulation_steps=2)
+
+for input, label in training_dataloader:
+    with accelerator.accumulate(model):
+        predictions = model(input)
+        loss = loss_function(predictions, label)
+        accelerator.backward(loss)
+        optimizer.step()
+        scheduler.step()
+        optimizer.zero_grad()
+```
+
+<Tip warning={true}>
+
+Using this with `dispatch_batches=True` (which is the default for iterable datasets) is currently not supported.
+
+</Tip>
+
 [[autodoc]] Accelerator
docs/source/big_modeling.mdx (new file, 232 lines)

@@ -0,0 +1,232 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Handling big models

When loading a pretrained model in PyTorch, the usual workflow looks like this:

```py
import torch

my_model = ModelClass(...)
state_dict = torch.load(checkpoint_file)
my_model.load_state_dict(state_dict)
```

In plain English, those steps are:
1. Create the model with randomly initialized weights
2. Load the model weights (in a dictionary usually called a state dict) from the disk
3. Load those weights inside the model

While this works very well for regularly sized models, this workflow has some clear limitations when we deal with a huge model: in step 1, we load a full version of the model in RAM, and spend some time randomly initializing the weights (which will be discarded in step 3). In step 2, we load another full version of the model in RAM, with the pretrained weights. If you're loading a model with 6 billion parameters, this means you will need 24GB of RAM for each copy of the model, so 48GB in total (half of it to load the model in FP16).

<Tip warning={true}>

This API is quite new and still in its experimental stage. While we strive to provide a stable API, it's possible some small parts of the public API will change in the future.

</Tip>

## Instantiating an empty model

The first tool 🤗 Accelerate introduces to help with big models is a context manager [`init_empty_weights`] that helps you initialize a model without using any RAM, so that step 1 can be done on models of any size. Here is how it works:

```py
from accelerate import init_empty_weights

with init_empty_weights():
    my_model = ModelClass(...)
```

For instance:

```py
with init_empty_weights():
    model = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
```

initializes an empty model with a bit more than 100B parameters. Behind the scenes, this relies on the meta device introduced in PyTorch 1.9. During the initialization under the context manager, each time a parameter is created, it is instantly moved to that device.
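As a quick check (a minimal sketch, assuming a recent PyTorch), parameters created under the context manager live on the meta device and own no storage:

```python
# Parameters created under init_empty_weights are allocated on the "meta" device.
import torch.nn as nn
from accelerate import init_empty_weights

with init_empty_weights():
    tiny = nn.Linear(4, 4)

print(tiny.weight.device)   # meta
print(tiny.weight.numel())  # 16: the shape is known, but no memory backs it
```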
<Tip warning={true}>

You can't move a model initialized like this on CPU or another device directly, since it doesn't have any data. It's also very likely that a forward pass with that empty model will fail, as not all operations are supported on the meta device.

</Tip>

## Sharded checkpoints

It's possible your model is so big that even a single copy won't fit in RAM. That doesn't mean it can't be loaded: if you have one or several GPUs, this is more memory available to store your model. In this case, it's better if your checkpoint is split into several smaller files that we call checkpoint shards.

🤗 Accelerate will handle sharded checkpoints as long as they follow this format: your checkpoint should be in a folder, with several files containing the partial state dicts, and there should be an index in the JSON format that contains a dictionary mapping parameter names to the file containing their weights. For instance, we could have a folder containing:

```bash
first_state_dict.bin
index.json
second_state_dict.bin
```

with index.json being the following file:

```
{
  "linear1.weight": "first_state_dict.bin",
  "linear1.bias": "first_state_dict.bin",
  "linear2.weight": "second_state_dict.bin",
  "linear2.bias": "second_state_dict.bin"
}
```

and `first_state_dict.bin` containing the weights for `"linear1.weight"` and `"linear1.bias"`, `second_state_dict.bin` the ones for `"linear2.weight"` and `"linear2.bias"`.
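As an illustration of that layout, here is a minimal sketch (using a hypothetical two-layer model, not an official Accelerate utility) that writes such a folder with plain `torch.save` and `json.dump`:

```python
# Write a two-shard checkpoint in the layout described above.
import json

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8))
state_dict = {
    f"linear{i + 1}.{name}": tensor
    for i, layer in enumerate(model)
    for name, tensor in layer.state_dict().items()
}

shards = {
    "first_state_dict.bin": {k: v for k, v in state_dict.items() if k.startswith("linear1.")},
    "second_state_dict.bin": {k: v for k, v in state_dict.items() if k.startswith("linear2.")},
}
for filename, shard in shards.items():
    torch.save(shard, filename)

# The index maps every parameter name to the shard file that stores it.
index = {name: filename for filename, shard in shards.items() for name in shard}
with open("index.json", "w") as f:
    json.dump(index, f, indent=2)
```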
## Loading weights

The second tool 🤗 Accelerate introduces is a function [`load_checkpoint_and_dispatch`], that will allow you to load a checkpoint inside your empty model. This supports full checkpoints (a single file containing the whole state dict) as well as sharded checkpoints. It will also automatically dispatch those weights across the devices you have available (GPUs, CPU RAM), so if you are loading a sharded checkpoint, the maximum RAM usage will be the size of the biggest shard.

Here is how we can use this to load the [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) model. You clone the sharded version of this model with:

```bash
git clone https://huggingface.co/sgugger/sharded-gpt-j-6B
cd sharded-gpt-j-6B
git-lfs install
git pull
```

then we can initialize the model with

```py
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

checkpoint = "EleutherAI/gpt-j-6B"
config = AutoConfig.from_pretrained(checkpoint)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
```

and load the checkpoint we just downloaded with:

```py
from accelerate import load_checkpoint_and_dispatch

model = load_checkpoint_and_dispatch(
    model, "sharded-gpt-j-6B", device_map="auto", no_split_module_classes=["GPTJBlock"]
)
```

By passing `device_map="auto"`, we tell 🤗 Accelerate to determine automatically where to put each layer of the model depending on the available resources:
- first we use the maximum space available on the GPU(s)
- if we still need space, we store the remaining weights on the CPU
- if there is not enough RAM, we store the remaining weights on the hard drive as memory-mapped tensors

`no_split_module_classes=["GPTJBlock"]` indicates that the modules that are `GPTJBlock` should not be split on different devices. You should set here all blocks that include a residual connection of some kind.

You can see the `device_map` that 🤗 Accelerate picked by accessing the `hf_device_map` attribute of your model:

```py
model.hf_device_map
```

```python out
{'transformer.wte': 0,
 'transformer.drop': 0,
 'transformer.h.0': 0,
 'transformer.h.1': 0,
 'transformer.h.2': 0,
 'transformer.h.3': 0,
 'transformer.h.4': 0,
 'transformer.h.5': 0,
 'transformer.h.6': 0,
 'transformer.h.7': 0,
 'transformer.h.8': 0,
 'transformer.h.9': 0,
 'transformer.h.10': 0,
 'transformer.h.11': 0,
 'transformer.h.12': 0,
 'transformer.h.13': 0,
 'transformer.h.14': 0,
 'transformer.h.15': 0,
 'transformer.h.16': 0,
 'transformer.h.17': 0,
 'transformer.h.18': 0,
 'transformer.h.19': 0,
 'transformer.h.20': 0,
 'transformer.h.21': 0,
 'transformer.h.22': 0,
 'transformer.h.23': 0,
 'transformer.h.24': 1,
 'transformer.h.25': 1,
 'transformer.h.26': 1,
 'transformer.h.27': 1,
 'transformer.ln_f': 1,
 'lm_head': 1}
```

You can also design your `device_map` yourself, if you prefer to explicitly decide where each layer should be. In this case, the command above becomes:

```py
model = load_checkpoint_and_dispatch(model, "sharded-gpt-j-6B", device_map=my_device_map)
```
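For illustration, a hand-written `my_device_map` is simply a dictionary from module names to a GPU index, `"cpu"` or `"disk"`; the split below is hypothetical, and any module not listed must be covered by one of its parent entries:

```python
# Example hand-written device map for the GPT-J model above (illustrative split).
my_device_map = {
    "transformer.wte": 0,
    "transformer.drop": 0,
    **{f"transformer.h.{i}": 0 for i in range(14)},
    **{f"transformer.h.{i}": 1 for i in range(14, 28)},
    "transformer.ln_f": 1,
    "lm_head": "cpu",
}
```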
## Run the model

Now that we have done this, our model lies across several devices, and maybe the hard drive. But it can still be used as a regular PyTorch model:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
inputs = inputs.to(0)
output = model.generate(inputs["input_ids"])
tokenizer.decode(output[0].tolist())
```

Behind the scenes, 🤗 Accelerate added hooks to the model, so that:
- at each layer, the inputs are put on the right device (so even if your model is spread across several GPUs, it works)
- for the weights offloaded on the CPU, they are put on a GPU just before the forward pass, and cleaned up just after
- for the weights offloaded on the hard drive, they are loaded in RAM then put on a GPU just before the forward pass, and cleaned up just after

This way, your model can run for inference even if it doesn't fit on one of the GPUs or the CPU RAM!

<Tip warning={true}>

This only supports inference of your model, not training. Most of the computation happens behind `torch.no_grad()` context managers to avoid spending some GPU memory with intermediate activations.

</Tip>

## Limits and further development

We are aware of the current limitations in the API:

- While this could theoretically work on just one CPU with potential disk offload, you need at least one GPU to run this API. This will be fixed in further development.
- [`infer_auto_device_map`] (or `device_map="auto"` in [`load_checkpoint_and_dispatch`]) tries to maximize GPU and CPU RAM it sees available when you execute it. While PyTorch is very good at managing GPU RAM efficiently (and giving it back when not needed), it's not entirely true with Python and CPU RAM. Therefore, an automatically computed device map might be too intense on the CPU. Move a few modules to the disk device if you get crashes due to lack of RAM.
- [`infer_auto_device_map`] (or `device_map="auto"` in [`load_checkpoint_and_dispatch`]) attributes devices sequentially (to avoid moving things back and forth) so if your first layer is bigger than the size of the GPU you have, it will end up with everything on the CPU/Disk.
- [`load_checkpoint_and_dispatch`] and [`load_checkpoint_in_model`] do not perform any check on the correctness of your state dict compared to your model at the moment (this will be fixed in a future version), so you may get some weird errors if trying to load a checkpoint with mismatched or missing keys.
- The model parallelism used when your model is split on several GPUs is naive and not optimized, meaning that only one GPU works at a given time and the other sits idle.
- When weights are offloaded on the CPU/hard drive, there is no pre-fetching (yet, we will work on this for future versions) which means the weights are put on the GPU when they are needed and not before.
- Hard-drive offloading might be very slow if the hardware you run on does not have fast communication between disk and CPU (like NVMes).

## API doc

[[autodoc]] cpu_offload

[[autodoc]] disk_offload

[[autodoc]] dispatch_model

[[autodoc]] infer_auto_device_map

[[autodoc]] init_empty_weights

[[autodoc]] load_checkpoint_and_dispatch

[[autodoc]] load_checkpoint_in_model
@@ -12,8 +12,8 @@ specific language governing permissions and limitations under the License.

 # Checkpointing

-When training a PyTorch model with Accelerate, you may often want to save and continue a state of training. Doing so requires
-saving and loading the model, optimizer, RNG generators, and the GradScaler. Inside Accelerate are two convience functions to achieve this quickly:
+When training a PyTorch model with 🤗 Accelerate, you may often want to save and continue a state of training. Doing so requires
+saving and loading the model, optimizer, RNG generators, and the GradScaler. Inside 🤗 Accelerate are two convenience functions to achieve this quickly:
 - Use [`~Accelerator.save_state`] for saving everything mentioned above to a folder location
 - Use [`~Accelerator.load_state`] for loading everything stored from an earlier `save_state`

@@ -57,4 +57,4 @@ for epoch in range(num_epochs):

 # Restore previous state
 accelerate.load_state("my/save/path")
-```
+```
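A minimal sketch of the two calls (assuming the `Accelerator` instance is the one your training objects were prepared with):

```python
# Save and restore training state with Accelerate.
from accelerate import Accelerator

accelerator = Accelerator()
# model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)

accelerator.save_state("my/save/path")  # model, optimizer, RNG states and GradScaler
accelerator.load_state("my/save/path")  # restore everything saved above
```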
508
docs/source/deepspeed.mdx
Normal file
508
docs/source/deepspeed.mdx
Normal file
@ -0,0 +1,508 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DeepSpeed
|
||||
|
||||
[DeepSpeed](https://github.com/microsoft/DeepSpeed) implements everything described in the [ZeRO paper](https://arxiv.org/abs/1910.02054). Currently it provides full support for:
|
||||
|
||||
1. Optimizer state partitioning (ZeRO stage 1)
|
||||
2. Gradient partitioning (ZeRO stage 2)
|
||||
3. Parameter partitioning (ZeRO stage 3)
|
||||
4. Custom mixed precision training handling
|
||||
5. A range of fast CUDA-extension-based optimizers
|
||||
6. ZeRO-Offload to CPU and Disk/NVMe
|
||||
|
||||
ZeRO-Offload has its own dedicated paper: [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840). And NVMe-support is described in the paper [ZeRO-Infinity: Breaking the GPU
|
||||
Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857).
|
||||
|
||||
DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference.
|
||||
|
||||
DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which
|
||||
won't be possible on a single GPU.
|
||||
|
||||
🤗 Accelerate integrates [DeepSpeed](https://github.com/microsoft/DeepSpeed) via 2 options:
|
||||
|
||||
1. Integration of the DeepSpeed features via `deepspeed config file` specification in `accelerate config` . You just supply your custom config file or use our template. Most of
|
||||
this document is focused on this feature. This supports all the core features of DeepSpeed and gives user a lot of flexibility.
|
||||
User may have to change few lines of code depending on the config.
|
||||
2. Integration via `deepspeed_plugin`.This supports subset of the DeepSpeed features and uses default options for the rest of the configurations.
|
||||
User need not change any code and is good for those who are fine with most of the default settings of DeepSpeed.
|
||||
|
||||
## What is integrated?
|
||||
|
||||
Training:
|
||||
|
||||
1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 as well as CPU/Disk offload of optimizer states, gradients and parameters.
|
||||
Below is a short description of Data Parallelism using ZeRO - Zero Redundancy Optimizer along with diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
|
||||

|
||||
|
||||
(Source: [link](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/))
|
||||
|
||||
a. **Stage 1** : Shards optimizer states across data parallel workers/GPUs
|
||||
|
||||
b. **Stage 2** : Shards optimizer states + gradients across data parallel workers/GPUs
|
||||
|
||||
c. **Stage 3**: Shards optimizer states + gradients + model parameters across data parallel workers/GPUs
|
||||
|
||||
d. **Optimizer Offload**: Offloads the gradients + optimizer states to CPU/Disk building on top of ZERO Stage 2
|
||||
|
||||
e. **Param Offload**: Offloads the model parameters to CPU/Disk building on top of ZERO Stage 3
|
||||
|
||||
<u>Note</u>: With respect to Disk Offload, the disk should be an NVME for decent speed but it technically work on any Disk
|
||||
|
||||
Inference:
|
||||
|
||||
1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but
|
||||
it doesn't use an optimizer and a lr scheduler and only stage 3 is relevant. For more details see:
|
||||
[deepspeed-zero-inference](#deepspeed-zero-inference).
|
||||
|
||||
|
||||
## How it works?
|
||||
|
||||
**Pre-Requisites**: Install DeepSpeed version >=0.6.5. Please refer to the [DeepSpeed Insallation details](https://github.com/microsoft/DeepSpeed#installation)
|
||||
for more information.
|
||||
|
||||
We will first look at easy to use integration via `accelerate config`.
|
||||
Followed by more flexible and feature rich `deepspeed config file` integration.
|
||||
|
||||
### Accelerate DeepSpeed Plugin
|
||||
On your machine(s) just run:
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
and answer the questions asked. It will ask whether you want to use a config file for DeepSpeed to which you should answer no. Then answer the following questions to generate a basic DeepSpeed config.
|
||||
This will generate a config file that will be used automatically to properly set the
|
||||
default options when doing
|
||||
|
||||
```bash
|
||||
accelerate launch my_script.py --args_to_my_script
|
||||
```
|
||||
|
||||
For instance, here is how you would run the NLP example `examples/nlp_example.py` (from the root of the repo) with DeepSpeed Plugin:
|
||||
|
||||
**ZeRO Stage-2 DeepSpeed Plugin Example**
|
||||
```bash
|
||||
compute_environment: LOCAL_MACHINE
|
||||
deepspeed_config:
|
||||
gradient_accumulation_steps: 1
|
||||
gradient_clipping: 1.0
|
||||
offload_optimizer_device: none
|
||||
offload_param_device: none
|
||||
zero3_init_flag: true
|
||||
zero_stage: 2
|
||||
distributed_type: DEEPSPEED
|
||||
fsdp_config: {}
|
||||
machine_rank: 0
|
||||
main_process_ip: null
|
||||
main_process_port: null
|
||||
main_training_function: main
|
||||
mixed_precision: fp16
|
||||
num_machines: 1
|
||||
num_processes: 2
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
```bash
|
||||
accelerate launch examples/nlp_example.py --mixed_precision fp16
|
||||
```
|
||||
|
||||
**ZeRO Stage-3 with CPU Offload DeepSpeed Plugin Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

```bash
accelerate launch examples/nlp_example.py --mixed_precision fp16
```

Currently, `Accelerate` supports the following config through the CLI:

```bash
`zero_stage`: [0] Disabled, [1] optimizer state partitioning, [2] optimizer+gradient state partitioning and [3] optimizer+gradient+parameter partitioning
`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them.
`gradient_clipping`: Enable gradient clipping with value.
`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2.
`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3.
`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3.
`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3.
`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training.
```

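If you would rather skip the interactive prompt, the same options can also be set programmatically. Below is a minimal, hedged sketch (not taken from the example scripts) mirroring the ZeRO Stage-2 plugin example above; the argument names follow the `DeepSpeedPlugin` dataclass and may vary slightly across `accelerate` versions:

```python
# A minimal sketch: configuring the DeepSpeed integration in code instead of via `accelerate config`.
from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin

deepspeed_plugin = DeepSpeedPlugin(
    zero_stage=2,                    # ZeRO optimization stage
    gradient_accumulation_steps=1,   # matches `gradient_accumulation_steps` in the config above
    gradient_clipping=1.0,           # matches `gradient_clipping` in the config above
)
accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
```
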
To be able to tweak more options, you will need to use a DeepSpeed config file.

### DeepSpeed Config File

On your machine(s) just run:

```bash
accelerate config
```

and answer the questions asked. It will ask whether you want to use a config file for DeepSpeed, to which you answer yes
and provide the path to the DeepSpeed config file.
This will generate a config file that will be used automatically to properly set the
default options when doing

```bash
accelerate launch my_script.py --args_to_my_script
```

For instance, here is how you would run the NLP example `examples/by_feature/deepspeed_with_config_support.py` (from the root of the repo) with DeepSpeed Config File:

**ZeRO Stage-2 DeepSpeed Config File Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: /home/ubuntu/accelerate/examples/configs/deepspeed_config_templates/zero_stage2_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

with the contents of `zero_stage2_config.json` being:
```json
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
```

```bash
accelerate launch examples/by_feature/deepspeed_with_config_support.py \
--config_name "gpt2-large" \
--tokenizer_name "gpt2-large" \
--dataset_name "wikitext" \
--dataset_config_name "wikitext-2-raw-v1" \
--block_size 128 \
--output_dir "./clm/clm_deepspeed_stage2_accelerate" \
--learning_rate 5e-4 \
--per_device_train_batch_size 24 \
--per_device_eval_batch_size 24 \
--num_train_epochs 3 \
--with_tracking \
--report_to "wandb"
```

**ZeRO Stage-3 with CPU offload DeepSpeed Config File Example**
```bash
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: /home/ubuntu/accelerate/examples/configs/deepspeed_config_templates/zero_stage3_offload_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
```

with the contents of `zero_stage3_offload_config.json` being:
```json
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "sub_group_size": 1e9,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": "auto"
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
```

```bash
accelerate launch examples/by_feature/deepspeed_with_config_support.py \
--config_name "gpt2-large" \
--tokenizer_name "gpt2-large" \
--dataset_name "wikitext" \
--dataset_config_name "wikitext-2-raw-v1" \
--block_size 128 \
--output_dir "./clm/clm_deepspeed_stage3_offload_accelerate" \
--learning_rate 5e-4 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--num_train_epochs 3 \
--with_tracking \
--report_to "wandb"
```

**Important code changes when using DeepSpeed Config File**

1. DeepSpeed Optimizers and Schedulers. For more information on these,
see the [DeepSpeed Optimizers](https://deepspeed.readthedocs.io/en/latest/optimizers.html) and [DeepSpeed Schedulers](https://deepspeed.readthedocs.io/en/latest/schedulers.html) documentation.
We will look at the changes needed in the code when using these.

a. DS Optim + DS Scheduler: The case when both `optimizer` and `scheduler` keys are present in the DeepSpeed config file.
In this situation, those will be used and the user has to use `accelerate.utils.DummyOptim` and `accelerate.utils.DummyScheduler` to replace the PyTorch/Custom optimizers and schedulers in their code.
Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
```python
# Creates Dummy Optimizer if `optimizer` was specified in the config file, else creates Adam Optimizer
optimizer_cls = (
    torch.optim.AdamW
    if accelerator.state.deepspeed_plugin is None
    or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
    else DummyOptim
)
optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

# Creates Dummy Scheduler if `scheduler` was specified in the config file, else creates `args.lr_scheduler_type` Scheduler
if (
    accelerator.state.deepspeed_plugin is None
    or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
):
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )
else:
    lr_scheduler = DummyScheduler(
        optimizer, total_num_steps=args.max_train_steps, warmup_num_steps=args.num_warmup_steps
    )
```

b. Custom Optim + Custom Scheduler: The case when both `optimizer` and `scheduler` keys are absent in the DeepSpeed config file.
In this situation, no code changes are needed from the user, and this is the case when using the integration via the DeepSpeed Plugin.
In the above example, we can see that the code remains unchanged if the `optimizer` and `scheduler` keys are absent in the DeepSpeed config file.

c. Custom Optim + DS Scheduler: The case when only the `scheduler` key is present in the DeepSpeed config file.
In this situation, the user has to use `accelerate.utils.DummyScheduler` to replace the PyTorch/Custom scheduler in their code, as sketched below.

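Below is a minimal, hedged sketch of case (c), not taken from the example script; it assumes `model`, `train_dataloader`, `accelerator`, `max_train_steps` and `num_warmup_steps` are already defined as in the example above:

```python
# Hypothetical sketch for case (c): the config file only defines a `scheduler`,
# so the real (PyTorch) optimizer is kept while the scheduler becomes a placeholder
# that DeepSpeed fills in from the config file during `prepare`.
import torch
from accelerate.utils import DummyScheduler

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # custom/PyTorch optimizer kept as-is
lr_scheduler = DummyScheduler(
    optimizer, total_num_steps=max_train_steps, warmup_num_steps=num_warmup_steps
)
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)
```
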
d. DS Optim + Custom Scheduler: The case when only the `optimizer` key is present in the DeepSpeed config file.
This will result in an error because one can only use a DS Scheduler when using a DS Optim.

2. Notice the `auto` values in the above example DeepSpeed config files. These are automatically handled by the `prepare` method
based on the model, dataloaders, dummy optimizer and dummy schedulers provided to the `prepare` method.
Only the `auto` fields specified in the above examples are handled by the `prepare` method and the rest have to be explicitly specified by the user.

## Saving and loading

1. Saving and loading of models is unchanged for ZeRO Stage-1 and Stage-2.

2. Under ZeRO Stage-3, `state_dict` contains just the placeholders since the model weights are partitioned across multiple GPUs.
ZeRO Stage-3 has 2 options:

a. Saving the entire 16-bit model weights to directly load later on using `model.load_state_dict(torch.load(pytorch_model.bin))`.
For this, either set `zero_optimization.stage3_gather_16bit_weights_on_model_save` to True in the DeepSpeed Config file or set
`zero3_save_16bit_model` to True in the DeepSpeed Plugin.
**Note that this option requires consolidation of the weights on one GPU, so it can be slow and memory demanding; only use this feature when needed.**
Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
```python
unwrapped_model = accelerator.unwrap_model(model)

# New Code #
# Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
# `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
# `zero3_save_16bit_model` is True in DeepSpeed Plugin.
# For ZeRO Stages 1 and 2, models are saved as usual in the output directory.
# The model name saved is `pytorch_model.bin`
unwrapped_model.save_pretrained(
    args.output_dir,
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
    state_dict=accelerator.get_state_dict(model),
)
```

b. To get 32-bit weights, first save the model using `model.save_checkpoint()`.
Below is the snippet from `examples/by_feature/deepspeed_with_config_support.py` showing this:
```python
success = model.save_checkpoint(PATH, ckpt_id, checkpoint_state_dict)
status_msg = "checkpointing: PATH={}, ckpt_id={}".format(PATH, ckpt_id)
if success:
    logging.info(f"Success {status_msg}")
else:
    logging.warning(f"Failure {status_msg}")
```
This will create ZeRO model and optimizer partitions along with a `zero_to_fp32.py` script in the checkpoint directory.
One can use this script to do offline consolidation.
It requires no configuration files or GPUs. Here is an example of its usage:
```bash
$ cd /path/to/checkpoint_dir
$ ./zero_to_fp32.py . pytorch_model.bin
Processing zero checkpoint at global_step1
Detected checkpoint of type zero stage 3, world_size: 2
Saving fp32 state dict to pytorch_model.bin (total_numel=60506624)
```
To get the 32-bit model for saving/inference, you can do the following:
```python
from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint

unwrapped_model = accelerator.unwrap_model(model)
fp32_model = load_state_dict_from_zero_checkpoint(unwrapped_model, checkpoint_dir)
```
If you are only interested in the `state_dict`, you can do the following:
```python
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
```
Note that all these functions require ~2x memory (general RAM) of the size of the final checkpoint.

## ZeRO Inference
DeepSpeed ZeRO Inference supports ZeRO Stage 3 with ZeRO-Infinity.
It uses the same ZeRO protocol as training, but it doesn't use an optimizer and a lr scheduler and only Stage 3 is relevant.
With the Accelerate integration, you just have to prepare the model and dataloader as shown below:

```python
model, eval_dataloader = accelerator.prepare(model, eval_dataloader)
```

## A few caveats to be aware of

1. The current integration doesn't support Pipeline Parallelism of DeepSpeed.
2. The current integration doesn't support `mpu`, limiting the tensor parallelism which is supported in Megatron-LM.
3. The current integration doesn't support multiple models for a given `accelerator` object.

## Internals

[[autodoc]] utils.DeepSpeedPlugin

[[autodoc]] utils.DummyOptim

[[autodoc]] utils.DummyScheduler

[[autodoc]] utils.DeepSpeedEngineWrapper

[[autodoc]] utils.DeepSpeedOptimizerWrapper

[[autodoc]] utils.DeepSpeedSchedulerWrapper


## Main DeepSpeed Resources

- [Project's github](https://github.com/microsoft/deepspeed)
- [Usage docs](https://www.deepspeed.ai/getting-started/)
- [API docs](https://deepspeed.readthedocs.io/en/latest/index.html)
- [Blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed)

Papers:

- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)

Finally, please remember that 🤗 `Accelerate` only integrates DeepSpeed, so if you
have any problems or questions with regards to DeepSpeed usage, please file an issue with [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues).

@@ -18,7 +18,7 @@ To read more about it and the benefits, check out the [Fully Sharded Data Parall
We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature.
All you need to do is enable it through the config.

## How it works out the box
## How it works out of the box

On your machine(s) just run:

@@ -57,7 +57,7 @@ use_cpu: false
accelerate launch examples/nlp_example.py
```

Currently, `Accelerate` supports following config through the CLI:
Currently, `Accelerate` supports the following config through the CLI:

```bash
`Sharding Strategy`: [1] FULL_SHARD, [2] SHARD_GRAD_OP
@@ -65,11 +65,11 @@ Currently, `Accelerate` supports following config through the CLI:
`Offload Params`: Decides Whether to offload parameters and gradients to CPU.
```

## Few caveats to be aware of
## A few caveats to be aware of

- PyTorch FSDP auto wraps sub-modules, flattens the parameters and shards the parameters in place.
Due to this, any optimizer created before model wrapping gets broken and occupies more memory.
Hence, it is highly recommended and efficient to prepare model before creating optimizer.
Hence, it is highly recommended and efficient to prepare the model before creating the optimizer.
`Accelerate` will automatically wrap the model and create an optimizer for you in case of single model with a warning message.
> FSDP Warning: When using FSDP, it is efficient and recommended to call prepare for the model before creating the optimizer

@@ -91,14 +91,14 @@ optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

```

- In case of a single model, if you have created optimizer with multiple parameter groups and called prepare with them together,
- In case of a single model, if you have created the optimizer with multiple parameter groups and called prepare with them together,
then the parameter groups will be lost and the following warning is displayed:
> FSDP Warning: When using FSDP, several parameter groups will be conflated into
> a single one due to nested module wrapping and parameter flattening.

This is because parameter groups created before wrapping will have no meaning post wrapping due parameter flattening of nested FSDP modules into 1D arrays (which can consume many layers).
For instance, below are the named parameters of FSDP model on GPU 0 (When using 2 GPUs. Around 55M (110M/2) params in 1D arrays as this will have the 1st shard of the parameters).
Here, if one has applied no weight decay for [bias, LayerNorm.weight] named parameters of unwrapped BERT model,
This is because parameter groups created before wrapping will have no meaning post wrapping due to parameter flattening of nested FSDP modules into 1D arrays (which can consume many layers).
For instance, below are the named parameters of an FSDP model on GPU 0 (When using 2 GPUs. Around 55M (110M/2) params in 1D arrays as this will have the 1st shard of the parameters).
Here, if one has applied no weight decay for [bias, LayerNorm.weight] the named parameters of an unwrapped BERT model,
it can't be applied to the below FSDP wrapped model as there are no named parameters with either of those strings and
the parameters of those layers are concatenated with parameters of various other layers.
```
@@ -110,7 +110,7 @@ optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
```


- In case of multiple models, it is necessary to prepare the models before creating optimizers else it will throw an error.
- In case of multiple models, it is necessary to prepare the models before creating optimizers or else it will throw an error.
- Mixed precision is currently not supported with FSDP.

For more control, users can leverage the `FullyShardedDataParallelPlugin` wherein they can specify `auto_wrap_policy`, `backward_prefetch` and `ignored_modules`, as in the sketch below.

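As an illustration (not part of the diff above), a customized FSDP plugin might be passed to the `Accelerator` along these lines; the exact field names follow the `FullyShardedDataParallelPlugin` dataclass and may differ between `accelerate`/PyTorch versions:

```python
# A minimal, hedged sketch: configuring FSDP in code rather than via `accelerate config`.
# `model` is assumed to be an already-instantiated PyTorch module.
import torch
from torch.distributed.fsdp import BackwardPrefetch
from accelerate import Accelerator, FullyShardedDataParallelPlugin

fsdp_plugin = FullyShardedDataParallelPlugin(
    backward_prefetch=BackwardPrefetch.BACKWARD_PRE,  # prefetch next gradients during the backward pass
    ignored_modules=None,  # optionally, a list of sub-modules FSDP should leave unwrapped
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
model = accelerator.prepare(model)  # prepare the model first, then create the optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-5)
```
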
126	docs/source/gradient_accumulation.mdx	Normal file
@@ -0,0 +1,126 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Performing gradient accumulation with 🤗 Accelerate

Gradient accumulation is a technique where you can train on bigger batch sizes than
your machine would normally be able to fit into memory. This is done by accumulating gradients over
several batches, and only stepping the optimizer after a certain number of batches have been performed.

While technically standard gradient accumulation code would work fine in a distributed setup, it is not the most efficient
method for doing so and you may experience considerable slowdowns!

In this tutorial you will see how to quickly set up gradient accumulation and perform it with the utilities provided in 🤗 Accelerate,
which can add as little as one new line of code!

This example will use a very simplistic PyTorch training loop that performs gradient accumulation every two batches:

```python
device = "cuda"
model.to(device)

gradient_accumulation_steps = 2

for index, batch in enumerate(training_dataloader):
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss = loss / gradient_accumulation_steps
    loss.backward()
    if (index + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

## Converting it to 🤗 Accelerate

First the code shown earlier will be converted to utilize 🤗 Accelerate without the special gradient accumulation helper:

```diff
+ from accelerate import Accelerator
+ accelerator = Accelerator()

+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+     model, optimizer, training_dataloader, scheduler
+ )

  for index, batch in enumerate(training_dataloader):
      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
      outputs = model(inputs)
      loss = loss_function(outputs, targets)
      loss = loss / gradient_accumulation_steps
-     loss.backward()
+     accelerator.backward(loss)
      if (index + 1) % gradient_accumulation_steps == 0:
          optimizer.step()
          scheduler.step()
          optimizer.zero_grad()
```

<Tip warning={true}>
In its current state, this code is not going to perform gradient accumulation efficiently due to a process called gradient synchronization.
</Tip>

## Letting 🤗 Accelerate handle gradient accumulation

All that is left now is to let 🤗 Accelerate handle the gradient accumulation for us. To do so you should pass in a `gradient_accumulation_steps` parameter to [`Accelerator`], dictating the number
of steps to perform before each call to `step()` and how to automatically adjust the loss during the call to [`Accelerator.backward`]:

```diff
  from accelerate import Accelerator
- accelerator = Accelerator()
+ accelerator = Accelerator(gradient_accumulation_steps=2)
```

From here you can use the [`Accelerator.accumulate`] context manager from inside your training loop to automatically perform the gradient accumulation for you!
You just wrap it around the entire training part of your code:

```diff
- for index, batch in enumerate(training_dataloader):
+ for batch in training_dataloader:
+     with accelerator.accumulate(model):
          inputs, targets = batch
          outputs = model(inputs)
```

and you can remove all the special checks for the step number and the loss adjustment:

```diff
- loss = loss / gradient_accumulation_steps
  accelerator.backward(loss)
- if (index + 1) % gradient_accumulation_steps == 0:
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()
```

As you can see the [`Accelerator`] is able to keep track of the batch number you are on and it will automatically know whether to step through the prepared optimizer and how to adjust the loss.

## The finished code

Below is the finished implementation for performing gradient accumulation with 🤗 Accelerate

```python
for batch in training_dataloader:
    with accelerator.accumulate(model):
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

@@ -12,16 +12,16 @@ specific language governing permissions and limitations under the License.

# Accelerate

Run your *raw* PyTorch training script on any kind of device
Run your *raw* PyTorch training script on any kind of device.

## Features

- 🤗 Accelerate provides an easy API to make your scripts run with mixed precision and on any kind of distributed
setting (multi-GPUs, TPUs etc.) while still letting you write your own training loop. The same code can then runs
- 🤗 Accelerate provides an easy API to make your scripts run with mixed precision and in any kind of distributed
setting (multi-GPUs, TPUs etc.) while still letting you write your own training loop. The same code can then run
seamlessly on your local machine for debugging or your training environment.

- 🤗 Accelerate also provides a CLI tool that allows you to quickly configure and test your training environment then
launch the scripts.
- 🤗 Accelerate also provides a CLI tool that allows you to quickly configure and test your training environment and
then launch the scripts.


## Easy to integrate
@@ -52,7 +52,7 @@ Changing it to work with accelerate is really easy and only adds a few lines of
+ device = accelerator.device
  my_model.to(device)
  # Pass every important object (model, optimizer, dataloader) to *accelerator.prepare*
+ my_model, my_optimizer, my_training_dataloader = accelerate.prepare(
+ my_model, my_optimizer, my_training_dataloader = accelerator.prepare(
+     my_model, my_optimizer, my_training_dataloader
+ )

@@ -57,7 +57,7 @@ pip install git+https://github.com/huggingface/accelerate

Note that this will install not the latest released version, but the bleeding edge `main` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't been yet rolled out.

While we strive to keep `main` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/accelerate/issues) and this way, things will get fixed even sooner.
While we strive to keep `main` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/accelerate/issues) and this way, things will get fixed even sooner.

Again, you can run:

@@ -85,7 +85,7 @@ now this editable install will reside where you clone the folder to, e.g. `~/acc

Do note that you have to keep that `accelerate` folder around and not delete it to continue using the 🤗 Accelerate library.

Now, let's get to the real benefit of this installation approach. Say, you saw some new feature has been just committed into `main`. If you have already performed all the steps above, to update your accelerate repo to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:
Now, let's get to the real benefit of this installation approach. Say, you saw some new feature just has been committed into `main`. If you have already performed all the steps above, to update your accelerate repo to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:

```bash
cd ~/accelerate/

@@ -12,6 +12,10 @@ specific language governing permissions and limitations under the License.

# Internals

## Gradient Accumulation states

[[autodoc]] state.GradientState

## Optimizer

[[autodoc]] optimizer.AcceleratedOptimizer
@@ -22,7 +26,7 @@ The main work on your PyTorch `DataLoader` is done by the following function:

[[autodoc]] data_loader.prepare_data_loader

### BatchSamplerShard
### DataLoaderShard

[[autodoc]] data_loader.DataLoaderShard

@@ -44,26 +48,6 @@ The main work on your PyTorch `DataLoader` is done by the following function:

[[autodoc]] state.AcceleratorState

### DistributedType

[[autodoc]] state.DistributedType

## Tracking

[[autodoc]] tracking.GeneralTracker

## Utilities

[[autodoc]] utils.extract_model_from_parallel

[[autodoc]] utils.gather

[[autodoc]] utils.send_to_device

[[autodoc]] utils.set_seed

[[autodoc]] utils.synchronize_rng_state

[[autodoc]] utils.synchronize_rng_states

[[autodoc]] utils.wait_for_everyone

@@ -48,4 +48,4 @@ def training_function(args):
+ inner_training_loop()
```

[[autodoc]] memory_utils.find_executable_batch_size
[[autodoc]] utils.find_executable_batch_size
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

# Quick tour

Let's have a look at a look at 🤗 Accelerate main features and traps to avoid.
Let's have a look at the 🤗 Accelerate main features and traps to avoid.

## Main use

@@ -54,7 +54,7 @@ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
)
```

In particular, your training dataloader will be sharded accross all GPUs/TPU cores available so that each one sees a
In particular, your training dataloader will be sharded across all GPUs/TPU cores available so that each one sees a
different portion of the training dataset. Also, the random states of all processes will be synchronized at the
beginning of each iteration through your dataloader, to make sure the data is shuffled the same way (if you decided to
use `shuffle=True` or any kind of random sampler).
@@ -118,7 +118,7 @@ method:
validation_dataloader = accelerator.prepare(validation_dataloader)
```

Like for your training dataloader, it will mean that (should you run your script on multiple devices) each device will
As for your training dataloader, it will mean that (should you run your script on multiple devices) each device will
only see part of the evaluation data. This means you will need to group your predictions together. This is very easy to
do with the [`~Accelerator.gather`] method.

@@ -134,8 +134,8 @@ for inputs, targets in validation_dataloader:

<Tip warning={true}>

Like for the training dataloader, passing your validation dataloader through
[`~Accelerator.prepare`] may change its: if you run on X GPUs, it will have its length divided by X
As for the training dataloader, passing your validation dataloader through
[`~Accelerator.prepare`] may change it: if you run on X GPUs, it will have its length divided by X
(since your actual batch size will be multiplied by X), unless you set `split_batches=True`.

Any instruction using your training dataloader length (for instance if you need the number of total training steps
@@ -159,7 +159,7 @@ PyTorch), they are fully compatible with 🤗 Accelerate. The only caveat here i
to determine all useful information, so `torch.distributed.launch` should be used with the flag `--use_env`.

🤗 Accelerate also provides a CLI tool that unifies all launcher, so you only have to remember one command. To use it,
just run
just run:

```bash
accelerate config
@@ -175,7 +175,7 @@ on your machine and reply to the questions asked. This will save a *default_conf

You can also specify with the flag `--config_file` the location of the file you want to save.

Once this is done, you can test everything is going well on your setup by running
Once this is done, you can test everything is going well on your setup by running:

```bash
accelerate test
@@ -235,14 +235,14 @@ step). This is why your first step of training will always be very long as build
optimizations takes some time.

The good news is that this compilation will be cached so the second step and all the following will be much faster. The
bas news is that it only applies if all of your steps do exactly the same operations, which implies:
bad news is that it only applies if all of your steps do exactly the same operations, which implies:

- having all tensors of the same length in all your lengths
- having static code (i.e., not a for loop of length that could change from step to step)

Having any of the things above change between two steps will trigger a new compilation which will, once again, take a
lot of time. In practice, that means you must take special care to have all your tensors in your inputs of the same
shape (so no dynamic padding for instance if you are in an NLP problem) and should not use layer with for loops that
shape (so no dynamic padding for instance if you are in an NLP problem) and should not use layers with for loops that
have different lengths depending on the inputs (such as an LSTM) or the training will be excruciatingly slow.

To introduce special behavior in your script for TPUs you can check the `distributed_type` of your
@@ -257,10 +257,10 @@ else:
# go crazy and be dynamic
```

The [NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py) shows an example in
The [NLP example](https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py) shows an example in a
situation with dynamic padding.

One last thing to pay close attnetion to: if your model has tied weights (such as language models which tie the weights
One last thing to pay close attention to: if your model has tied weights (such as language models which tie the weights
of the embedding matrix with the weights of the decoder), moving this model to the TPU (either yourself or after you
passed your model to [`~Accelerator.prepare`]) will break the tying. You will need to retie the weights
after. You can find an example of this in the [run_clm_no_trainer](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm.py) script in
@@ -317,8 +317,8 @@ following line in your code:
accelerator.wait_for_everyone()
```

This instruction will block all the processes that arrive them first until all the other processes have reached that
point (if you run your script on just one GPU or CPU, this wont' do anything).
This instruction will block all the processes that arrive first until all the other processes have reached that
point (if you run your script on just one GPU or CPU, this won't do anything).


### Saving/loading a model
@@ -338,7 +338,7 @@ unwrapped_model = accelerator.unwrap_model(model)
accelerator.save(unwrapped_model.state_dict(), filename)
```

If your script contains a logic to load checkpoint, we also recommend you load your weights in the unwrapped model
If your script contains logic to load a checkpoint, we also recommend you load your weights in the unwrapped model
(this is only useful if you use the load function after making your model go through
[`~Accelerator.prepare`]). Here is an example:

@@ -368,7 +368,7 @@ and `accelerator.clip_grad_value_` respectively.

### Mixed Precision training

If you are running your training in Mixed Precision with Accelerate, you will get the best result with your loss being
If you are running your training in Mixed Precision with 🤗 Accelerate, you will get the best result with your loss being
computed inside your model (like in Transformer models for instance). Every computation outside of the model will be
executed in full precision (which is generally what you want for loss computation, expecially if it involves a
softmax). However you might want to put your loss computation inside the *accelerator.autocast* context manager:
@@ -438,14 +438,14 @@ The random number generator synchronization will by default synchronize:
- the main random number generator in PyTorch <=1.5.1

You can choose which random number generator(s) to synchronize with the `rng_types` argument of the main
[`Accelerator`]. In PyTorch >= 1.6, it is recommended to rely on local `generator` to avoid
[`Accelerator`]. In PyTorch >= 1.6, it is recommended to rely on a local `generator` to avoid
setting the same seed in the main random number generator in all processes.

<Tip warning={true}>

Synchronization the main torch (or CUDA or XLA) random number generator will affect any other potential random
artifacts you could have in your dataset (like random data augmentation) in the sense all processes will get the
same random numbers from the torch random modules (so will apply the same random data augmentation if it's
Synchronization of the main torch (or CUDA or XLA) random number generator will affect any other potential random
artifacts you could have in your dataset (like random data augmentation) in the sense that all processes will get
the same random numbers from the torch random modules (so will apply the same random data augmentation if it's
controlled by torch).

</Tip>
@@ -457,4 +457,4 @@ The randomization part of your custom sampler, batch sampler or iterable dataset

</Tip>

See more details about the internal in the [Internals page](internal).
For more details about the internals, see the [Internals page](internal).

@@ -23,7 +23,7 @@ make it easier than ever to train Hugging Face Transformer models in [Amazon Sag
Before you can run your 🤗 Accelerate scripts on Amazon SageMaker you need to sign up for an AWS account. If you do not
have an AWS account yet learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).

After you have your AWS Account you need to install the `sagemaker` sdk for 🤗 Accelerate with.
After you have your AWS Account you need to install the `sagemaker` sdk for 🤗 Accelerate with:

```bash
pip install "accelerate[sagemaker]" --upgrade
@@ -31,7 +31,7 @@ pip install "accelerate[sagemaker]" --upgrade

🤗 Accelerate currently uses the 🤗 DLCs, with `transformers`, `datasets` and `tokenizers` pre-installed. 🤗
Accelerate is not in the DLC yet (will soon be added!) so to use it within Amazon SageMaker you need to create a
`requirements.txt` in the same directory where your training script is located and add it as dependency.
`requirements.txt` in the same directory where your training script is located and add it as dependency:

```
accelerate
@@ -43,7 +43,7 @@ You should also add any other dependencies you have to this `requirements.txt`.

### Configure 🤗 Accelerate

You can configure the launch configuration for Amazon SageMaker the same as you do for non SageMaker training jobs with
the 🤗 Accelerate CLI.
the 🤗 Accelerate CLI:

```bash
accelerate config
@@ -62,7 +62,7 @@ accelerate config

The training script is very similar to a training script you might run outside of SageMaker, but to save your model
after training you need to specify either `/opt/ml/model` or use `os.environ["SM_MODEL_DIR"]` as your save
directory. After training, artifacts in this directory are uploaded to S3.
directory. After training, artifacts in this directory are uploaded to S3:


```diff
@@ -79,7 +79,7 @@ specify type as bool in your script and provide an explicit True or False value

### Launch Training

You can launch your training with 🤗 Accelerate CLI with
You can launch your training with 🤗 Accelerate CLI with:

```
accelerate launch path_to_script.py --args_to_the_script

@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
# Tracking

There are a large number of experiment tracking API's available, however getting them all to work with in a multi-processing environment can oftentimes be complex.
Accelerate provides a general tracking API that can be used to log useful items during your script through [`~Accelerator.log`]
🤗 Accelerate provides a general tracking API that can be used to log useful items during your script through [`~Accelerator.log`]

## Integrated Trackers

91	docs/source/utilities.mdx	Normal file
@@ -0,0 +1,91 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Helpful Utilities

Below are a variety of utility functions that 🤗 Accelerate provides, broken down by use-case.

## Data Classes

These are basic dataclasses used throughout 🤗 Accelerate and they can be passed in as parameters.

[[autodoc]] utils.DistributedType

[[autodoc]] utils.LoggerType

[[autodoc]] utils.PrecisionType

## Data Manipulation and Operations

These include data operations that mimic the same `torch` ops but can be used on distributed processes.

[[autodoc]] utils.broadcast

[[autodoc]] utils.concatenate

[[autodoc]] utils.gather

[[autodoc]] utils.pad_across_processes

[[autodoc]] utils.reduce

[[autodoc]] utils.send_to_device

## Environment Checks

These functionalities check the state of the current working environment, including information about the operating system itself, what it can support, and if particular dependencies are installed.

[[autodoc]] utils.get_max_memory

[[autodoc]] utils.is_bf16_available

[[autodoc]] utils.is_torch_version

[[autodoc]] utils.is_tpu_available

## Environment Configuration

[[autodoc]] utils.write_basic_config

When setting up 🤗 Accelerate for the first time, rather than running `accelerate config`, [`~utils.write_basic_config`] can be used as an alternative for quick configuration, as sketched below.

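For instance, a minimal, hedged sketch (assuming a single-machine setup) might look like:

```python
# Write a default accelerate config file programmatically instead of answering
# the interactive `accelerate config` questionnaire.
from accelerate.utils import write_basic_config

write_basic_config(mixed_precision="fp16")  # the resulting file is then picked up by `accelerate launch`
```
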
## Modeling

These utilities relate to interacting with PyTorch models

[[autodoc]] utils.extract_model_from_parallel

[[autodoc]] utils.get_max_layer_size

[[autodoc]] utils.offload_state_dict


## Parallel

These include general utilities that should be used when working in parallel.

[[autodoc]] utils.extract_model_from_parallel

[[autodoc]] utils.save

[[autodoc]] utils.wait_for_everyone


## Random

These utilities relate to setting and synchronizing of all the random states.

[[autodoc]] utils.set_seed

[[autodoc]] utils.synchronize_rng_state

[[autodoc]] utils.synchronize_rng_states

@@ -23,7 +23,7 @@ The [nlp_example.py](./nlp_example.py) script is a simple example to train a Ber
Prior to running it you should install 🤗 Dataset and 🤗 Transformers:

```bash
pip install datasets transformers
pip install datasets evaluate transformers
```

The same script can be run in any of the following configurations:

@@ -42,6 +42,18 @@ These arguments should be added at the end of any method for starting the python
accelerate launch ./checkpointing.py --checkpointing_steps epoch output_dir "checkpointing_tutorial" --resume_from_checkpoint "checkpointing_tutorial/epoch_0"
```

### Cross Validation (`cross_validation.py`)

- Shows how to use `Accelerator.free_memory` and run cross validation efficiently with `datasets`.
- Arguments available:
  - `num_folds`, the number of folds the training dataset should be split into.

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:

```bash
accelerate launch ./cross_validation.py --num_folds 2
```

### Experiment Tracking (`tracking.py`)

- Shows how to use `Accelerate.init_trackers` and `Accelerator.log`
@@ -55,14 +67,14 @@ These arguments should be added at the end of any method for starting the python
accelerate launch ./tracking.py --with_tracking
```

### Cross Validation (`cross_validation.py`)
### Gradient Accumulation (`gradient_accumulation.py`)

- Shows how to use `Accelerator.free_memory` and run cross validation efficiently with `datasets`.
- Shows how to use `Accelerator.no_sync` to prevent gradient averaging in a distributed setup.
- Arguments available:
  - `num_folds`, the number of folds the training dataset should be split into.
  - `gradient_accumulation_steps`, the number of steps to perform before the gradients are accumulated and the optimizer and scheduler are stepped + zero_grad

These arguments should be added at the end of any method for starting the python script (such as `python`, `accelerate launch`, `python -m torch.distributed.launch`), such as:

```bash
accelerate launch ./cross_validation.py --num_folds 2
```
accelerate launch ./gradient_accumulation.py --gradient_accumulation_steps 5
```

@@ -16,17 +16,13 @@ import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

import evaluate
from accelerate import Accelerator, DistributedType
from datasets import load_dataset, load_metric
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed


########################################################################
@@ -103,13 +99,19 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
    return train_dataloader, eval_dataloader


# For testing only
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
    from accelerate.test_utils.training import mocked_dataloaders

    get_dataloaders = mocked_dataloaders  # noqa: F811


def training_function(config, args):
    # Initialize accelerator
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    correct_bias = config["correct_bias"]
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

@@ -130,11 +132,11 @@ def training_function(config, args):
    set_seed(seed)

    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
    metric = load_metric("glue", "mrpc")
    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE:
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

@@ -147,7 +149,7 @@ def training_function(config, args):
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
@@ -289,7 +291,7 @@ def main():
        help="If the training should continue from a checkpoint folder.",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)

@@ -17,21 +17,17 @@ from typing import List

import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

import evaluate
from accelerate import Accelerator, DistributedType
from datasets import DatasetDict, load_dataset, load_metric
from datasets import DatasetDict, load_dataset

# New Code #
# We'll be using StratifiedKFold for this example
from sklearn.model_selection import StratifiedKFold
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed


########################################################################
@@ -129,7 +125,6 @@ def get_fold_dataloaders(

def training_function(config, args):
    # New Code #
    test_labels = None
    test_predictions = []
    # Download the dataset
    datasets = load_dataset("glue", "mrpc")
@@ -140,15 +135,14 @@ def training_function(config, args):
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    correct_bias = config["correct_bias"]
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    metric = load_metric("glue", "mrpc")
    metric = evaluate.load("glue", "mrpc")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if batch_size > MAX_GPU_BATCH_SIZE:
    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

@@ -157,17 +151,15 @@ def training_function(config, args):
    # New Code #
    # Create our folds:
    folds = kfold.split(np.zeros(datasets["train"].num_rows), datasets["train"]["label"])

    test_references = []
    # Iterate over them
    for train_idxs, valid_idxs in folds:
    for i, (train_idxs, valid_idxs) in enumerate(folds):
        train_dataloader, eval_dataloader, test_dataloader = get_fold_dataloaders(
            accelerator,
            datasets,
            train_idxs,
            valid_idxs,
        )
        if test_labels is None:
            test_labels = datasets["validation"]["label"]
        # Instantiate the model (we build the model here so that the seed also control new weights initialization)
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)

@@ -177,7 +169,7 @@ def training_function(config, args):
        model = model.to(accelerator.device)

        # Instantiate optimizer
        optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
        optimizer = AdamW(params=model.parameters(), lr=lr)

        # Instantiate scheduler
        lr_scheduler = get_linear_schedule_with_warmup(
@@ -236,19 +228,18 @@ def training_function(config, args):
            predictions = outputs.logits
            predictions, references = accelerator.gather((predictions, batch["labels"]))
            fold_predictions.append(predictions.cpu())
            metric.add_batch(
                predictions=predictions.argmax(dim=-1),
                references=references,
            )
        test_metric = metric.compute()
            if i == 0:
                # We need all of the test predictions
                test_references.append(references.cpu())
        # Use accelerator.print to print only on the main process.
        test_predictions.append(torch.cat(fold_predictions, dim=0))
        # We now need to release all our memory and get rid of the current model, optimizer, etc
        accelerator.free_memory()
    # New Code #
    # Finally we check the accuracy of our folded results:
    preds = torch.stack(test_predictions, dim=0).sum(dim=0).div(int(config["n_splits"])).argmax(dim=-1)
    test_metric = metric.compute(predictions=preds, references=test_labels)
    test_references = torch.cat(test_references, dim=0)
    preds = torch.stack(test_predictions, dim=0).sum(dim=0).div(int(args.num_folds)).argmax(dim=-1)
    test_metric = metric.compute(predictions=preds, references=test_references)
    accelerator.print("Average test metrics from all folds:", test_metric)


@@ -267,7 +258,7 @@ def main():
    # New Code #
    parser.add_argument("--num_folds", type=int, default=3, help="The number of splits to perform across the dataset")
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)

examples/by_feature/deepspeed_with_config_support.py (new executable file, 736 lines)
@@ -0,0 +1,736 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
|
||||
on a text file or a dataset without using HuggingFace Trainer.
|
||||
|
||||
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
|
||||
https://huggingface.co/models?filter=text-generation
|
||||
"""
|
||||
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import datasets
|
||||
import transformers
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.logging import get_logger
|
||||
from accelerate.utils import DummyOptim, DummyScheduler, set_seed
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import Repository
|
||||
from tqdm.auto import tqdm
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_MAPPING,
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
SchedulerType,
|
||||
default_data_collator,
|
||||
get_scheduler,
|
||||
)
|
||||
from transformers.utils import get_full_repo_name
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The configuration name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_file", type=str, default=None, help="A csv or a json file containing the training data."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--validation_split_percentage",
|
||||
default=5,
|
||||
help="The percentage of the train set used as validation set in case there's no validation split",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
type=str,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
required=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained config name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_slow_tokenizer",
|
||||
action="store_true",
|
||||
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_train_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the training dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_device_eval_batch_size",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Batch size (per device) for the evaluation dataloader.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
default=5e-5,
|
||||
help="Initial learning rate (after the potential warmup period) to use.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
|
||||
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
|
||||
parser.add_argument(
|
||||
"--max_train_steps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr_scheduler_type",
|
||||
type=SchedulerType,
|
||||
default="linear",
|
||||
help="The scheduler type to use.",
|
||||
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
|
||||
)
|
||||
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
|
||||
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Model type to use if training from scratch.",
|
||||
choices=MODEL_TYPES,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_size",
|
||||
type=int,
|
||||
default=None,
|
||||
help=(
|
||||
"Optional input sequence length after tokenization. The training dataset will be truncated in block of"
|
||||
" this size for training. Default to the model max input length for single sentence inputs (take into"
|
||||
" account special tokens)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preprocessing_num_workers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The number of processes to use for the preprocessing.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
|
||||
)
|
||||
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
||||
parser.add_argument(
|
||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||
)
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
# New Code #
|
||||
# Whether to load the best model at the end of training
|
||||
parser.add_argument(
|
||||
"--load_best_model",
|
||||
action="store_true",
|
||||
help="Whether to load the best model at the end of training",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to enable experiment trackers for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--report_to",
|
||||
type=str,
|
||||
default="all",
|
||||
help=(
|
||||
'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
|
||||
' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
|
||||
"Only applicable when `--with_tracking` is passed."
|
||||
),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Sanity checks
|
||||
if args.dataset_name is None and args.train_file is None and args.validation_file is None:
|
||||
raise ValueError("Need either a dataset name or a training/validation file.")
|
||||
else:
|
||||
if args.train_file is not None:
|
||||
extension = args.train_file.split(".")[-1]
|
||||
assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
|
||||
if args.validation_file is not None:
|
||||
extension = args.validation_file.split(".")[-1]
|
||||
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
|
||||
|
||||
if args.push_to_hub:
|
||||
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
|
||||
|
||||
return args
|
||||
|
||||
|
||||
# New Code #
|
||||
def checkpoint_model(checkpoint_folder, ckpt_id, model, epoch, last_global_step, **kwargs):
|
||||
"""Utility function for checkpointing model + optimizer dictionaries
|
||||
The main purpose for this is to be able to resume training from that instant again
|
||||
"""
|
||||
checkpoint_state_dict = {
|
||||
"epoch": epoch,
|
||||
"last_global_step": last_global_step,
|
||||
}
|
||||
# Add extra kwargs too
|
||||
checkpoint_state_dict.update(kwargs)
|
||||
|
||||
success = model.save_checkpoint(checkpoint_folder, ckpt_id, checkpoint_state_dict)
|
||||
status_msg = f"checkpointing: checkpoint_folder={checkpoint_folder}, ckpt_id={ckpt_id}"
|
||||
if success:
|
||||
logging.info(f"Success {status_msg}")
|
||||
else:
|
||||
logging.warning(f"Failure {status_msg}")
|
||||
return
|
||||
|
||||
|
||||
# New Code #
|
||||
def load_training_checkpoint(model, load_dir, tag=None, **kwargs):
|
||||
"""Utility function for checkpointing model + optimizer dictionaries
|
||||
The main purpose for this is to be able to resume training from that instant again
|
||||
"""
|
||||
_, checkpoint_state_dict = model.load_checkpoint(load_dir, tag=tag, **kwargs)
|
||||
epoch = checkpoint_state_dict["epoch"]
|
||||
last_global_step = checkpoint_state_dict["last_global_step"]
|
||||
del checkpoint_state_dict
|
||||
return (epoch, last_global_step)
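# Hedged usage sketch for the two helpers above: `engine` is assumed to be the DeepSpeed-wrapped
# model returned by accelerator.prepare(), since only the DeepSpeed engine exposes
# save_checkpoint/load_checkpoint; the folder name below is hypothetical.
def _checkpoint_roundtrip_sketch(engine, epoch, completed_steps):
    checkpoint_model("outputs/ds_ckpts", ckpt_id=epoch, model=engine, epoch=epoch, last_global_step=completed_steps)
    resumed_epoch, resumed_step = load_training_checkpoint(engine, "outputs/ds_ckpts", tag=str(epoch))
    return resumed_epoch, resumed_step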
|
||||
|
||||
|
||||
# New Code #
|
||||
def evaluate(args, model, eval_dataloader, accelerator, eval_dataset):
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
|
||||
loss = outputs.loss
|
||||
losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
|
||||
|
||||
losses = torch.cat(losses)
|
||||
losses = losses[: len(eval_dataset)]
|
||||
try:
|
||||
eval_loss = torch.mean(losses)
|
||||
perplexity = math.exp(eval_loss)
|
||||
except OverflowError:
|
||||
perplexity = float("inf")
|
||||
return perplexity, eval_loss
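# Quick numeric illustration of the relation used above: perplexity is exp(mean token loss).
# The loss value is made up for the example.
import math

example_eval_loss = 2.0
example_perplexity = math.exp(example_eval_loss)  # ≈ 7.39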
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger.info(accelerator.state, main_process_only=False)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# If passed along, set the training seed now.
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
|
||||
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||
# (the dataset will be downloaded automatically from the datasets Hub).
|
||||
#
|
||||
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
||||
# 'text' is found. You can easily tweak this behavior (see below).
|
||||
#
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
dataset_args = {}
|
||||
if args.train_file is not None:
|
||||
data_files["train"] = args.train_file
|
||||
if args.validation_file is not None:
|
||||
data_files["validation"] = args.validation_file
|
||||
extension = args.train_file.split(".")[-1]
|
||||
if extension == "txt":
|
||||
extension = "text"
|
||||
dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
|
||||
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
extension,
|
||||
data_files=data_files,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
**dataset_args,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
extension,
|
||||
data_files=data_files,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
**dataset_args,
|
||||
)
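# Illustration of the split strings built above: with the default validation_split_percentage
# of 5, load_dataset receives the following slice expressions (the value shown is an assumption).
validation_split_percentage = 5
val_split = f"train[:{validation_split_percentage}%]"    # "train[:5%]"  -> used as the validation split
train_split = f"train[{validation_split_percentage}%:]"  # "train[5%:]"  -> used as the training split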
|
||||
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
if args.config_name:
|
||||
config = AutoConfig.from_pretrained(args.config_name)
|
||||
elif args.model_name_or_path:
|
||||
config = AutoConfig.from_pretrained(args.model_name_or_path)
|
||||
else:
|
||||
config = CONFIG_MAPPING[args.model_type]()
|
||||
logger.warning("You are instantiating a new config instance from scratch.")
|
||||
|
||||
if args.tokenizer_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
|
||||
elif args.model_name_or_path:
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
|
||||
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
||||
)
|
||||
|
||||
if args.model_name_or_path:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
)
|
||||
else:
|
||||
logger.info("Training new model from scratch")
|
||||
model = AutoModelForCausalLM.from_config(config)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# First we tokenize all the texts.
|
||||
column_names = raw_datasets["train"].column_names
|
||||
text_column_name = "text" if "text" in column_names else column_names[0]
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples[text_column_name])
|
||||
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
|
||||
if args.block_size is None:
|
||||
block_size = tokenizer.model_max_length
|
||||
if block_size > 1024:
|
||||
logger.warning(
|
||||
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
|
||||
"Picking 1024 instead. You can change that default value by passing --block_size xxx."
|
||||
)
|
||||
block_size = 1024
|
||||
else:
|
||||
if args.block_size > tokenizer.model_max_length:
|
||||
logger.warning(
|
||||
f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
|
||||
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
|
||||
)
|
||||
block_size = min(args.block_size, tokenizer.model_max_length)
|
||||
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder; we could pad instead of dropping if the model supported it. You can
|
||||
# customize this part to your needs.
|
||||
if total_length >= block_size:
|
||||
total_length = (total_length // block_size) * block_size
|
||||
# Split by chunks of max_len.
|
||||
result = {
|
||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||
for k, t in concatenated_examples.items()
|
||||
}
|
||||
result["labels"] = result["input_ids"].copy()
|
||||
return result
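# Tiny worked example of the chunking above (values are made up): with block_size = 4 and
# 10 concatenated tokens, the 2-token remainder is dropped and two full blocks are kept.
example_block_size = 4
example_ids = list(range(10))                                             # stand-in token ids 0..9
usable = (len(example_ids) // example_block_size) * example_block_size    # 8
example_chunks = [example_ids[i : i + example_block_size] for i in range(0, usable, example_block_size)]
# example_chunks == [[0, 1, 2, 3], [4, 5, 6, 7]]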
|
||||
|
||||
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
|
||||
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
|
||||
# to preprocess.
|
||||
#
|
||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||
|
||||
with accelerator.main_process_first():
|
||||
lm_datasets = tokenized_datasets.map(
|
||||
group_texts,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc=f"Grouping texts in chunks of {block_size}",
|
||||
)
|
||||
|
||||
train_dataset = lm_datasets["train"]
|
||||
eval_dataset = lm_datasets["validation"]
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
|
||||
# DataLoaders creation:
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
|
||||
)
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
# New Code #
|
||||
# Creates a DummyOptim placeholder if `optimizer` is specified in the DeepSpeed config file, else creates a standard AdamW optimizer
|
||||
optimizer_cls = (
|
||||
torch.optim.AdamW
|
||||
if accelerator.state.deepspeed_plugin is None
|
||||
or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
|
||||
else DummyOptim
|
||||
)
|
||||
optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||
|
||||
# On TPU, the tied weights in our model have been disconnected, so we need to restore the ties.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
model.tie_weights()
|
||||
|
||||
# Scheduler and math around the number of training steps.
|
||||
|
||||
# New Code #
|
||||
# Get gradient accumulation steps from deepspeed config if available
|
||||
if accelerator.state.deepspeed_plugin is not None:
|
||||
args.gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
|
||||
"gradient_accumulation_steps"
|
||||
]
|
||||
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
if args.max_train_steps is None:
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
else:
|
||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||
|
||||
# New Code #
|
||||
# Creates a DummyScheduler placeholder if `scheduler` is specified in the DeepSpeed config file, else creates an `args.lr_scheduler_type` scheduler
|
||||
if (
|
||||
accelerator.state.deepspeed_plugin is None
|
||||
or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
|
||||
):
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
)
|
||||
else:
|
||||
lr_scheduler = DummyScheduler(
|
||||
optimizer, total_num_steps=args.max_train_steps, warmup_num_steps=args.num_warmup_steps
|
||||
)
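# Condensed sketch of the selection logic above, assuming `accelerator` is the object created
# earlier: the Dummy classes are placeholders that let DeepSpeed build the real optimizer and
# scheduler from its own config, so they are only used when the config defines those sections.
ds_plugin = accelerator.state.deepspeed_plugin
ds_config = ds_plugin.deepspeed_config if ds_plugin is not None else {}
uses_ds_optimizer = "optimizer" in ds_config   # True -> DummyOptim instead of torch.optim.AdamW
uses_ds_scheduler = "scheduler" in ds_config   # True -> DummyScheduler instead of get_scheduler(...)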
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||
|
||||
# Figure out how many steps we should save the Accelerator states
|
||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
if args.checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(args.checkpointing_steps)
|
||||
else:
|
||||
checkpointing_steps = None
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration.
|
||||
# We initialize the trackers only on main process because `accelerator.log`
|
||||
# only logs on main process and we don't want empty logs/runs on other processes.
|
||||
if args.with_tracking:
|
||||
if accelerator.is_main_process:
|
||||
experiment_config = vars(args)
|
||||
# TensorBoard cannot log Enums, need the raw value
|
||||
experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
|
||||
accelerator.init_trackers("clm_no_trainer", experiment_config)
|
||||
|
||||
# Train!
|
||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
logger.info(f" Num Epochs = {args.num_train_epochs}")
|
||||
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
|
||||
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
|
||||
logger.info(f" Total optimization steps = {args.max_train_steps}")
|
||||
# Only show the progress bar once on each machine.
|
||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||
completed_steps = 0
|
||||
starting_epoch = 0
|
||||
best_metric = None
|
||||
best_metric_checkpoint = None
|
||||
|
||||
# Potentially load in the weights and states from a previous save
|
||||
if args.resume_from_checkpoint:
|
||||
# New Code #
|
||||
# Loads the DeepSpeed checkpoint from the specified path
|
||||
_, last_global_step = load_training_checkpoint(
|
||||
model,
|
||||
args.resume_from_checkpoint,
|
||||
**{"load_optimizer_states": True, "load_lr_scheduler_states": True},
|
||||
)
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
resume_step = last_global_step
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
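# Worked example of the arithmetic above (numbers are made up): with 500 batches per epoch
# and a checkpoint saved at global step 1250, training resumes in epoch 2 at batch 250.
example_steps_per_epoch = 500
example_last_global_step = 1250
example_starting_epoch = example_last_global_step // example_steps_per_epoch                        # 2
example_resume_step = example_last_global_step - example_starting_epoch * example_steps_per_epoch   # 250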
|
||||
|
||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
if isinstance(checkpointing_steps, int):
|
||||
if completed_steps % checkpointing_steps == 0:
|
||||
output_dir = f"step_{completed_steps }"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
if completed_steps >= args.max_train_steps:
|
||||
break
|
||||
|
||||
perplexity, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
|
||||
logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"perplexity": perplexity,
|
||||
"eval_loss": eval_loss,
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
"epoch": epoch,
|
||||
"step": completed_steps,
|
||||
},
|
||||
step=completed_steps,
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# Save the DeepSpeed checkpoint to the specified path
|
||||
checkpoint_model(args.output_dir, epoch, model, epoch, completed_steps)
|
||||
|
||||
# New Code #
|
||||
# Tracks the best checkpoint and best metric
|
||||
if best_metric is None or best_metric > perplexity:
|
||||
best_metric = perplexity
|
||||
best_metric_checkpoint = os.path.join(args.output_dir, str(epoch))
|
||||
accelerator.print(f"New best metric: {best_metric} at epoch {epoch}")
|
||||
accelerator.print(f"best_metric_checkpoint: {best_metric_checkpoint}")
|
||||
|
||||
# New Code #
|
||||
# Loads the best checkpoint after the training is finished
|
||||
if args.load_best_model:
|
||||
_, last_global_step = load_training_checkpoint(
|
||||
model,
|
||||
"/".join(best_metric_checkpoint.split("/")[:-1]),
|
||||
tag=best_metric_checkpoint.split("/")[-1],
|
||||
**{"load_optimizer_states": True, "load_lr_scheduler_states": True},
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# Evaluates using the best checkpoint
|
||||
perplexity, eval_loss = evaluate(args, model, eval_dataloader, accelerator, eval_dataset)
|
||||
logger.info(f"Best model metrics: perplexity: {perplexity} eval_loss: {eval_loss}")
|
||||
if perplexity != best_metric:
|
||||
raise AssertionError(
|
||||
f"Best metric {best_metric} does not match the metric {perplexity} of the loaded best model."
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
|
||||
# New Code #
|
||||
# Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
|
||||
# `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
|
||||
# `zero3_save_16bit_model` is True in DeepSpeed Plugin.
|
||||
# For Zero Stages 1 and 2, models are saved as usual in the output directory.
|
||||
# The model name saved is `pytorch_model.bin`
|
||||
unwrapped_model.save_pretrained(
|
||||
args.output_dir,
|
||||
is_main_process=accelerator.is_main_process,
|
||||
save_function=accelerator.save,
|
||||
state_dict=accelerator.get_state_dict(model),
|
||||
)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
|
||||
json.dump({"perplexity": perplexity, "eval_loss": eval_loss.item()}, f)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
examples/by_feature/fsdp_with_peak_mem_tracking.py (new file, 387 lines)
@@ -0,0 +1,387 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
# - FSDP
|
||||
#
|
||||
# This example also demonstrates the checkpointing and sharding capabilities
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
# New Code #
|
||||
# Converting Bytes to Megabytes
|
||||
def b2mb(x):
|
||||
return int(x / 2**20)
|
||||
|
||||
|
||||
# New Code #
|
||||
# This context manager is used to track the peak memory usage of the process
|
||||
class TorchTracemalloc:
|
||||
def __enter__(self):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero
|
||||
self.begin = torch.cuda.memory_allocated()
|
||||
return self
|
||||
|
||||
def __exit__(self, *exc):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
self.end = torch.cuda.memory_allocated()
|
||||
self.peak = torch.cuda.max_memory_allocated()
|
||||
self.used = b2mb(self.end - self.begin)
|
||||
self.peaked = b2mb(self.peak - self.begin)
|
||||
# print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
if args.with_tracking:
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, log_with="wandb", logging_dir=args.logging_dir
|
||||
)
|
||||
else:
|
||||
accelerator = Accelerator()
|
||||
accelerator.print(accelerator.distributed_type)
|
||||
|
||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||
if args.checkpointing_steps == "epoch":
|
||||
checkpointing_steps = args.checkpointing_steps
|
||||
elif args.checkpointing_steps.isdigit():
|
||||
checkpointing_steps = int(args.checkpointing_steps)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
|
||||
)
|
||||
else:
|
||||
checkpointing_steps = None
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration
|
||||
if args.with_tracking:
|
||||
if accelerator.is_main_process:
|
||||
experiment_config = vars(args)
|
||||
accelerator.init_trackers("fsdp_glue_no_trainer", experiment_config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
set_seed(seed)
|
||||
|
||||
# Instantiate the model (we build the model here so that the seed also controls the new weight initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, return_dict=True)
|
||||
# New Code #
|
||||
# For the FSDP feature, it is highly recommended and more efficient to prepare the model before creating the optimizer
|
||||
model = accelerator.prepare(model)
|
||||
accelerator.print(model)
|
||||
|
||||
# Instantiate optimizer
|
||||
# New Code #
|
||||
# FSDP does not currently support multiple parameter groups,
|
||||
# so we need to create a single parameter group for the whole model
|
||||
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=2e-4)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=10,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# For FSDP feature, prepare everything except the model as we have already prepared the model
|
||||
# before creating the optimizer
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
overall_step = 0
|
||||
|
||||
# Potentially load in the weights and states from a previous save
|
||||
if args.resume_from_checkpoint:
|
||||
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||
accelerator.load_state(args.resume_from_checkpoint)
|
||||
path = os.path.basename(args.resume_from_checkpoint)
|
||||
else:
|
||||
# Get the most recent checkpoint
|
||||
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||
dirs.sort(key=os.path.getctime)
|
||||
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||
# Extract `epoch_{i}` or `step_{i}`
|
||||
training_difference = os.path.splitext(path)[0]
|
||||
|
||||
if "epoch" in training_difference:
|
||||
num_epochs -= int(training_difference.replace("epoch_", ""))
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
num_epochs -= resume_step // len(train_dataloader)
|
||||
# If resuming by step, we also need to know exactly how far into the DataLoader we went
|
||||
resume_step = (num_epochs * len(train_dataloader)) - resume_step
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
# New Code #
|
||||
# context manager to track the peak memory usage during the training epoch
|
||||
with TorchTracemalloc() as tracemalloc:
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
total_loss = 0
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == 0:
|
||||
if resume_step is not None and step < resume_step:
|
||||
pass
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss = loss / gradient_accumulation_steps
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
accelerator.backward(loss)
|
||||
if step % gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
# accelerator.print(lr_scheduler.get_lr())
|
||||
|
||||
overall_step += 1
|
||||
|
||||
if isinstance(checkpointing_steps, int):
|
||||
output_dir = f"step_{overall_step}"
|
||||
if overall_step % checkpointing_steps == 0:
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
# New Code #
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print("Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
|
||||
accelerator.print("Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
|
||||
accelerator.print("Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
|
||||
accelerator.print(
|
||||
"Total Peak Memory consumed during the train (max): {}".format(
|
||||
tracemalloc.peaked + b2mb(tracemalloc.begin)
|
||||
)
|
||||
)
|
||||
# Logging the peak memory usage of the GPU to the tracker
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"train_total_peak_memory": tracemalloc.peaked + b2mb(tracemalloc.begin),
|
||||
},
|
||||
step=epoch,
|
||||
)
|
||||
|
||||
# New Code #
|
||||
# context manager to track the peak memory usage during the evaluation
|
||||
with TorchTracemalloc() as tracemalloc:
|
||||
model.eval()
|
||||
samples_seen = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
# It is slightly faster to call this once, than multiple times
|
||||
predictions, references = accelerator.gather(
|
||||
(predictions, batch["labels"])
|
||||
) # If we are in a multiprocess environment, the last batch has duplicates
|
||||
if accelerator.use_distributed:
|
||||
if step == len(eval_dataloader) - 1:
|
||||
predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
|
||||
references = references[: len(eval_dataloader.dataset) - samples_seen]
|
||||
else:
|
||||
samples_seen += references.shape[0]
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"accuracy": eval_metric["accuracy"],
|
||||
"f1": eval_metric["f1"],
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
},
|
||||
step=epoch,
|
||||
)
|
||||
|
||||
if checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
if args.output_dir is not None:
|
||||
output_dir = os.path.join(args.output_dir, output_dir)
|
||||
accelerator.save_state(output_dir)
|
||||
# New Code #
|
||||
# Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
|
||||
accelerator.print("Memory before entering the eval : {}".format(b2mb(tracemalloc.begin)))
|
||||
accelerator.print("Memory consumed at the end of the eval (end-begin): {}".format(tracemalloc.used))
|
||||
accelerator.print("Peak Memory consumed during the eval (max-begin): {}".format(tracemalloc.peaked))
|
||||
accelerator.print(
|
||||
"Total Peak Memory consumed during the eval (max): {}".format(tracemalloc.peaked + b2mb(tracemalloc.begin))
|
||||
)
|
||||
# Logging the peak memory usage of the GPU to the tracker
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"eval_total_peak_memory": tracemalloc.peaked + b2mb(tracemalloc.begin),
|
||||
},
|
||||
step=epoch,
|
||||
)
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator.end_training()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
parser.add_argument(
|
||||
"--checkpointing_steps",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume_from_checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="If the training should continue from a checkpoint folder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with_tracking",
|
||||
action="store_true",
|
||||
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
default=".",
|
||||
help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging_dir",
|
||||
type=str,
|
||||
default="logs",
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
type=str,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
required=True,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 1, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
examples/by_feature/gradient_accumulation.py (new file, 210 lines)
@@ -0,0 +1,210 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
# This is a fully working simple example to use Accelerate
|
||||
# and perform gradient accumulation
|
||||
#
|
||||
# This example trains a Bert base model on GLUE MRPC
|
||||
# in any of the following settings (with the same script):
|
||||
# - single CPU or single GPU
|
||||
# - multi GPUS (using PyTorch distributed mode)
|
||||
# - (multi) TPUs
|
||||
# - fp16 (mixed-precision) or fp32 (normal precision)
|
||||
#
|
||||
# To run it in each of these various modes, follow the instructions
|
||||
# in the readme for examples:
|
||||
# https://github.com/huggingface/accelerate/tree/main/examples
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
MAX_GPU_BATCH_SIZE = 16
|
||||
EVAL_BATCH_SIZE = 32
|
||||
|
||||
|
||||
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
"""
|
||||
Creates a set of `DataLoader`s for the `glue` dataset,
|
||||
using "bert-base-cased" as the tokenizer.
|
||||
|
||||
Args:
|
||||
accelerator (`Accelerator`):
|
||||
An `Accelerator` object
|
||||
batch_size (`int`, *optional*):
|
||||
The batch size for the train and validation DataLoaders.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["idx", "sentence1", "sentence2"],
|
||||
)
|
||||
|
||||
# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
|
||||
# transformers library
|
||||
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(
|
||||
tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(
|
||||
tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
|
||||
)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# New Code #
|
||||
gradient_accumulation_steps = int(args.gradient_accumulation_steps)
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(
|
||||
cpu=args.cpu, mixed_precision=args.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps
|
||||
)
|
||||
if accelerator.distributed_type == DistributedType.TPU and gradient_accumulation_steps > 1:
|
||||
raise NotImplementedError(
|
||||
"Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
|
||||
)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
set_seed(seed)
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
# Instantiate the model (we build the model here so that the seed also controls the new weight initialization)
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
|
||||
|
||||
# We could avoid this line since the accelerator is set with `device_placement=True` (default value).
|
||||
# Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
|
||||
# creation otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=100,
|
||||
num_training_steps=(len(train_dataloader) * num_epochs),
|
||||
)
|
||||
|
||||
# Prepare everything
|
||||
# There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
|
||||
# prepare method.
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
model.train()
|
||||
for step, batch in enumerate(train_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
# New code #
|
||||
# We use the new `accumulate` context manager to perform gradient accumulation (a sketch of the manual pattern it replaces follows this function)
|
||||
# Gradient accumulation is also not currently supported on TPUs, nor advised, as bugs were found on the XLA side when running our tests.
|
||||
with accelerator.accumulate(model):
|
||||
output = model(**batch)
|
||||
loss = output.loss
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
model.eval()
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
with torch.no_grad():
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
)
|
||||
|
||||
eval_metric = metric.compute()
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple example of training script.")
|
||||
parser.add_argument(
|
||||
"--mixed_precision",
|
||||
type=str,
|
||||
default="no",
|
||||
choices=["no", "fp16", "bf16"],
|
||||
help="Whether to use mixed precision. Choose"
|
||||
"between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
|
||||
"and an Nvidia Ampere GPU.",
|
||||
)
|
||||
# New Code #
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="The number of minibatches to be ran before gradients are accumulated.",
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
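For reference, a minimal self-contained sketch of the `accumulate` pattern used above, with a toy model and random data; the model, dataset and the step count of 4 are placeholders rather than part of the example.

import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator

# Toy setup: a linear classifier on random data, purely to isolate the pattern.
accelerator = Accelerator(gradient_accumulation_steps=4)
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
dataloader = DataLoader(dataset, batch_size=8)

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for inputs, labels in dataloader:
    # Gradients are only synchronized and applied every 4 micro-batches.
    with accelerator.accumulate(model):
        loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()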
@ -12,22 +12,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator, DistributedType
|
||||
|
||||
# New Code #
|
||||
from accelerate.memory_utils import find_executable_batch_size
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from accelerate.utils import find_executable_batch_size
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
@ -103,21 +99,27 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
@ -131,7 +133,7 @@ def training_function(config, args):
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# New Code #
|
||||
# We now can define an inner training loop function. It should take a batch size as the only parameter,
|
||||
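A sketch of the inner-training-loop pattern that comment introduces, using `find_executable_batch_size` from `accelerate.utils` (it retries with a smaller batch size whenever a CUDA out-of-memory error is raised); `get_dataloaders` and the rest of the loop body are assumed from the surrounding example.

from accelerate import Accelerator
from accelerate.utils import find_executable_batch_size


def training_function(config, args):
    accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)

    @find_executable_batch_size(starting_batch_size=int(config["batch_size"]))
    def inner_training_loop(batch_size):
        nonlocal accelerator  # reuse the same Accelerator across retries
        accelerator.free_memory()  # drop references left over from a failed attempt
        train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
        # ... build the model and optimizer, call accelerator.prepare(...), then train and evaluate ...

    inner_training_loop()  # called without arguments; the decorator supplies batch_size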
@ -210,7 +212,7 @@ def main():
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
|
||||
@ -13,19 +13,16 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
@ -104,21 +101,27 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
@ -133,7 +136,7 @@ def training_function(config, args):
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
@ -175,9 +178,9 @@ def training_function(config, args):
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
# New Code #
|
||||
# First we check if it's a distributed system
|
||||
if accelerator.num_processes > 1:
|
||||
if accelerator.use_distributed:
|
||||
# Then see if we're on the last batch of our eval dataloader
|
||||
if step == len(eval_dataloader):
|
||||
if step == len(eval_dataloader) - 1:
|
||||
# Last batch needs to be truncated on distributed systems as it contains additional samples
|
||||
predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
|
||||
references = references[: len(eval_dataloader.dataset) - samples_seen]
|
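The bookkeeping in this hunk exists because distributed samplers pad the last batch with duplicated samples; below is a condensed sketch of the evaluation loop, assuming `accelerator`, `model`, `metric` and `eval_dataloader` from the surrounding example are in scope.

samples_seen = 0
model.eval()
for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        predictions = model(**batch).logits.argmax(dim=-1)
    predictions, references = accelerator.gather((predictions, batch["labels"]))
    if accelerator.use_distributed and step == len(eval_dataloader) - 1:
        # Drop the duplicated samples padded onto the final batch.
        predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
        references = references[: len(eval_dataloader.dataset) - samples_seen]
    else:
        samples_seen += references.shape[0]
    metric.add_batch(predictions=predictions, references=references)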
||||
@ -207,7 +210,7 @@ def main():
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
|
||||
@ -13,24 +13,16 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
@ -107,6 +99,13 @@ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
# For testing only
|
||||
if os.environ.get("TESTING_MOCKED_DATALOADERS", None) == "1":
|
||||
from accelerate.test_utils.training import mocked_dataloaders
|
||||
|
||||
get_dataloaders = mocked_dataloaders # noqa: F811
|
||||
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize Accelerator
|
||||
|
||||
@ -123,17 +122,16 @@ def training_function(config, args):
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
set_seed(seed)
|
||||
|
||||
train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
|
||||
metric = load_metric("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
@ -146,7 +144,7 @@ def training_function(config, args):
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
@ -165,10 +163,9 @@ def training_function(config, args):
|
||||
# New Code #
|
||||
# We need to initialize the trackers we use. Overall configurations can also be stored
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
if accelerator.is_main_process:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
# Now we train the model
|
||||
for epoch in range(num_epochs):
|
||||
@ -212,15 +209,16 @@ def training_function(config, args):
|
||||
|
||||
# New Code #
|
||||
# To actually log, we call `Accelerator.log`
|
||||
# The values passed can be of `str`, `int`, or `float`
|
||||
# The values passed can be of `str`, `int`, `float` or `dict` of `str` to `float`/`int`
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{
|
||||
"accuracy": eval_metric["accuracy"],
|
||||
"f1": eval_metric["f1"],
|
||||
"train_loss": total_loss,
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
"epoch": epoch,
|
||||
}
|
||||
},
|
||||
step=epoch,
|
||||
)
|
||||
|
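A condensed sketch of the tracking flow shown in this hunk; the backend name, logging directory, run name and loss value are placeholders.

from accelerate import Accelerator

accelerator = Accelerator(log_with="tensorboard", logging_dir="runs")
config = {"lr": 2e-5, "num_epochs": 3}
if accelerator.is_main_process:
    accelerator.init_trackers("example_run", config)

for epoch in range(config["num_epochs"]):
    train_loss = 0.0  # placeholder for the loss accumulated over the epoch
    accelerator.log({"train_loss": train_loss, "epoch": epoch}, step=epoch)

accelerator.end_training()  # flush and close the trackers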
||||
# New Code #
|
||||
@ -254,7 +252,7 @@ def main():
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
|
||||
@ -104,10 +104,11 @@ def training_function(config, args):
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
if accelerator.is_main_process:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
# Grab all the image filenames
|
||||
file_names = [os.path.join(args.data_dir, fname) for fname in os.listdir(args.data_dir) if fname.endswith(".jpg")]
|
||||
@ -232,7 +233,7 @@ def training_function(config, args):
|
||||
accelerator.save_state(output_dir)
|
||||
model.eval()
|
||||
accurate = 0
|
||||
num_elems = 0
|
||||
samples_seen = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch = {k: v.to(accelerator.device) for k, v in batch.items()}
|
||||
@ -240,16 +241,29 @@ def training_function(config, args):
|
||||
with torch.no_grad():
|
||||
outputs = model(inputs)
|
||||
predictions = outputs.argmax(dim=-1)
|
||||
accurate_preds = accelerator.gather(predictions) == accelerator.gather(batch["label"])
|
||||
num_elems += accurate_preds.shape[0]
|
||||
predictions, references = accelerator.gather((predictions, batch["label"]))
|
||||
if accelerator.use_distributed:
|
||||
if step == len(eval_dataloader) - 1:
|
||||
predictions = predictions[: len(eval_dataloader) - samples_seen]
|
||||
references = references[: len(eval_dataloader) - samples_seen]
|
||||
else:
|
||||
samples_seen += references.shape[0]
|
||||
else:
|
||||
samples_seen += references.shape[0]
|
||||
accurate_preds = predictions == references
|
||||
accurate += accurate_preds.long().sum()
|
||||
|
||||
eval_metric = accurate.item() / num_elems
|
||||
eval_metric = accurate.item() / samples_seen
|
||||
# Use accelerator.print to print only on the main process.
|
||||
accelerator.print(f"epoch {epoch}: {100 * eval_metric:.2f}")
|
||||
if args.with_tracking:
|
||||
accelerator.log(
|
||||
{"accuracy": 100 * eval_metric, "total_loss": total_loss, "epoch": epoch}, step=overall_step
|
||||
{
|
||||
"accuracy": 100 * eval_metric,
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
"epoch": epoch,
|
||||
},
|
||||
step=overall_step,
|
||||
)
|
||||
if checkpointing_steps == "epoch":
|
||||
output_dir = f"epoch_{epoch}"
|
||||
|
||||
@ -16,17 +16,13 @@ import argparse
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
@ -75,20 +71,20 @@ def training_function(config, args):
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
# We need to initialize the trackers we use, and also store our configuration
|
||||
if args.with_tracking:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
if accelerator.is_main_process:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
if args.logging_dir:
|
||||
run = os.path.join(args.logging_dir, run)
|
||||
accelerator.init_trackers(run, config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
datasets = load_dataset("glue", "mrpc")
|
||||
metric = load_metric("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
@ -108,7 +104,7 @@ def training_function(config, args):
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
@ -137,7 +133,7 @@ def training_function(config, args):
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
@ -215,6 +211,7 @@ def training_function(config, args):
|
||||
accelerator.save_state(output_dir)
|
||||
|
||||
model.eval()
|
||||
samples_seen = 0
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
# We could avoid this line since we set the accelerator with `device_placement=True`.
|
||||
batch.to(accelerator.device)
|
||||
@ -222,7 +219,15 @@ def training_function(config, args):
|
||||
outputs = model(**batch)
|
||||
predictions = outputs.logits.argmax(dim=-1)
|
||||
# It is slightly faster to call this once rather than multiple times
|
||||
predictions, references = accelerator.gather((predictions, batch["labels"]))
|
||||
predictions, references = accelerator.gather(
|
||||
(predictions, batch["labels"])
|
||||
) # If we are in a multiprocess environment, the last batch has duplicates
|
||||
if accelerator.use_distributed:
|
||||
if step == len(eval_dataloader) - 1:
|
||||
predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
|
||||
references = references[: len(eval_dataloader.dataset) - samples_seen]
|
||||
else:
|
||||
samples_seen += references.shape[0]
|
||||
metric.add_batch(
|
||||
predictions=predictions,
|
||||
references=references,
|
||||
@ -236,9 +241,10 @@ def training_function(config, args):
|
||||
{
|
||||
"accuracy": eval_metric["accuracy"],
|
||||
"f1": eval_metric["f1"],
|
||||
"train_loss": total_loss,
|
||||
"train_loss": total_loss.item() / len(train_dataloader),
|
||||
"epoch": epoch,
|
||||
}
|
||||
},
|
||||
step=epoch,
|
||||
)
|
||||
|
||||
if checkpointing_steps == "epoch":
|
||||
@ -293,7 +299,7 @@ def main():
|
||||
help="Location on where to store experiment tracking logs`",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
|
||||
@ -73,7 +73,7 @@ class PetsDataset(Dataset):
|
||||
|
||||
def training_function(config, args):
|
||||
# Initialize accelerator
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mix_precision)
|
||||
accelerator = Accelerator(cpu=args.cpu, mixed_precision=args.mixed_precision)
|
||||
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
|
||||
examples/deepspeed_config_templates/zero_stage1_config.json (new file, 43 lines)
@@ -0,0 +1,43 @@
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "weight_decay": "auto",
            "torch_adam": true,
            "adam_w_mode": true
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": 1,
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
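One way to consume a template like the one above from Python (a sketch only; the usual route is to point `accelerate config` at the file, and the `hf_ds_config` argument and the path below are assumptions used for illustration).

from accelerate import Accelerator, DeepSpeedPlugin

deepspeed_plugin = DeepSpeedPlugin(
    hf_ds_config="examples/deepspeed_config_templates/zero_stage1_config.json"
)
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
# The "auto" entries in the JSON are filled in from the objects later passed to
# accelerator.prepare(...) (batch size, optimizer, scheduler, ...).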
examples/deepspeed_config_templates/zero_stage2_config.json (new file, 43 lines)
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto",
|
||||
"torch_adam": true,
|
||||
"adam_w_mode": true
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"contiguous_gradients": true
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto",
|
||||
"torch_adam": true,
|
||||
"adam_w_mode": true
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"contiguous_gradients": true
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
examples/deepspeed_config_templates/zero_stage3_config.json (new file, 44 lines)
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"sub_group_size": 1e9,
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": "auto"
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"sub_group_size": 1e9,
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": "auto"
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@ -15,17 +15,13 @@
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import evaluate
|
||||
from accelerate import Accelerator, DistributedType
|
||||
from datasets import load_dataset, load_metric
|
||||
from transformers import (
|
||||
AdamW,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
set_seed,
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
|
||||
|
||||
|
||||
########################################################################
|
||||
@ -102,15 +98,14 @@ def training_function(config, args):
|
||||
# Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
|
||||
lr = config["lr"]
|
||||
num_epochs = int(config["num_epochs"])
|
||||
correct_bias = config["correct_bias"]
|
||||
seed = int(config["seed"])
|
||||
batch_size = int(config["batch_size"])
|
||||
|
||||
metric = load_metric("glue", "mrpc")
|
||||
metric = evaluate.load("glue", "mrpc")
|
||||
|
||||
# If the batch size is too big we use gradient accumulation
|
||||
gradient_accumulation_steps = 1
|
||||
if batch_size > MAX_GPU_BATCH_SIZE:
|
||||
if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
|
||||
gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
|
||||
batch_size = MAX_GPU_BATCH_SIZE
|
||||
|
||||
@ -125,7 +120,7 @@ def training_function(config, args):
|
||||
model = model.to(accelerator.device)
|
||||
|
||||
# Instantiate optimizer
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=correct_bias)
|
||||
optimizer = AdamW(params=model.parameters(), lr=lr)
|
||||
|
||||
# Instantiate scheduler
|
||||
lr_scheduler = get_linear_schedule_with_warmup(
|
||||
@ -187,7 +182,7 @@ def main():
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.")
|
||||
args = parser.parse_args()
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "correct_bias": True, "seed": 42, "batch_size": 16}
|
||||
config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
|
||||
training_function(config, args)
|
||||
|
||||
|
||||
|
||||
@ -1 +1,3 @@
|
||||
accelerate # used to be installed in Amazon SageMaker environment
|
||||
accelerate # used to be installed in Amazon SageMaker environment
|
||||
evaluate
|
||||
datasets==2.3.2
|
||||
setup.py (19 changed lines)
@@ -16,18 +16,22 @@ from setuptools import setup
|
||||
from setuptools import find_packages
|
||||
|
||||
extras = {}
|
||||
extras["quality"] = ["black ~= 22.0", "isort >= 5.5.4", "flake8 >= 3.8.3"]
|
||||
extras["quality"] = ["black ~= 22.0", "isort >= 5.5.4", "flake8 >= 3.8.3", "hf-doc-builder >= 0.3.0"]
|
||||
extras["docs"] = []
|
||||
extras["test"] = [
|
||||
"pytest",
|
||||
"pytest-xdist",
|
||||
"pytest-subtests",
|
||||
"datasets",
|
||||
"datasets<=2.2.2",
|
||||
"evaluate",
|
||||
"transformers",
|
||||
"scipy",
|
||||
"sklearn"
|
||||
"sklearn",
|
||||
"parameterized",
|
||||
"deepspeed",
|
||||
]
|
||||
extras["test_trackers"] = ["wandb", "comet-ml", "tensorflow"]
|
||||
|
||||
extras["test_trackers"] = ["wandb", "comet-ml", "tensorboard"]
|
||||
extras["dev"] = extras["quality"] + extras["test"]
|
||||
|
||||
extras["sagemaker"] = [
|
||||
@ -36,7 +40,7 @@ extras["sagemaker"] = [
|
||||
|
||||
setup(
|
||||
name="accelerate",
|
||||
version="0.7.0.dev0",
|
||||
version="0.11.0",
|
||||
description="Accelerate",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
@ -54,8 +58,8 @@ setup(
|
||||
"accelerate-launch=accelerate.commands.launch:main",
|
||||
]
|
||||
},
|
||||
python_requires=">=3.6.0",
|
||||
install_requires=["torch>=1.4.0", "pyyaml", "numpy>=1.17"],
|
||||
python_requires=">=3.7.0",
|
||||
install_requires=["numpy>=1.17", "packaging>=20.0", "psutil", "pyyaml", "torch>=1.4.0"],
|
||||
extras_require=extras,
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
@ -65,7 +69,6 @@ setup(
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
],
|
||||
|
||||
@ -2,10 +2,20 @@
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
__version__ = "0.7.0.dev0"
|
||||
__version__ = "0.11.0"
|
||||
|
||||
from .accelerator import Accelerator
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs, InitProcessGroupKwargs
|
||||
from .big_modeling import cpu_offload, disk_offload, dispatch_model, init_empty_weights, load_checkpoint_and_dispatch
|
||||
from .launchers import debug_launcher, notebook_launcher
|
||||
from .state import DistributedType
|
||||
from .utils import DeepSpeedPlugin, synchronize_rng_states
|
||||
from .utils import (
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
FullyShardedDataParallelPlugin,
|
||||
GradScalerKwargs,
|
||||
InitProcessGroupKwargs,
|
||||
find_executable_batch_size,
|
||||
infer_auto_device_map,
|
||||
load_checkpoint_in_model,
|
||||
synchronize_rng_states,
|
||||
)
|
||||
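A small sketch of the newly re-exported big-model utilities (`init_empty_weights`, `load_checkpoint_and_dispatch`); the model class and checkpoint path below are placeholders, not part of this diff.

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")
with init_empty_weights():
    # Parameters are created on the "meta" device, so no memory is allocated yet.
    model = AutoModelForCausalLM.from_config(config)

model = load_checkpoint_and_dispatch(model, "path/to/checkpoint", device_map="auto")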
|
||||
@ -12,7 +12,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import contextlib
|
||||
import gc
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
@ -21,25 +23,33 @@ from typing import List, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
|
||||
from .data_loader import prepare_data_loader
|
||||
from .kwargs_handlers import DistributedDataParallelKwargs, GradScalerKwargs, InitProcessGroupKwargs, KwargsHandler
|
||||
from .logging import get_logger
|
||||
from .optimizer import AcceleratedOptimizer
|
||||
from .scheduler import AcceleratedScheduler
|
||||
from .state import AcceleratorState, DistributedType, is_deepspeed_available
|
||||
from .state import AcceleratorState, GradientState
|
||||
from .tracking import LOGGER_TYPE_TO_CLASS, GeneralTracker, filter_trackers
|
||||
from .utils import (
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
FullyShardedDataParallelPlugin,
|
||||
GradScalerKwargs,
|
||||
InitProcessGroupKwargs,
|
||||
KwargsHandler,
|
||||
LoggerType,
|
||||
PrecisionType,
|
||||
RNGType,
|
||||
compare_versions,
|
||||
convert_outputs_to_fp32,
|
||||
extract_model_from_parallel,
|
||||
gather,
|
||||
get_pretty_name,
|
||||
is_bf16_available,
|
||||
is_deepspeed_available,
|
||||
is_torch_version,
|
||||
is_tpu_available,
|
||||
pad_across_processes,
|
||||
reduce,
|
||||
save,
|
||||
@ -50,12 +60,18 @@ from .utils import (
|
||||
if is_deepspeed_available():
|
||||
import deepspeed
|
||||
|
||||
from .deepspeed_utils import DeepSpeedEngineWrapper, DeepSpeedOptimizerWrapper
|
||||
from .utils import (
|
||||
DeepSpeedEngineWrapper,
|
||||
DeepSpeedOptimizerWrapper,
|
||||
DeepSpeedSchedulerWrapper,
|
||||
DummyOptim,
|
||||
DummyScheduler,
|
||||
)
|
||||
|
||||
import logging
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.distributed.xla_multiprocessing as xmp
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class Accelerator:
|
||||
@ -76,6 +92,9 @@ class Accelerator:
|
||||
default to the value in the environment variable `MIXED_PRECISION`, which will use the default value in the
|
||||
accelerate config of the current system or the flag passed with the `accelerate.launch` command. 'fp16'
|
||||
requires pytorch 1.6 or higher. 'bf16' requires pytorch 1.10 or higher.
|
||||
gradient_accumulation_steps (`int`, *optional*, default to 1):
|
||||
The number of steps that should pass before gradients are accumulated. A number > 1 should be combined with
|
||||
`Accelerator.accumulate`.
|
||||
cpu (`bool`, *optional*):
|
||||
Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force
|
||||
the execution on one process only.
|
||||
@ -130,6 +149,7 @@ class Accelerator:
|
||||
split_batches: bool = False,
|
||||
fp16: bool = None,
|
||||
mixed_precision: Union[PrecisionType, str] = None,
|
||||
gradient_accumulation_steps: int = 1,
|
||||
cpu: bool = False,
|
||||
deepspeed_plugin: DeepSpeedPlugin = None,
|
||||
fsdp_plugin: FullyShardedDataParallelPlugin = None,
|
||||
@ -141,7 +161,10 @@ class Accelerator:
|
||||
kwargs_handlers: Optional[List[KwargsHandler]] = None,
|
||||
):
|
||||
self.logging_dir = logging_dir
|
||||
self.log_with = filter_trackers(log_with, self.logging_dir)
|
||||
trackers = filter_trackers(log_with, self.logging_dir)
|
||||
if len(trackers) < 1 and log_with is not None:
|
||||
warnings.warn(f"`log_with={log_with}` was passed but no supported trackers are currently installed.")
|
||||
self.log_with = trackers
|
||||
|
||||
if mixed_precision is not None:
|
||||
mixed_precision = str(mixed_precision)
|
||||
@ -160,15 +183,27 @@ class Accelerator:
|
||||
assert isinstance(
|
||||
deepspeed_plugin, DeepSpeedPlugin
|
||||
), "`deepspeed_plugin` must be a DeepSpeedPlugin object."
|
||||
os.environ["USE_DEEPSPEED"] = "true" # use DeepSpeed if plugin is provided
|
||||
if deepspeed_plugin:
|
||||
if not is_deepspeed_available():
|
||||
raise ImportError("DeepSpeed is not installed => run `pip install deepspeed` or build it from source.")
|
||||
if compare_versions("deepspeed", "<", "0.6.5"):
|
||||
raise ImportError("DeepSpeed version must be >= 0.6.5. Please update DeepSpeed.")
|
||||
|
||||
if os.environ.get("USE_FSDP", "false") == "true":
|
||||
if version.parse(torch.__version__) < version.parse("1.12.0.dev20220418+cu113"):
|
||||
raise ValueError("FSDP requires PyTorch >= 1.12.0.dev20220418+cu113")
|
||||
if fsdp_plugin is None: # init from env variables
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin()
|
||||
else:
|
||||
if not isinstance(fsdp_plugin, FullyShardedDataParallelPlugin):
|
||||
raise TypeError("`fsdp_plugin` must be a FullyShardedDataParallelPlugin object.")
|
||||
mixed_precision = os.environ.get("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
deepspeed_plugin.set_mixed_precision(mixed_precision)
|
||||
deepspeed_plugin.set_deepspeed_weakref()
|
||||
|
||||
if os.environ.get("USE_FSDP", "false") == "true" or isinstance(fsdp_plugin, FullyShardedDataParallelPlugin):
|
||||
if is_torch_version("<", "1.12.0"):
|
||||
raise ValueError("FSDP requires PyTorch >= 1.12.0")
|
||||
|
||||
if fsdp_plugin is None: # init from env variables
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin() if os.environ.get("USE_FSDP", "false") == "true" else None
|
||||
else:
|
||||
if not isinstance(fsdp_plugin, FullyShardedDataParallelPlugin):
|
||||
raise TypeError("`fsdp_plugin` must be a FullyShardedDataParallelPlugin object.")
|
||||
os.environ["USE_FSDP"] = "true" # use FSDP if plugin is provided
|
||||
|
||||
# Kwargs handlers
|
||||
self.ddp_handler = None
|
||||
@ -203,10 +238,17 @@ class Accelerator:
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if gradient_accumulation_steps > 1:
|
||||
if self.state.distributed_type == DistributedType.TPU:
|
||||
raise NotImplementedError(
|
||||
"Gradient accumulation on TPU is not supported. Pass in `gradient_accumulation_steps=1`"
|
||||
)
|
||||
|
||||
self.gradient_accumulation_steps = gradient_accumulation_steps
|
||||
self.device_placement = device_placement
|
||||
self.split_batches = split_batches
|
||||
self.dispatch_batches = dispatch_batches
|
||||
if dispatch_batches is True and version.parse(torch.__version__) < version.parse("1.8.0"):
|
||||
if dispatch_batches is True and is_torch_version("<", "1.8.0"):
|
||||
raise ImportError(
|
||||
"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
|
||||
)
|
||||
@ -215,20 +257,33 @@ class Accelerator:
|
||||
# Mixed precision attributes
|
||||
self.scaler = None
|
||||
self.native_amp = False
|
||||
err = "{mode} mixed precision requires {requirement}"
|
||||
if self.state.mixed_precision == "fp16":
|
||||
self.native_amp = version.parse(torch.__version__) >= version.parse("1.6")
|
||||
if version.parse(torch.__version__) < version.parse("1.6"):
|
||||
raise ValueError("fp16 mixed precision requires PyTorch >= 1.6")
|
||||
|
||||
self.native_amp = is_torch_version(">=", "1.6")
|
||||
if not self.native_amp:
|
||||
raise ValueError(err.format(mode="fp16", requirement="PyTorch >= 1.6"))
|
||||
if not torch.cuda.is_available():
|
||||
raise ValueError(err.format(mode="fp16", requirement="a GPU"))
|
||||
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
elif self.state.mixed_precision == "bf16":
|
||||
self.native_amp = version.parse(torch.__version__) >= version.parse("1.10")
|
||||
if mixed_precision == "bf16" and version.parse(torch.__version__) < version.parse("1.10"):
|
||||
raise ValueError("bf16 mixed precision requires PyTorch >= 1.10")
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
|
||||
|
||||
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
self.scaler = ShardedGradScaler(**kwargs)
|
||||
else:
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
elif self.state.mixed_precision == "bf16" and self.distributed_type != DistributedType.FSDP:
|
||||
self.native_amp = is_bf16_available(True)
|
||||
if mixed_precision == "bf16" and not self.native_amp and not is_tpu_available():
|
||||
raise ValueError(err.format(mode="bf16", requirement="PyTorch >= 1.10 and a supported device."))
|
||||
|
||||
# Only on the GPU do we care about scaling the gradients
|
||||
if torch.cuda.is_available():
|
||||
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
|
||||
self.scaler = torch.cuda.amp.GradScaler(**kwargs)
|
||||
|
||||
# Start of internal step tracking
|
||||
self.step = 0
|
||||
self.gradient_state = GradientState()
|
||||
|
||||
# Internal references to the training objects
|
||||
self._optimizers = []
|
||||
@ -239,7 +294,11 @@ class Accelerator:
|
||||
# RNG Types
|
||||
self.rng_types = rng_types
|
||||
if self.rng_types is None:
|
||||
self.rng_types = ["torch"] if version.parse(torch.__version__) <= version.parse("1.5.1") else ["generator"]
|
||||
self.rng_types = ["torch"] if is_torch_version("<=", "1.5.1") else ["generator"]
|
||||
|
||||
@property
|
||||
def use_distributed(self):
|
||||
return self.distributed_type != DistributedType.NO and self.num_processes > 1
|
||||
|
||||
@property
|
||||
def distributed_type(self):
|
||||
@ -316,6 +375,56 @@ class Accelerator:
|
||||
if is_main:
|
||||
self.wait_for_everyone()
|
||||
|
||||
@contextmanager
|
||||
def no_sync(self, model):
|
||||
"""
|
||||
A context manager to disable gradient synchronizations across DDP processes by calling
|
||||
`torch.nn.parallel.DistributedDataParallel.no_sync`.
|
||||
|
||||
If `model` is not in DDP, this context manager does nothing
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`):
|
||||
PyTorch Module that was prepared with `Accelerator.prepare`
|
||||
"""
|
||||
context = contextlib.nullcontext
|
||||
if self.use_distributed:
|
||||
context = getattr(model, "no_sync", context)
|
||||
|
||||
with context():
|
||||
yield
|
||||
|
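What `no_sync` is for, as a sketch: skipping the DDP all-reduce on the intermediate micro-batches of a manual accumulation loop; the objects below are assumed to come from `accelerator.prepare`, and the step count of 4 is illustrative.

accumulation_steps = 4
for step, batch in enumerate(train_dataloader):
    if (step + 1) % accumulation_steps != 0:
        # Intermediate micro-batch: accumulate local gradients without communication.
        with accelerator.no_sync(model):
            accelerator.backward(model(**batch).loss / accumulation_steps)
    else:
        # Final micro-batch: synchronize gradients and take an optimizer step.
        accelerator.backward(model(**batch).loss / accumulation_steps)
        optimizer.step()
        optimizer.zero_grad()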
||||
def _do_sync(self):
|
||||
"Sets the right `sync_gradients` context and either resets or increases `self.step`"
|
||||
if self.gradient_state.end_of_dataloader:
|
||||
self.step = 0
|
||||
self.gradient_state._set_sync_gradients(True)
|
||||
else:
|
||||
self.step += 1
|
||||
self.gradient_state._set_sync_gradients((self.step % self.gradient_accumulation_steps) == 0)
|
||||
|
||||
@property
|
||||
def sync_gradients(self):
|
||||
return self.gradient_state.sync_gradients
|
||||
|
||||
@contextmanager
|
||||
def accumulate(self, model):
|
||||
"""
|
||||
A context manager that will lightly wrap around and perform gradient accumulation automatically
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`):
|
||||
PyTorch Module that was prepared with `Accelerator.prepare`
|
||||
"""
|
||||
self._do_sync()
|
||||
if self.sync_gradients:
|
||||
context = contextlib.nullcontext
|
||||
else:
|
||||
context = self.no_sync
|
||||
|
||||
with context(model):
|
||||
yield
|
||||
|
||||
def print(self, *args, **kwargs):
|
||||
"""
|
||||
Use in replacement of `print()` to only print once per server.
|
||||
@ -462,36 +571,50 @@ class Accelerator:
|
||||
elif self.distributed_type == DistributedType.FSDP:
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
||||
|
||||
fsdp_plugin = self.state.fsdp_plugin
|
||||
model = FSDP(
|
||||
model,
|
||||
sharding_strategy=fsdp_plugin.sharding_strategy,
|
||||
cpu_offload=fsdp_plugin.cpu_offload,
|
||||
auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
|
||||
backward_prefetch=fsdp_plugin.backward_prefetch,
|
||||
ignored_modules=fsdp_plugin.ignored_modules,
|
||||
)
|
||||
if not fsdp_plugin.cpu_offload.offload_params:
|
||||
model.to(self.device)
|
||||
# Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
|
||||
# don't wrap it again
|
||||
if type(model) != FSDP:
|
||||
self.state.fsdp_plugin.set_auto_wrap_policy(model)
|
||||
fsdp_plugin = self.state.fsdp_plugin
|
||||
model = FSDP(
|
||||
model,
|
||||
sharding_strategy=fsdp_plugin.sharding_strategy,
|
||||
cpu_offload=fsdp_plugin.cpu_offload,
|
||||
auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
|
||||
backward_prefetch=fsdp_plugin.backward_prefetch,
|
||||
mixed_precision=fsdp_plugin.mixed_precision_policy,
|
||||
ignored_modules=fsdp_plugin.ignored_modules,
|
||||
)
|
||||
if not fsdp_plugin.cpu_offload.offload_params:
|
||||
model.to(self.device)
|
||||
elif self.distributed_type == DistributedType.MULTI_CPU:
|
||||
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
|
||||
if self.native_amp:
|
||||
if self.mixed_precision == "fp16" and version.parse(torch.__version__) >= version.parse("1.10"):
|
||||
if self.mixed_precision == "fp16" and is_torch_version(">=", "1.10"):
|
||||
model.forward = torch.cuda.amp.autocast(dtype=torch.float16)(model.forward)
|
||||
elif self.mixed_precision == "bf16":
|
||||
model.forward = torch.cuda.amp.autocast(dtype=torch.bfloat16)(model.forward)
|
||||
elif self.mixed_precision == "bf16" and self.distributed_type != DistributedType.TPU:
|
||||
device_type = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model.forward = torch.autocast(device_type=device_type, dtype=torch.bfloat16)(model.forward)
|
||||
else:
|
||||
model.forward = torch.cuda.amp.autocast()(model.forward)
|
||||
model.forward = convert_outputs_to_fp32(model.forward)
|
||||
if self.distributed_type == DistributedType.TPU and self.state.fork_launched:
|
||||
model = xmp.MpModelWrapper(model).to(self.device)
|
||||
return model
|
||||
|
||||
def _prepare_deepspeed(self, *args):
|
||||
|
||||
deepspeed_plugin = self.state.deepspeed_plugin
|
||||
self.deepspeed_config = deepspeed_plugin.deepspeed_config
|
||||
|
||||
result = [
|
||||
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
|
||||
for obj in args
|
||||
]
|
||||
|
||||
batch_sizes = [obj.batch_size for obj in args if hasattr(obj, "batch_size")]
|
||||
if self.split_batches:
|
||||
batch_sizes = [batch_size // self.num_processes for batch_size in batch_sizes]
|
||||
if len(batch_sizes) == 0:
|
||||
raise ValueError(
|
||||
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
|
||||
@ -500,73 +623,141 @@ class Accelerator:
|
||||
batch_size_per_device = min(batch_sizes) if deepspeed_plugin.is_train_batch_min else max(batch_sizes)
|
||||
if len(batch_sizes) > 1:
|
||||
logger.info(
|
||||
f"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here \
|
||||
{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
|
||||
"Since you passed both train and evaluation dataloader, `is_train_batch_min` (here "
|
||||
f"{deepspeed_plugin.is_train_batch_min} will decide the `train_batch_size` ({batch_size_per_device})."
|
||||
)
|
||||
|
||||
self.deepspeed_config["train_batch_size"] = (
|
||||
batch_size_per_device * deepspeed_plugin.gradient_accumulation_steps * self.num_processes
|
||||
)
|
||||
|
||||
result = [
|
||||
self._prepare_one(obj, first_pass=True) if isinstance(obj, torch.utils.data.DataLoader) else obj
|
||||
for obj in args
|
||||
]
|
||||
config_kwargs = {
|
||||
"train_micro_batch_size_per_gpu": batch_size_per_device,
|
||||
"train_batch_size": batch_size_per_device
|
||||
* deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
|
||||
* self.num_processes,
|
||||
"gradient_clipping": 1.0,
|
||||
"zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
|
||||
}
|
||||
|
||||
model = None
|
||||
optimizer = None
|
||||
scheduler = None
|
||||
for obj in result:
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
model = obj
|
||||
elif isinstance(obj, (torch.optim.Optimizer, dict)):
|
||||
elif isinstance(obj, (torch.optim.Optimizer, DummyOptim)):
|
||||
optimizer = obj
|
||||
elif (isinstance(obj, (torch.optim.lr_scheduler._LRScheduler, DummyScheduler))) or (
|
||||
type(obj).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES
|
||||
):
|
||||
scheduler = obj
|
||||
|
||||
if deepspeed_plugin.auto_opt_mapping:
|
||||
is_adam = isinstance(optimizer, torch.optim.Adam)
|
||||
is_adamw = isinstance(optimizer, torch.optim.AdamW)
|
||||
if (is_adam or is_adamw) and deepspeed_plugin.offload_optimizer_device == "cpu":
|
||||
defaults = optimizer.defaults
|
||||
params = []
|
||||
for group in optimizer.param_groups:
|
||||
params.extend(group["params"])
|
||||
|
||||
optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam(
|
||||
params,
|
||||
lr=defaults["lr"],
|
||||
bias_correction=True,
|
||||
betas=defaults["betas"],
|
||||
eps=defaults["eps"],
|
||||
weight_decay=defaults["weight_decay"],
|
||||
amsgrad=defaults["amsgrad"],
|
||||
adamw_mode=is_adamw,
|
||||
if optimizer is not None:
|
||||
if "optimizer" in deepspeed_plugin.deepspeed_config and not isinstance(optimizer, (DummyOptim)):
|
||||
raise ValueError(
|
||||
"You cannot specify an optimizer in the config file and in the code at the same time. "
|
||||
"Please remove the optimizer from the config file or "
|
||||
"create `accelerate.utils.DummyOptim` in the code."
|
||||
)
|
||||
elif "optimizer" not in deepspeed_plugin.deepspeed_config and isinstance(optimizer, (DummyOptim)):
|
||||
raise ValueError(
|
||||
"You cannot create a `DummyOptim` without specifying an optimizer in the config file."
|
||||
)
|
||||
|
||||
if isinstance(optimizer, (torch.optim.Optimizer)):
|
||||
deepspeed_plugin.deepspeed_config["zero_allow_untested_optimizer"] = True
|
||||
|
||||
if scheduler is not None:
|
||||
if "scheduler" in deepspeed_plugin.deepspeed_config and not isinstance(scheduler, (DummyScheduler)):
|
||||
raise ValueError(
|
||||
"You cannot specify a scheduler in the config file and in the code at the same time. "
|
||||
"Please remove the scheduler from the config file or "
|
||||
"create `accelerate.utils.DummyScheduler` in the code."
|
||||
)
|
||||
elif "scheduler" not in deepspeed_plugin.deepspeed_config and isinstance(scheduler, (DummyScheduler)):
|
||||
raise ValueError(
|
||||
"You cannot create a `DummyScheduler` without specifying a scheduler in the config file."
|
||||
)
|
||||
|
||||
if optimizer is not None and scheduler is not None:
|
||||
if isinstance(optimizer, (DummyOptim)) and not isinstance(scheduler, (DummyScheduler)):
|
||||
raise ValueError(
|
||||
"You can only specify `accelerate.utils.DummyScheduler` in the code when using "
|
||||
"`accelerate.utils.DummyOptim`."
|
||||
)
|
||||
|
||||
# useful when only eval_dataloader is given into `accelerator.prepare()`
|
||||
if model is not None:
|
||||
engine = DeepSpeedEngineWrapper(
|
||||
args=None,
|
||||
model=model,
|
||||
optimizer=optimizer,
|
||||
config_params=self.deepspeed_config,
|
||||
dist_init_required=False,
|
||||
)
|
||||
if hasattr(model, "config") and hasattr(model.config, "hidden_size"):
|
||||
hidden_size = model.config.hidden_size
|
||||
config_kwargs.update(
|
||||
{
|
||||
"zero_optimization.reduce_bucket_size": hidden_size * hidden_size,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size,
|
||||
"zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size,
|
||||
}
|
||||
)
|
||||
|
||||
if isinstance(optimizer, (DummyOptim)):
|
||||
config_kwargs.update(
|
||||
{"optimizer.params.lr": optimizer.lr, "optimizer.params.weight_decay": optimizer.weight_decay}
|
||||
)
|
||||
if isinstance(scheduler, (DummyScheduler)):
|
||||
config_kwargs.update(
|
||||
{
|
||||
"scheduler.params.warmup_min_lr": 0,
|
||||
"scheduler.params.warmup_max_lr": scheduler.optimizer.lr,
|
||||
"scheduler.params.warmup_num_steps": scheduler.warmup_num_steps,
|
||||
}
|
||||
)
|
||||
if scheduler.total_num_steps is not None:
|
||||
config_kwargs["scheduler.params.total_num_steps"] = (
|
||||
math.ceil(scheduler.total_num_steps / self.num_processes)
|
||||
if not self.split_batches
|
||||
else scheduler.total_num_steps
|
||||
)
|
||||
deepspeed_plugin.deepspeed_config_process(must_match=False, **config_kwargs)
|
||||
self.deepspeed_config = deepspeed_plugin.deepspeed_config
|
||||
kwargs = dict(model=model, config_params=self.deepspeed_config)
|
||||
if optimizer is not None:
|
||||
if isinstance(optimizer, (DummyOptim)):
|
||||
kwargs["model_parameters"] = optimizer.params
|
||||
else:
|
||||
kwargs["optimizer"] = optimizer
|
||||
if scheduler is not None:
|
||||
if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES:
|
||||
kwargs["lr_scheduler"] = scheduler
|
||||
|
||||
engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
|
||||
if optimizer is not None:
|
||||
optimizer = DeepSpeedOptimizerWrapper(optimizer)
|
||||
if scheduler is not None:
|
||||
if lr_scheduler is None:
|
||||
scheduler = AcceleratedScheduler(
|
||||
scheduler,
|
||||
optimizer,
|
||||
step_with_optimizer=self.step_scheduler_with_optimizer,
|
||||
split_batches=self.split_batches,
|
||||
)
|
||||
else:
|
||||
scheduler = DeepSpeedSchedulerWrapper(lr_scheduler, optimizer)
|
||||
|
||||
for i in range(len(result)):
|
||||
if isinstance(result[i], torch.nn.Module):
|
||||
result[i] = engine
|
||||
elif isinstance(result[i], torch.optim.Optimizer):
|
||||
result[i] = DeepSpeedOptimizerWrapper(engine.optimizer, engine)
|
||||
self.deepspeed_engine = engine # pointing for deepspeed_engine.backward()
|
||||
elif isinstance(result[i], (torch.optim.Optimizer, DummyOptim)):
|
||||
result[i] = optimizer
|
||||
elif (isinstance(result[i], (torch.optim.lr_scheduler._LRScheduler, DummyScheduler))) or (
|
||||
type(result[i]).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES
|
||||
):
|
||||
result[i] = scheduler
|
||||
# pointing for deepspeed_engine_wrapped.backward()
|
||||
self.deepspeed_engine_wrapped = DeepSpeedEngineWrapper(engine)
|
||||
self._models.append(engine)
|
||||
self._optimizers.append(engine.optimizer)
|
||||
assert (
|
||||
len(self._models) == 1
|
||||
), "You can't use same `Accelerator()` instance with 2 models when using DeepSpeed"
|
||||
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
assert hasattr(
|
||||
self, "deepspeed_engine"
|
||||
), "You need to pass the model along the optimizer when using Deepspeed."
|
||||
|
||||
if optimizer is not None:
|
||||
self._optimizers.append(optimizer)
|
||||
if scheduler is not None:
|
||||
self._schedulers.append(scheduler)
|
||||
if len(self._models) > 1:
|
||||
raise AssertionError(
|
||||
"You can't use same `Accelerator()` instance with multiple models when using DeepSpeed"
|
||||
)
|
||||
return tuple(result)
|
||||
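A sketch of the `DummyOptim`/`DummyScheduler` path described by the checks above: when the DeepSpeed config file already defines "optimizer" and "scheduler" blocks, placeholders are passed through `prepare` instead of real PyTorch objects; the hyper-parameter values and surrounding objects below are assumptions.

from accelerate.utils import DummyOptim, DummyScheduler

optimizer = DummyOptim(params=model.parameters(), lr=2e-5, weight_decay=0.0)
lr_scheduler = DummyScheduler(optimizer, total_num_steps=1000, warmup_num_steps=100)

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)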
|
||||
def prepare_data_loader(self, data_loader):
|
||||
@ -576,7 +767,7 @@ class Accelerator:
|
||||
num_processes=self.num_processes,
|
||||
process_index=self.process_index,
|
||||
split_batches=self.split_batches,
|
||||
put_on_device=self.device_placement,
|
||||
put_on_device=self.device_placement if self.distributed_type != DistributedType.TPU else False,
|
||||
rng_types=self.rng_types.copy(),
|
||||
dispatch_batches=self.dispatch_batches,
|
||||
)
|
||||
@ -603,8 +794,9 @@ class Accelerator:
|
||||
"""
|
||||
Use `accelerator.backward(loss)` in lieu of `loss.backward()`.
|
||||
"""
|
||||
loss /= self.gradient_accumulation_steps
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
self.deepspeed_engine.backward(loss, **kwargs)
|
||||
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
|
||||
elif self.scaler is not None:
|
||||
self.scaler.scale(loss).backward(**kwargs)
|
||||
else:
|
||||
@ -635,11 +827,15 @@ class Accelerator:
|
||||
Should be used in place of `torch.nn.utils.clip_grad_norm_`.
|
||||
"""
|
||||
if self.distributed_type == DistributedType.FSDP:
|
||||
self.unscale_gradients()
|
||||
parameters = [p for p in parameters]
|
||||
for model in self._models:
|
||||
if parameters == [p for p in model.parameters()]:
|
||||
model.clip_grad_norm_(max_norm, norm_type)
|
||||
return
|
||||
elif self.distributed_type == DistributedType.DEEPSPEED:
|
||||
# `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed here
|
||||
return
|
||||
self.unscale_gradients()
|
||||
torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
|
||||
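A sketch of calling the clipping helper from a training loop; the threshold of 1.0 and the prepared objects are assumptions.

max_grad_norm = 1.0
for batch in train_dataloader:
    accelerator.backward(model(**batch).loss)
    if accelerator.sync_gradients:
        # Only clip on steps where gradients are actually synchronized.
        accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    optimizer.zero_grad()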
|
||||
@ -647,6 +843,8 @@ class Accelerator:
|
||||
"""
|
||||
Should be used in place of `torch.nn.utils.clip_grad_value_`.
|
||||
"""
|
||||
if self.distributed_type in [DistributedType.DEEPSPEED, DistributedType.FSDP]:
|
||||
raise Exception("DeepSpeed and FSDP do not support `clip_grad_value_`. Use `clip_grad_norm_` instead.")
|
||||
self.unscale_gradients()
|
||||
torch.nn.utils.clip_grad_value_(parameters, clip_value)
|
||||
|
||||
@ -668,17 +866,23 @@ class Accelerator:
|
||||
"""
|
||||
return gather(tensor)
|
||||
|
||||
def reduce(self, tensor: torch.Tensor, reduction="sum"):
|
||||
def reduce(self, tensor, reduction="sum"):
|
||||
"""
|
||||
Reduce the values in *tensor* across all processes based on *reduction*.
|
||||
|
||||
Note:
|
||||
All processes get the reduced value.
|
||||
|
||||
Args:
|
||||
tensor (`torch.Tensor`):
|
||||
tensor (`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`):
|
||||
The tensors to reduce across all processes.
|
||||
reduction (`str`, *optional*, defaults to "sum"):
|
||||
A reduction type, can be one of 'sum', 'mean', or 'none'. If 'none', will not perform any operation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`, or a nested tuple/list/dictionary of `torch.Tensor`: The reduced tensor(s).
|
||||
"""
|
||||
reduce(tensor, reduction)
|
||||
return reduce(tensor, reduction)
|
||||
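A small, hedged usage sketch of `reduce`: run under `accelerate launch` with several processes, every rank prints the same summed value; on a single process the call is effectively a pass-through.

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()
local = torch.tensor([float(accelerator.process_index)], device=accelerator.device)
total = accelerator.reduce(local, reduction="sum")
print(f"rank {accelerator.process_index} sees {total}")
```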
|
||||
def pad_across_processes(self, tensor, dim=0, pad_index=0, pad_first=False):
|
||||
"""
|
||||
@ -786,7 +990,7 @@ class Accelerator:
|
||||
output_dir = os.path.expanduser(output_dir)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
logger.info(f"Saving current state to {output_dir}")
|
||||
weights = [self.get_state_dict(m) for m in self._models]
|
||||
weights = [self.get_state_dict(m, unwrap=False) for m in self._models]
|
||||
save_location = save_accelerator_state(
|
||||
output_dir, weights, self._optimizers, self._schedulers, self.state.process_index, self.scaler
|
||||
)
|
||||
@ -829,7 +1033,7 @@ class Accelerator:
|
||||
self._schedulers = []
|
||||
self._optimizers = []
|
||||
self._models = []
|
||||
self.deepspeed_engine = None
|
||||
self.deepspeed_engine_wrapped = None
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@ -865,21 +1069,30 @@ class Accelerator:
|
||||
break
|
||||
return (model_device, optimizer_device)
|
||||
|
||||
def get_state_dict(self, model):
def get_state_dict(self, model, unwrap=True):
is_zero_3 = False
if is_deepspeed_available():
if isinstance(model, DeepSpeedEngineWrapper) and self.distributed_type == DistributedType.DEEPSPEED:
is_zero_3 = self.state.deepspeed_plugin.zero_stage == 3
if self.distributed_type == DistributedType.DEEPSPEED:
is_zero_3 = self.deepspeed_config["zero_optimization"]["stage"] == 3

if is_zero_3:
state_dict = model._zero3_consolidated_fp16_state_dict()
if model.zero_gather_16bit_weights_on_model_save():
state_dict = model._zero3_consolidated_16bit_state_dict()
else:
raise ValueError(
"Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
"To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
"set `zero3_save_16bit_model` to True when using `accelerate config`. "
"To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
)
else:
model = self.unwrap_model(model)
if unwrap:
model = self.unwrap_model(model)
state_dict = model.state_dict()

for k in state_dict:
if state_dict[k].dtype == torch.float16:
state_dict[k] = state_dict[k].float()
if state_dict is not None:
for k in state_dict:
if state_dict[k].dtype == torch.float16:
state_dict[k] = state_dict[k].float()

return state_dict
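A hedged sketch of how `get_state_dict` is typically used when saving: it unwraps the prepared model, upcasts fp16 tensors to fp32, and (under ZeRO Stage-3) consolidates the sharded 16-bit weights as shown above. The file name is illustrative.

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = accelerator.prepare(torch.nn.Linear(4, 4))

state_dict = accelerator.get_state_dict(model)
if accelerator.is_main_process:
    torch.save(state_dict, "model_weights.bin")
```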
|
||||
@@ -914,10 +1127,12 @@ class Accelerator:
different will happen otherwise.
"""
if self.native_amp:
if self.mixed_precision == "fp16" and version.parse(torch.__version__) >= version.parse("1.10"):
if self.mixed_precision == "fp16" and is_torch_version(">=", "1.10"):
autocast_context = torch.cuda.amp.autocast(dtype=torch.float16)
elif self.mixed_precision == "bf16":
autocast_context = torch.cuda.amp.autocast(dtype=torch.bfloat16)
elif self.mixed_precision == "bf16" and is_bf16_available():
if self.distributed_type in [DistributedType.NO, DistributedType.MULTI_CPU, DistributedType.MULTI_GPU]:
device_type = "cpu" if not torch.cuda.is_available() else "cuda"
autocast_context = torch.autocast(dtype=torch.bfloat16, device_type=device_type)
else:
autocast_context = torch.cuda.amp.autocast()
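A minimal usage sketch of the autocast context built above: whatever mixed precision was chosen in `accelerate config` is applied to the ops inside the block (model and batch here are placeholders).

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()  # picks up mixed_precision from the saved config
model = accelerator.prepare(torch.nn.Linear(8, 8))
batch = torch.randn(2, 8, device=accelerator.device)

with accelerator.autocast():
    out = model(batch)
```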
|
||||
|
||||
src/accelerate/big_modeling.py (new file, 324 lines)
@@ -0,0 +1,324 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .hooks import AlignDevicesHook, add_hook_to_module, attach_align_device_hook, attach_align_device_hook_on_blocks
|
||||
from .utils import (
|
||||
OffloadedWeightsLoader,
|
||||
check_device_map,
|
||||
extract_submodules_state_dict,
|
||||
infer_auto_device_map,
|
||||
load_checkpoint_in_model,
|
||||
offload_state_dict,
|
||||
)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def init_empty_weights(include_buffers: bool = False):
|
||||
"""
|
||||
A context manager under which models are initialized with all parameters on the meta device, therefore creating an
|
||||
empty model. Useful when just initializing the model would blow the available RAM.
|
||||
|
||||
Args:
|
||||
include_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to also put all buffers on the meta device while initializing.
|
||||
|
||||
Example:

```python
import torch.nn as nn
from accelerate import init_empty_weights

# Initialize a model with 100 billion parameters in no time and without using any RAM.
with init_empty_weights():
    tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Any model created under this context manager has no weights. As such you can't do something like
|
||||
`model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
|
||||
|
||||
</Tip>
|
||||
"""
|
||||
old_register_parameter = nn.Module.register_parameter
|
||||
if include_buffers:
|
||||
old_register_buffer = nn.Module.register_buffer
|
||||
|
||||
def register_empty_parameter(module, name, param):
|
||||
old_register_parameter(module, name, param)
|
||||
if param is not None:
|
||||
param_cls = type(module._parameters[name])
|
||||
kwargs = module._parameters[name].__dict__
|
||||
module._parameters[name] = param_cls(module._parameters[name].to(torch.device("meta")), **kwargs)
|
||||
|
||||
def register_empty_buffer(module, name, buffer):
|
||||
old_register_buffer(module, name, buffer)
|
||||
if buffer is not None:
|
||||
module._buffers[name] = module._buffers[name].to(torch.device("meta"))
|
||||
|
||||
try:
|
||||
nn.Module.register_parameter = register_empty_parameter
|
||||
if include_buffers:
|
||||
nn.Module.register_buffer = register_empty_buffer
|
||||
yield
|
||||
finally:
|
||||
nn.Module.register_parameter = old_register_parameter
|
||||
if include_buffers:
|
||||
nn.Module.register_buffer = old_register_buffer
|
||||
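A quick way to see what the patched `register_parameter` above achieves: parameters created inside the context live on the meta device, so no memory is allocated for them.

```python
import torch.nn as nn
from accelerate import init_empty_weights

with init_empty_weights():
    layer = nn.Linear(512, 512)

print(next(layer.parameters()).device)  # meta
```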
|
||||
|
||||
def cpu_offload(
|
||||
model: nn.Module,
|
||||
execution_device: Optional[torch.device] = None,
|
||||
offload_buffers: bool = False,
|
||||
state_dict: Optional[Dict[str, torch.Tensor]] = None,
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Activates full CPU offload for a model. As a result, all parameters of the model will be offloaded and only one
|
||||
copy of the state dict of the model will be kept. During the forward pass, parameters will be extracted from that
|
||||
state dict and put on the execution device passed as they are needed, then offloaded again.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`):
|
||||
The model to offload.
|
||||
execution_device (`torch.device`, *optional*):
|
||||
The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
|
||||
model first parameter device.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to offload the buffers with the model parameters.
|
||||
state_dict (`Dict[str, torch.Tensor]`, *optional*):
|
||||
The state dict of the model that will be kept on CPU.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
if execution_device is None:
|
||||
execution_device = next(iter(model.parameters())).device
|
||||
if state_dict is None:
|
||||
state_dict = {n: p.to("cpu") for n, p in model.state_dict().items()}
|
||||
attach_align_device_hook(
|
||||
model,
|
||||
execution_device=execution_device,
|
||||
offload=True,
|
||||
offload_buffers=offload_buffers,
|
||||
weights_map=state_dict,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
|
||||
return model
|
||||
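A hedged usage sketch of `cpu_offload` (the toy model is illustrative): the single CPU copy of the weights is streamed to the execution device module by module during the forward pass.

```python
import torch
import torch.nn as nn
from accelerate.big_modeling import cpu_offload

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = cpu_offload(model, execution_device=device)
out = model(torch.randn(1, 64))
```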
|
||||
|
||||
def disk_offload(
|
||||
model: nn.Module,
|
||||
offload_dir: Union[str, os.PathLike],
|
||||
execution_device: Optional[torch.device] = None,
|
||||
offload_buffers: bool = False,
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Activates full disk offload for a model. As a result, all parameters of the model will be offloaded as
|
||||
memory-mapped array in a given folder. During the forward pass, parameters will be accessed from that folder and
|
||||
put on the execution device passed as they are needed, then offloaded again.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to offload.
|
||||
offload_dir (`str` or `os.PathLike`):
|
||||
The folder in which to offload the model weights (or where the model weights are already offloaded).
|
||||
execution_device (`torch.device`, *optional*):
|
||||
The device on which the forward pass of the model will be executed (should be a GPU). Will default to the
|
||||
model's first parameter device.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to offload the buffers with the model parameters.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
if not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")):
|
||||
offload_state_dict(offload_dir, model.state_dict())
|
||||
if execution_device is None:
|
||||
execution_device = next(iter(model.parameters())).device
|
||||
weights_map = OffloadedWeightsLoader(save_folder=offload_dir)
|
||||
attach_align_device_hook(
|
||||
model,
|
||||
execution_device=execution_device,
|
||||
offload=True,
|
||||
offload_buffers=offload_buffers,
|
||||
weights_map=weights_map,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
|
||||
return model
|
||||
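And the disk counterpart, again as a hedged sketch, with a throwaway model and a temporary folder standing in for a real offload directory.

```python
import tempfile

import torch
import torch.nn as nn
from accelerate.big_modeling import disk_offload

model = nn.Sequential(nn.Linear(32, 32), nn.Linear(32, 4))
with tempfile.TemporaryDirectory() as offload_dir:
    # Writes the memory-mapped weights plus index.json into `offload_dir`.
    model = disk_offload(model, offload_dir=offload_dir)
    out = model(torch.randn(1, 32))
```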
|
||||
|
||||
def dispatch_model(
|
||||
model: nn.Module,
|
||||
device_map: Dict[str, Union[str, int, torch.device]],
|
||||
main_device: Optional[torch.device] = None,
|
||||
state_dict: Optional[Dict[str, torch.Tensor]] = None,
|
||||
offload_dir: Union[str, os.PathLike] = None,
|
||||
offload_buffers: bool = False,
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on
|
||||
the CPU or even the disk.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`):
|
||||
The model to dispatch.
|
||||
device_map (`Dict[str, Union[str, int, torch.device]]`):
|
||||
A dictionary mapping module names in the models `state_dict` to the device they should go to. Note that
|
||||
`"disk"` is accepted even if it's not a proper value for `torch.device`.
|
||||
main_device (`str`, `int` or `torch.device`, *optional*):
|
||||
The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or
|
||||
`"disk"`.
|
||||
state_dict (`Dict[str, torch.Tensor]`, *optional*):
|
||||
The state dict of the part of the model that will be kept on CPU.
|
||||
offload_dir (`str` or `os.PathLike`):
|
||||
The folder in which to offload the model weights (or where the model weights are already offloaded).
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to offload the buffers with the model parameters.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
# Error early if the device map is incomplete.
|
||||
check_device_map(model, device_map)
|
||||
|
||||
if main_device is None:
|
||||
main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]
|
||||
|
||||
cpu_modules = [name for name, device in device_map.items() if device == "cpu"]
|
||||
if state_dict is None and len(cpu_modules) > 0:
|
||||
state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules)
|
||||
|
||||
disk_modules = [name for name, device in device_map.items() if device == "disk"]
|
||||
if offload_dir is None and len(disk_modules) > 0:
|
||||
raise ValueError(
|
||||
"We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
|
||||
f"need to be offloaded: {', '.join(disk_modules)}."
|
||||
)
|
||||
if len(disk_modules) > 0 and (
|
||||
not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json"))
|
||||
):
|
||||
disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
|
||||
offload_state_dict(offload_dir, disk_state_dict)
|
||||
|
||||
execution_device = {
|
||||
name: main_device if device in ["cpu", "disk"] else device for name, device in device_map.items()
|
||||
}
|
||||
offload = {name: device in ["cpu", "disk"] for name, device in device_map.items()}
|
||||
save_folder = offload_dir if len(disk_modules) > 0 else None
|
||||
if state_dict is not None or save_folder is not None:
|
||||
weights_map = OffloadedWeightsLoader(state_dict=state_dict, save_folder=save_folder)
|
||||
else:
|
||||
weights_map = None
|
||||
|
||||
attach_align_device_hook_on_blocks(
|
||||
model,
|
||||
execution_device=execution_device,
|
||||
offload=offload,
|
||||
offload_buffers=offload_buffers,
|
||||
weights_map=weights_map,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
model.hf_device_map = device_map
|
||||
return model
|
||||
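A hedged sketch of `dispatch_model` with an explicit map; the keys `"0"`, `"1"`, `"2"` are simply the child names of the toy `nn.Sequential`. Note that this version expects at least one entry that is neither `"cpu"` nor `"disk"` to serve as the main device, so the example only runs when a GPU is visible.

```python
import torch
import torch.nn as nn
from accelerate.big_modeling import dispatch_model

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16), nn.Linear(16, 2))

if torch.cuda.is_available():
    # First two layers on GPU 0, last layer kept (and offloaded) on CPU.
    device_map = {"0": 0, "1": 0, "2": "cpu"}
    model = dispatch_model(model, device_map=device_map)
    out = model(torch.randn(1, 16))
```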
|
||||
|
||||
def load_checkpoint_and_dispatch(
|
||||
model: nn.Module,
|
||||
checkpoint: Union[str, os.PathLike],
|
||||
device_map: Optional[Union[str, Dict[str, Union[int, str, torch.device]]]] = None,
|
||||
max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
|
||||
no_split_module_classes: Optional[List[str]] = None,
|
||||
offload_folder: Optional[Union[str, os.PathLike]] = None,
|
||||
offload_buffers: bool = False,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
offload_state_dict: bool = False,
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
|
||||
loaded and adds the various hooks that will make this model run properly (even if split across devices).
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model in which we want to load a checkpoint.
|
||||
checkpoint (`str` or `os.PathLike`):
|
||||
The folder checkpoint to load. It can be:
|
||||
- a path to a file containing a whole model state dict
|
||||
- a path to a `.json` file containing the index to a sharded checkpoint
|
||||
- a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
|
||||
device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
|
||||
A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
|
||||
name, once a given module name is inside, every submodule of it will be sent to the same device.
|
||||
|
||||
To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
|
||||
max_memory (`Dict`, *optional*):
|
||||
A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
|
||||
and the available CPU RAM if unset.
|
||||
no_split_module_classes (`List[str]`, *optional*):
|
||||
A list of layer class names that should never be split across device (for instance any layer that has a
|
||||
residual connection).
|
||||
offload_folder (`str` or `os.PathLike`, *optional*):
|
||||
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
|
||||
well as the parameters.
|
||||
dtype (`str` or `torch.dtype`, *optional*):
|
||||
If provided, the weights will be converted to that type when loaded.
|
||||
offload_state_dict (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, will temporarily offload the CPU state dict on the hard drive to avoid running out of CPU RAM if
|
||||
the weight of the CPU state dict + the biggest shard does not fit.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
if device_map == "auto":
|
||||
device_map = infer_auto_device_map(
|
||||
model, max_memory=max_memory, no_split_module_classes=no_split_module_classes, dtype=dtype
|
||||
)
|
||||
load_checkpoint_in_model(
|
||||
model,
|
||||
checkpoint,
|
||||
device_map=device_map,
|
||||
offload_folder=offload_folder,
|
||||
dtype=dtype,
|
||||
offload_state_dict=offload_state_dict,
|
||||
)
|
||||
if device_map is None:
|
||||
return model
|
||||
return dispatch_model(
|
||||
model,
|
||||
device_map=device_map,
|
||||
offload_dir=offload_folder,
|
||||
offload_buffers=offload_buffers,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
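Putting the pieces of this file together, a hedged end-to-end sketch: save a tiny checkpoint, rebuild the architecture on the meta device, then load and dispatch it. With `device_map="auto"` the placement comes from `infer_auto_device_map`; as with `dispatch_model` above, this code path assumes at least one non-CPU device, so the example guards on a GPU being available.

```python
import torch
import torch.nn as nn
from accelerate.big_modeling import init_empty_weights, load_checkpoint_and_dispatch

# A self-contained stand-in for a real (possibly sharded) checkpoint.
reference = nn.Sequential(nn.Linear(32, 32), nn.Linear(32, 4))
torch.save(reference.state_dict(), "tiny_checkpoint.bin")

if torch.cuda.is_available():
    with init_empty_weights():
        model = nn.Sequential(nn.Linear(32, 32), nn.Linear(32, 4))
    model = load_checkpoint_and_dispatch(model, "tiny_checkpoint.bin", device_map="auto")
```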
@ -21,17 +21,25 @@ import numpy as np
|
||||
import torch
|
||||
from torch.cuda.amp import GradScaler
|
||||
|
||||
from .state import is_tpu_available
|
||||
from .utils import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME, get_pretty_name, save
|
||||
from .utils import (
|
||||
MODEL_NAME,
|
||||
OPTIMIZER_NAME,
|
||||
RNG_STATE_NAME,
|
||||
SCALER_NAME,
|
||||
SCHEDULER_NAME,
|
||||
get_pretty_name,
|
||||
is_tpu_available,
|
||||
save,
|
||||
)
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
import logging
|
||||
from .logging import get_logger
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def save_accelerator_state(
|
||||
@ -94,7 +102,7 @@ def save_accelerator_state(
|
||||
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_tpu_available():
|
||||
states["xm_seed"] = torch.tensor(xm.get_rng_state())
|
||||
states["xm_seed"] = xm.get_rng_state()
|
||||
output_states_file = os.path.join(output_dir, states_name)
|
||||
torch.save(states, output_states_file)
|
||||
logger.info(f"Random states saved in {output_states_file}")
|
||||
@ -108,7 +116,7 @@ def load_accelerator_state(input_dir, models, optimizers, schedulers, process_in
|
||||
Args:
|
||||
input_dir (`str` or `os.PathLike`):
|
||||
The name of the folder to load all relevant weights and states.
|
||||
model_stmodelsates (`List[torch.nn.Module]`):
|
||||
models (`List[torch.nn.Module]`):
|
||||
A list of model instances
|
||||
optimizers (`List[torch.optim.Optimizer]`):
|
||||
A list of optimizer instances
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from accelerate.state import ComputeEnvironment
|
||||
from accelerate.utils import ComputeEnvironment
|
||||
|
||||
from .cluster import get_cluster_input
|
||||
from .config_args import cache_dir, default_config_file, default_yaml_config_file, load_config_from_file # noqa: F401
|
||||
|
||||
@ -14,9 +14,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from accelerate.state import ComputeEnvironment, DistributedType
|
||||
|
||||
from ...utils import is_deepspeed_available
|
||||
from ...utils import ComputeEnvironment, DistributedType, is_deepspeed_available, is_transformers_available
|
||||
from ...utils.constants import (
|
||||
DEEPSPEED_MULTINODE_LAUNCHERS,
|
||||
FSDP_AUTO_WRAP_POLICY,
|
||||
FSDP_BACKWARD_PREFETCH,
|
||||
FSDP_SHARDING_STRATEGY,
|
||||
)
|
||||
from .config_args import ClusterConfig
|
||||
from .config_utils import _ask_field, _convert_distributed_mode, _convert_yes_no_to_bool
|
||||
|
||||
@ -64,7 +68,7 @@ def get_cluster_input():
|
||||
else:
|
||||
use_cpu = False
|
||||
|
||||
deepspeed_config = None
|
||||
deepspeed_config = {}
|
||||
if distributed_type in [DistributedType.MULTI_GPU, DistributedType.NO]:
|
||||
use_deepspeed = _ask_field(
|
||||
"Do you want to use DeepSpeed? [yes/NO]: ",
|
||||
@ -78,27 +82,120 @@ def get_cluster_input():
|
||||
is_deepspeed_available()
|
||||
), "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
|
||||
|
||||
deepspeed_config = {}
|
||||
if distributed_type == DistributedType.DEEPSPEED:
|
||||
deepspeed_config["zero_stage"] = _ask_field(
|
||||
"What should be your DeepSpeed's ZeRO optimization stage (0, 1, 2, 3)? [2]: ",
|
||||
lambda x: int(x),
|
||||
default=2,
|
||||
use_deepspeed_config = _ask_field(
|
||||
"Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
|
||||
if deepspeed_config["zero_stage"] >= 2:
|
||||
deepspeed_config["offload_optimizer_device"] = _ask_field(
|
||||
"Where to offload optimizer states? [NONE/cpu/nvme]: ",
|
||||
if use_deepspeed_config:
|
||||
deepspeed_config["deepspeed_config_file"] = _ask_field(
|
||||
"Please enter the path to the json DeepSpeed config file: ",
|
||||
lambda x: str(x),
|
||||
default="none",
|
||||
)
|
||||
else:
|
||||
deepspeed_config["zero_stage"] = _ask_field(
|
||||
"What should be your DeepSpeed's ZeRO optimization stage (0, 1, 2, 3)? [2]: ",
|
||||
lambda x: int(x),
|
||||
default=2,
|
||||
)
|
||||
|
||||
deepspeed_config["gradient_accumulation_steps"] = _ask_field(
|
||||
"How many gradient accumulation steps you're passing in your script? [1]: ",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
if deepspeed_config["zero_stage"] >= 2:
|
||||
deepspeed_config["offload_optimizer_device"] = _ask_field(
|
||||
"Where to offload optimizer states? [none/cpu/nvme]: ",
|
||||
lambda x: str(x),
|
||||
default="none",
|
||||
)
|
||||
deepspeed_config["offload_param_device"] = _ask_field(
|
||||
"Where to offload parameters? [none/cpu/nvme]: ",
|
||||
lambda x: str(x),
|
||||
default="none",
|
||||
)
|
||||
deepspeed_config["gradient_accumulation_steps"] = _ask_field(
|
||||
"How many gradient accumulation steps you're passing in your script? [1]: ",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
)
|
||||
use_gradient_clipping = _ask_field(
|
||||
"Do you want to use gradient clipping? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if use_gradient_clipping:
|
||||
deepspeed_config["gradient_clipping"] = _ask_field(
|
||||
"What is the gradient clipping value? [1.0]: ",
|
||||
lambda x: float(x),
|
||||
default=1.0,
|
||||
)
|
||||
if deepspeed_config["zero_stage"] == 3:
|
||||
deepspeed_config["zero3_save_16bit_model"] = _ask_field(
|
||||
"Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
deepspeed_config["zero3_init_flag"] = _ask_field(
|
||||
"Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if deepspeed_config["zero3_init_flag"]:
|
||||
if not is_transformers_available():
|
||||
raise Exception(
|
||||
"When `zero3_init_flag` is set, it requires Transformers to be installed. "
|
||||
"Please run `pip3 install transformers`."
|
||||
)
|
||||
|
||||
if num_machines > 1:
|
||||
launcher_query = "Which Type of launcher do you want to use "
|
||||
for i, launcher in enumerate(DEEPSPEED_MULTINODE_LAUNCHERS):
|
||||
launcher_query += f"[{i}] {launcher}, "
|
||||
launcher_query = launcher_query[:-2] + ")? [0]: "
|
||||
deepspeed_config["deepspeed_multinode_launcher"] = _ask_field(
|
||||
launcher_query,
|
||||
lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
|
||||
default=DEEPSPEED_MULTINODE_LAUNCHERS[0],
|
||||
)
|
||||
|
||||
if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
|
||||
deepspeed_config["deepspeed_hostfile"] = _ask_field(
|
||||
"DeepSpeed configures multi-node compute resources with hostfile. "
|
||||
"Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
|
||||
"for more information please refer official [documentation]"
|
||||
"(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
|
||||
"Please specify the location of hostfile: ",
|
||||
lambda x: str(x),
|
||||
)
|
||||
|
||||
is_exclusion_filter = _ask_field(
|
||||
"Do you want to specify exclusion filter string? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if is_exclusion_filter:
|
||||
deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
|
||||
"DeepSpeed exclusion filter string: ",
|
||||
lambda x: str(x),
|
||||
)
|
||||
|
||||
is_inclusion_filter = _ask_field(
|
||||
"Do you want to specify inclusion filter string? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
if is_inclusion_filter:
|
||||
deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
|
||||
"DeepSpeed inclusion filter string: ",
|
||||
lambda x: str(x),
|
||||
)
|
||||
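For reference, a hedged example of the kind of file the `deepspeed_config_file` prompt above expects. The keys follow DeepSpeed's JSON schema and mirror the individual questions asked here (ZeRO stage, optimizer offload, gradient accumulation and clipping); the values are purely illustrative.

```python
import json

ds_config = {
    "gradient_accumulation_steps": 1,
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```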
|
||||
fsdp_config = {}
|
||||
if distributed_type in [DistributedType.MULTI_GPU]:
|
||||
use_fsdp = _ask_field(
|
||||
"Do you want to use FullyShardedDataParallel? [yes/NO]: ",
|
||||
@ -108,10 +205,13 @@ def get_cluster_input():
|
||||
)
|
||||
if use_fsdp:
|
||||
distributed_type = DistributedType.FSDP
|
||||
fsdp_config = {}
|
||||
if distributed_type == DistributedType.FSDP:
|
||||
sharding_strategy_query = "What should be your sharding strategy ("
|
||||
for i, strategy in enumerate(FSDP_SHARDING_STRATEGY):
|
||||
sharding_strategy_query += f"[{i+1}] {strategy}, "
|
||||
sharding_strategy_query = sharding_strategy_query[:-2] + ")? [1]: "
|
||||
fsdp_config["sharding_strategy"] = _ask_field(
|
||||
"What should be your sharding strategy ([1] FULL_SHARD, [2] SHARD_GRAD_OP)? [1]: ",
|
||||
sharding_strategy_query,
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
)
|
||||
@ -121,10 +221,34 @@ def get_cluster_input():
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
fsdp_config["min_num_params"] = _ask_field(
|
||||
"What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
|
||||
lambda x: int(x),
|
||||
default=1e8,
|
||||
fsdp_wrap_query = "What should be your auto wrap policy ("
|
||||
for i, wrap_policy in enumerate(FSDP_AUTO_WRAP_POLICY):
|
||||
fsdp_wrap_query += f"[{i}] {wrap_policy}, "
|
||||
fsdp_wrap_query = fsdp_wrap_query[:-2] + ")? [0]: "
|
||||
fsdp_config["fsdp_auto_wrap_policy"] = _ask_field(
|
||||
fsdp_wrap_query,
|
||||
lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
|
||||
default=FSDP_AUTO_WRAP_POLICY[0],
|
||||
)
|
||||
if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
|
||||
fsdp_config["transformer_layer_cls_to_wrap"] = _ask_field(
|
||||
"What is the transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` ...? : ",
|
||||
lambda x: str(x),
|
||||
)
|
||||
elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
|
||||
fsdp_config["min_num_params"] = _ask_field(
|
||||
"What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
|
||||
lambda x: int(x),
|
||||
default=1e8,
|
||||
)
|
||||
fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy ("
|
||||
for i, backward_prefetch_policy in enumerate(FSDP_BACKWARD_PREFETCH):
|
||||
fsdp_backward_prefetch_query += f"[{i}] {backward_prefetch_policy}, "
|
||||
fsdp_backward_prefetch_query = fsdp_backward_prefetch_query[:-2] + ")? [0]: "
|
||||
fsdp_config["fsdp_backward_prefetch_policy"] = _ask_field(
|
||||
fsdp_backward_prefetch_query,
|
||||
lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
|
||||
default=FSDP_BACKWARD_PREFETCH[0],
|
||||
)
|
||||
|
||||
if distributed_type == DistributedType.TPU:
|
||||
@ -135,19 +259,37 @@ def get_cluster_input():
|
||||
else:
|
||||
main_training_function = "main"
|
||||
|
||||
num_processes = _ask_field(
|
||||
"How many processes in total will you use? [1]: ",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
error_message="Please enter an integer.",
|
||||
)
|
||||
if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU, DistributedType.TPU]:
|
||||
machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
|
||||
if machine_type == "TPU":
|
||||
machine_type += " cores"
|
||||
else:
|
||||
machine_type += "(s)"
|
||||
num_processes = _ask_field(
|
||||
f"How many {machine_type} should be used for distributed training? [1]:",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
error_message="Please enter an integer.",
|
||||
)
|
||||
elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED]:
|
||||
num_processes = _ask_field(
|
||||
"How many GPU(s) should be used for distributed training? [1]:",
|
||||
lambda x: int(x),
|
||||
default=1,
|
||||
error_message="Please enter an integer.",
|
||||
)
|
||||
else:
|
||||
num_processes = 1
|
||||
|
||||
if distributed_type != DistributedType.TPU:
|
||||
mixed_precision = _ask_field(
|
||||
"Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: ",
|
||||
lambda x: str(x).lower(),
|
||||
default="no",
|
||||
)
|
||||
if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
|
||||
mixed_precision = "no"
|
||||
else:
|
||||
mixed_precision = _ask_field(
|
||||
"Do you wish to use FP16 or BF16 (mixed precision)? [NO/fp16/bf16]: ",
|
||||
lambda x: str(x).lower(),
|
||||
default="no",
|
||||
)
|
||||
else:
|
||||
mixed_precision = "no"
|
||||
|
||||
|
||||
@ -21,7 +21,9 @@ from enum import Enum
|
||||
from typing import Optional, Union
|
||||
|
||||
import yaml
|
||||
from accelerate.state import ComputeEnvironment, DistributedType, SageMakerDistributedType
|
||||
|
||||
from ...utils import ComputeEnvironment, DistributedType, SageMakerDistributedType
|
||||
from ...utils.constants import SAGEMAKER_PYTHON_VERSION, SAGEMAKER_PYTORCH_VERSION, SAGEMAKER_TRANSFORMERS_VERSION
|
||||
|
||||
|
||||
hf_cache_home = os.path.expanduser(
|
||||
@ -122,7 +124,10 @@ class BaseConfig:
|
||||
if isinstance(self.compute_environment, str):
|
||||
self.compute_environment = ComputeEnvironment(self.compute_environment)
|
||||
if isinstance(self.distributed_type, str):
|
||||
self.distributed_type = DistributedType(self.distributed_type)
|
||||
if self.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER:
|
||||
self.distributed_type = SageMakerDistributedType(self.distributed_type)
|
||||
else:
|
||||
self.distributed_type = DistributedType(self.distributed_type)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -139,14 +144,25 @@ class ClusterConfig(BaseConfig):
|
||||
# args for fsdp
|
||||
fsdp_config: dict = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.deepspeed_config is None:
|
||||
self.deepspeed_config = {}
|
||||
if self.fsdp_config is None:
|
||||
self.fsdp_config = {}
|
||||
return super().__post_init__()
|
||||
|
||||
|
||||
@dataclass
|
||||
class SageMakerConfig(BaseConfig):
|
||||
ec2_instance_type: str
|
||||
iam_role_name: str
|
||||
image_uri: str
|
||||
profile: Optional[str] = None
|
||||
region: str = "us-east-1"
|
||||
num_machines: int = 1
|
||||
base_job_name: str = f"accelerate-sagemaker-{num_machines}"
|
||||
pytorch_version: str = "1.6"
|
||||
transformers_version: str = "4.4"
|
||||
pytorch_version: str = SAGEMAKER_PYTORCH_VERSION
|
||||
transformers_version: str = SAGEMAKER_TRANSFORMERS_VERSION
|
||||
py_version: str = SAGEMAKER_PYTHON_VERSION
|
||||
sagemaker_inputs_file: str = None
|
||||
sagemaker_metrics_file: str = None
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from accelerate.state import ComputeEnvironment, DistributedType, SageMakerDistributedType
|
||||
from ...utils.dataclasses import ComputeEnvironment, DistributedType, SageMakerDistributedType
|
||||
|
||||
|
||||
def _ask_field(input_text, convert_value=None, default=None, error_message=None):
|
||||
|
||||
@ -16,11 +16,11 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from accelerate.state import ComputeEnvironment, SageMakerDistributedType
|
||||
from accelerate.utils import is_boto3_available
|
||||
|
||||
from ...utils.constants import SAGEMAKER_PARALLEL_EC2_INSTANCES
|
||||
from ...utils.dataclasses import ComputeEnvironment, SageMakerDistributedType
|
||||
from ...utils.imports import is_boto3_available
|
||||
from .config_args import SageMakerConfig
|
||||
from .config_utils import _ask_field, _convert_sagemaker_distributed_mode
|
||||
from .config_utils import _ask_field, _convert_sagemaker_distributed_mode, _convert_yes_no_to_bool
|
||||
|
||||
|
||||
if is_boto3_available():
|
||||
@ -120,24 +120,68 @@ def get_sagemaker_input():
|
||||
print(f'Accelerate will create an iam role "{iam_role_name}" using the provided credentials')
|
||||
_create_iam_role_for_sagemaker(iam_role_name)
|
||||
|
||||
is_custom_docker_image = _ask_field(
|
||||
"Do you want to use custom Docker image? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
docker_image = None
|
||||
if is_custom_docker_image:
|
||||
docker_image = _ask_field("Enter your Docker image: ", lambda x: str(x).lower())
|
||||
|
||||
is_sagemaker_inputs_enabled = _ask_field(
|
||||
"Do you want to provide SageMaker input channels with data locations? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
sagemaker_inputs_file = None
|
||||
if is_sagemaker_inputs_enabled:
|
||||
sagemaker_inputs_file = _ask_field(
|
||||
"Enter the path to the SageMaker inputs TSV file with columns (channel_name, data_location): ",
|
||||
lambda x: str(x).lower(),
|
||||
)
|
||||
|
||||
is_sagemaker_metrics_enabled = _ask_field(
|
||||
"Do you want to enable SageMaker metrics? [yes/NO]: ",
|
||||
_convert_yes_no_to_bool,
|
||||
default=False,
|
||||
error_message="Please enter yes or no.",
|
||||
)
|
||||
sagemaker_metrics_file = None
|
||||
if is_sagemaker_metrics_enabled:
|
||||
sagemaker_metrics_file = _ask_field(
|
||||
"Enter the path to the SageMaker metrics TSV file with columns (metric_name, metric_regex): ",
|
||||
lambda x: str(x).lower(),
|
||||
)
|
||||
|
||||
distributed_type = _ask_field(
|
||||
"Which type of machine are you using? ([0] No distributed training, [1] data parallelism, [2] model parallelism): ",
|
||||
"Which type of machine are you using? ([0] No distributed training, [1] data parallelism): ",
|
||||
_convert_sagemaker_distributed_mode,
|
||||
error_message="Please enter 0, 1 or 2",
|
||||
error_message="Please enter 0 or 1",
|
||||
)
|
||||
|
||||
# using the best two instances for single-gpu training or multi-gpu -> can turn into question to make it more diverse
|
||||
ec2_instance_type = "ml.p3.2xlarge" if distributed_type == SageMakerDistributedType.NO else "ml.p3dn.24xlarge"
|
||||
ec2_instance_query = "Which EC2 instance type you want to use for your training "
|
||||
if distributed_type != SageMakerDistributedType.NO:
|
||||
ec2_instance_query += "("
|
||||
for i, instance_type in enumerate(SAGEMAKER_PARALLEL_EC2_INSTANCES):
|
||||
ec2_instance_query += f"[{i}] {instance_type}, "
|
||||
ec2_instance_query = ec2_instance_query[:-2] + ")? [0]: "
|
||||
ec2_instance_type = _ask_field(ec2_instance_query, lambda x: SAGEMAKER_PARALLEL_EC2_INSTANCES[int(x)])
|
||||
else:
|
||||
ec2_instance_query += "? [ml.p3.2xlarge]:"
|
||||
ec2_instance_type = _ask_field(ec2_instance_query, lambda x: str(x).lower(), default="ml.p3.2xlarge")
|
||||
|
||||
num_machines = 1
|
||||
if (
|
||||
distributed_type == SageMakerDistributedType.DATA_PARALLEL
|
||||
or distributed_type == SageMakerDistributedType.MODEL_PARALLEL
|
||||
):
|
||||
raise NotImplementedError("Model or Data Parallelism is not implemented yet. We are working on it")
|
||||
num_machines = _ask_field(
|
||||
"How many machines do you want use? [2]: ",
|
||||
"How many machines do you want use? [1]: ",
|
||||
lambda x: int(x),
|
||||
default=2,
|
||||
default=1,
|
||||
)
|
||||
|
||||
mixed_precision = _ask_field(
|
||||
@ -147,12 +191,16 @@ def get_sagemaker_input():
|
||||
)
|
||||
|
||||
return SageMakerConfig(
|
||||
image_uri=docker_image,
|
||||
compute_environment=ComputeEnvironment.AMAZON_SAGEMAKER,
|
||||
distributed_type=distributed_type,
|
||||
use_cpu=False,
|
||||
ec2_instance_type=ec2_instance_type,
|
||||
profile=aws_profile,
|
||||
region=aws_region,
|
||||
iam_role_name=iam_role_name,
|
||||
mixed_precision=mixed_precision,
|
||||
num_machines=num_machines,
|
||||
sagemaker_inputs_file=sagemaker_inputs_file,
|
||||
sagemaker_metrics_file=sagemaker_metrics_file,
|
||||
)
|
||||
|
||||
@ -26,8 +26,17 @@ from typing import Dict, List
|
||||
|
||||
from accelerate.commands.config import default_config_file, load_config_from_file
|
||||
from accelerate.commands.config.config_args import SageMakerConfig
|
||||
from accelerate.state import ComputeEnvironment, DistributedType
|
||||
from accelerate.utils import PrecisionType, PrepareForLaunch, is_sagemaker_available
|
||||
from accelerate.utils import (
|
||||
ComputeEnvironment,
|
||||
DistributedType,
|
||||
PrecisionType,
|
||||
PrepareForLaunch,
|
||||
get_launch_prefix,
|
||||
is_deepspeed_available,
|
||||
is_sagemaker_available,
|
||||
)
|
||||
from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS
|
||||
from accelerate.utils.dataclasses import SageMakerDistributedType
|
||||
|
||||
|
||||
def launch_command_parser(subparsers=None):
|
||||
@ -51,6 +60,80 @@ def launch_command_parser(subparsers=None):
|
||||
action="store_true",
|
||||
help="Whether to use deepspeed.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepspeed_config_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="DeepSpeed config file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zero_stage",
|
||||
default=None,
|
||||
type=int,
|
||||
help="DeepSpeed's ZeRO optimization stage (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--offload_optimizer_device",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Decides where (none|cpu|nvme) to offload optimizer states (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--offload_param_device",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Decides where (none|cpu|nvme) to offload parameters (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
default=None,
|
||||
type=int,
|
||||
help="No of gradient_accumulation_steps used in your training script (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_clipping",
|
||||
default=None,
|
||||
type=float,
|
||||
help="gradient clipping value used in your training script (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zero3_init_flag",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. "
|
||||
"Only applicable with DeepSpeed ZeRO Stage-3.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zero3_save_16bit_model",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Decides Whether (true|false) to save 16-bit model weights when using ZeRO Stage-3. "
|
||||
"Only applicable with DeepSpeed ZeRO Stage-3.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepspeed_hostfile",
|
||||
default=None,
|
||||
type=str,
|
||||
help="DeepSpeed hostfile for configuring multi-node compute resources.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepspeed_exclusion_filter",
|
||||
default=None,
|
||||
type=str,
|
||||
help="DeepSpeed exclusion filter string when using mutli-node setup.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepspeed_inclusion_filter",
|
||||
default=None,
|
||||
type=str,
|
||||
help="DeepSpeed inclusion filter string when using mutli-node setup.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepspeed_multinode_launcher",
|
||||
default=None,
|
||||
type=str,
|
||||
help="DeepSpeed multi-node launcher to use.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_fsdp",
|
||||
default=False,
|
||||
@ -75,6 +158,25 @@ def launch_command_parser(subparsers=None):
|
||||
default=1,
|
||||
help="FSDP's Sharding Strategy. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fsdp_auto_wrap_policy",
|
||||
type=str,
|
||||
default=None,
|
||||
help="FSDP's auto wrap policy. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--transformer_layer_cls_to_wrap",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` .... "
|
||||
"(useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fsdp_backward_prefetch_policy",
|
||||
default=None,
|
||||
type=str,
|
||||
help="FSDP's backward prefetch policy. (useful only when `use_fsdp` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tpu", default=False, action="store_true", help="Whether or not this should launch a TPU training."
|
||||
)
|
||||
@ -126,6 +228,12 @@ def launch_command_parser(subparsers=None):
|
||||
action="store_true",
|
||||
help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_cpu_threads_per_process",
|
||||
type=int,
|
||||
default=1,
|
||||
help="The number of CPU threads per process. Can be tuned for optimal performance.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--aws_access_key_id",
|
||||
type=str,
|
||||
@ -146,24 +254,6 @@ def launch_command_parser(subparsers=None):
|
||||
"script."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--zero_stage",
|
||||
default=None,
|
||||
type=int,
|
||||
help="DeepSpeed's ZeRO optimization stage (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--offload_optimizer_device",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Decides where (none|cpu|nvme) to offload optimizer states (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
default=None,
|
||||
type=int,
|
||||
help="No of gradient_accumulation_steps used in your training script (useful only when `use_deepspeed` flag is passed).",
|
||||
)
|
||||
|
||||
# Other arguments of the training scripts
|
||||
parser.add_argument("training_script_args", nargs=argparse.REMAINDER, help="Arguments of the training script.")
|
||||
@ -206,7 +296,7 @@ def simple_launcher(args):
|
||||
|
||||
|
||||
def multi_gpu_launcher(args):
|
||||
cmd = [sys.executable, "-m", "torch.distributed.launch", "--use_env"]
|
||||
cmd = get_launch_prefix()
|
||||
if args.num_machines > 1:
|
||||
cmd.extend(
|
||||
[
|
||||
@ -251,9 +341,13 @@ def multi_gpu_launcher(args):
|
||||
current_env["MIXED_PRECISION"] = str(mixed_precision)
|
||||
if args.use_fsdp:
|
||||
current_env["USE_FSDP"] = "true"
|
||||
current_env["FSDP_AUTO_WRAP_POLICY"] = str(args.fsdp_auto_wrap_policy)
|
||||
current_env["FSDP_TRANSFORMER_CLS_TO_WRAP"] = str(args.transformer_layer_cls_to_wrap)
|
||||
current_env["FSDP_OFFLOAD_PARAMS"] = str(args.offload_params).lower()
|
||||
current_env["FSDP_MIN_NUM_PARAMS"] = str(args.min_num_params)
|
||||
current_env["FSDP_SHARDING_STRATEGY"] = str(args.sharding_strategy)
|
||||
current_env["FSDP_BACKWARD_PREFETCH"] = str(args.fsdp_backward_prefetch_policy)
|
||||
current_env["OMP_NUM_THREADS"] = str(args.num_cpu_threads_per_process)
|
||||
process = subprocess.Popen(cmd, env=current_env)
|
||||
process.wait()
|
||||
if process.returncode != 0:
|
||||
@ -261,22 +355,46 @@ def multi_gpu_launcher(args):
|
||||
|
||||
|
||||
def deepspeed_launcher(args):
|
||||
if not is_deepspeed_available():
|
||||
raise ImportError("DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source.")
|
||||
cmd = ["deepspeed", "--no_local_rank"]
|
||||
if args.num_machines > 1:
|
||||
cmd.extend(
|
||||
[
|
||||
"--num_gpus",
|
||||
str(args.num_processes // args.num_machines),
|
||||
"--num_nodes",
|
||||
str(args.num_machines),
|
||||
"--node_rank",
|
||||
str(args.machine_rank),
|
||||
"--master_addr",
|
||||
args.main_process_ip,
|
||||
"--master_port",
|
||||
str(args.main_process_port),
|
||||
]
|
||||
)
|
||||
if args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
|
||||
cmd = get_launch_prefix()
|
||||
cmd.extend(
|
||||
[
|
||||
"--nproc_per_node",
|
||||
str(args.num_processes // args.num_machines),
|
||||
"--nnodes",
|
||||
str(args.num_machines),
|
||||
"--node_rank",
|
||||
str(args.machine_rank),
|
||||
"--master_addr",
|
||||
args.main_process_ip,
|
||||
"--master_port",
|
||||
str(args.main_process_port),
|
||||
]
|
||||
)
|
||||
else:
|
||||
cmd.extend(
|
||||
["--hostfile", str(args.deepspeed_hostfile), "--launcher", str(args.deepspeed_multinode_launcher)]
|
||||
)
|
||||
if args.deepspeed_exclusion_filter is not None:
|
||||
cmd.extend(
|
||||
[
|
||||
"--exclude",
|
||||
str(args.deepspeed_exclusion_filter),
|
||||
]
|
||||
)
|
||||
elif args.deepspeed_inclusion_filter is not None:
|
||||
cmd.extend(
|
||||
[
|
||||
"--include",
|
||||
str(args.deepspeed_inclusion_filter),
|
||||
]
|
||||
)
|
||||
else:
|
||||
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
|
||||
else:
|
||||
cmd.extend(["--num_gpus", str(args.num_processes)])
|
||||
|
||||
@ -301,11 +419,24 @@ def deepspeed_launcher(args):
|
||||
warnings.warn('--fp16 flag is deprecated. Use "--mixed_precision fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
|
||||
current_env["PYTHONPATH"] = sys.executable
|
||||
current_env["MIXED_PRECISION"] = str(mixed_precision)
|
||||
current_env["USE_DEEPSPEED"] = "true"
|
||||
current_env["DEEPSPEED_ZERO_STAGE"] = str(args.zero_stage)
|
||||
current_env["GRADIENT_ACCUMULATION_STEPS"] = str(args.gradient_accumulation_steps)
|
||||
current_env["DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE"] = str(args.offload_optimizer_device)
|
||||
current_env["GRADIENT_CLIPPING"] = str(args.gradient_clipping).lower()
|
||||
current_env["DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE"] = str(args.offload_optimizer_device).lower()
|
||||
current_env["DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
|
||||
current_env["DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
|
||||
current_env["DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
|
||||
current_env["DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file).lower()
|
||||
|
||||
if args.num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
|
||||
with open(".deepspeed_env", "a") as f:
|
||||
for key, value in current_env.items():
|
||||
if ";" in value or " " in value:
|
||||
continue
|
||||
f.write(f"{key}={value}\n")
|
||||
|
||||
process = subprocess.Popen(cmd, env=current_env)
|
||||
process.wait()
|
||||
@ -433,19 +564,56 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
|
||||
mixed_precision = "fp16"
|
||||
|
||||
# Environment variables to be set for use during training job
|
||||
environment = {"MIXED_PRECISION": str(mixed_precision)}
|
||||
environment = {
|
||||
"USE_SAGEMAKER": "true",
|
||||
"MIXED_PRECISION": str(mixed_precision),
|
||||
"SAGEMAKER_DISTRIBUTED_TYPE": sagemaker_config.distributed_type.value,
|
||||
}
|
||||
# configure distribution set up
|
||||
distribution = None # TODO: not yet implemented
|
||||
distribution = None
|
||||
if sagemaker_config.distributed_type == SageMakerDistributedType.DATA_PARALLEL:
|
||||
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
|
||||
|
||||
# configure sagemaker inputs
|
||||
sagemaker_inputs = None
|
||||
if sagemaker_config.sagemaker_inputs_file is not None:
|
||||
print(f"Loading SageMaker Inputs from {sagemaker_config.sagemaker_inputs_file} file")
|
||||
sagemaker_inputs = {}
|
||||
with open(sagemaker_config.sagemaker_inputs_file) as file:
|
||||
for i, line in enumerate(file):
|
||||
if i == 0:
|
||||
continue
|
||||
l = line.split("\t")
|
||||
sagemaker_inputs[l[0]] = l[1].strip()
|
||||
print(f"Loaded SageMaker Inputs: {sagemaker_inputs}")
|
||||
|
||||
# configure sagemaker metrics
|
||||
sagemaker_metrics = None
|
||||
if sagemaker_config.sagemaker_metrics_file is not None:
|
||||
print(f"Loading SageMaker Metrics from {sagemaker_config.sagemaker_metrics_file} file")
|
||||
sagemaker_metrics = []
|
||||
with open(sagemaker_config.sagemaker_metrics_file) as file:
|
||||
for i, line in enumerate(file):
|
||||
if i == 0:
|
||||
continue
|
||||
l = line.split("\t")
|
||||
metric_dict = {
|
||||
"Name": l[0],
|
||||
"Regex": l[1].strip(),
|
||||
}
|
||||
sagemaker_metrics.append(metric_dict)
|
||||
print(f"Loaded SageMaker Metrics: {sagemaker_metrics}")
|
||||
|
||||
# configure session
|
||||
print("Creating Estimator")
|
||||
huggingface_estimator = HuggingFace(
|
||||
image_uri=sagemaker_config.image_uri,
|
||||
entry_point=entry_point,
|
||||
source_dir=source_dir,
|
||||
role=sagemaker_config.iam_role_name,
|
||||
transformers_version="4.4",
|
||||
pytorch_version="1.6",
|
||||
py_version="py36",
|
||||
transformers_version=sagemaker_config.transformers_version,
|
||||
pytorch_version=sagemaker_config.pytorch_version,
|
||||
py_version=sagemaker_config.py_version,
|
||||
base_job_name=sagemaker_config.base_job_name,
|
||||
instance_count=sagemaker_config.num_machines,
|
||||
instance_type=sagemaker_config.ec2_instance_type,
|
||||
@ -453,9 +621,10 @@ def sagemaker_launcher(sagemaker_config: SageMakerConfig, args):
|
||||
distribution=distribution,
|
||||
hyperparameters=hyperparameters,
|
||||
environment=environment,
|
||||
metric_definitions=sagemaker_metrics,
|
||||
)
|
||||
|
||||
huggingface_estimator.fit()
|
||||
huggingface_estimator.fit(inputs=sagemaker_inputs)
|
||||
print(f"You can find your model data at: {huggingface_estimator.model_data}")
|
||||
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@ def test_command_parser(subparsers=None):
|
||||
|
||||
|
||||
def test_command(args):
|
||||
script_name = os.path.sep.join(__file__.split(os.path.sep)[:-2] + ["test_utils", "test_script.py"])
|
||||
script_name = os.path.sep.join(__file__.split(os.path.sep)[:-2] + ["test_utils", "scripts", "test_script.py"])
|
||||
|
||||
test_args = f"""
|
||||
--config_file={args.config_file} {script_name}
|
||||
|
||||
@ -18,9 +18,8 @@ from typing import List, Optional, Union
|
||||
import torch
|
||||
from torch.utils.data import BatchSampler, DataLoader, IterableDataset
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState, DistributedType, is_tpu_available
|
||||
from .logging import get_logger
|
||||
from .state import AcceleratorState, DistributedType, GradientState, is_tpu_available
|
||||
from .utils import (
|
||||
RNGType,
|
||||
broadcast,
|
||||
@ -29,15 +28,33 @@ from .utils import (
|
||||
find_batch_size,
|
||||
get_data_structure,
|
||||
initialize_tensors,
|
||||
is_torch_version,
|
||||
send_to_device,
|
||||
slice_tensors,
|
||||
synchronize_rng_states,
|
||||
)
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.distributed.parallel_loader as xpl
|
||||
|
||||
class MpDeviceLoaderWrapper(xpl.MpDeviceLoader):
|
||||
"""
|
||||
Wrapper for the xpl.MpDeviceLoader class that knows the total batch size.
|
||||
|
||||
**Available attributes:**
|
||||
|
||||
- **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
|
||||
Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
|
||||
number of processes
|
||||
"""
|
||||
|
||||
@property
|
||||
def total_batch_size(self):
|
||||
return self._loader.total_batch_size
|
||||
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# kwargs of the DataLoader in min version 1.4.0.
|
||||
_PYTORCH_DATALOADER_KWARGS = {
|
||||
@ -61,7 +78,7 @@ _PYTORCH_DATALOADER_ADDITIONAL_KWARGS = {
|
||||
}
|
||||
|
||||
for v, additional_kwargs in _PYTORCH_DATALOADER_ADDITIONAL_KWARGS.items():
|
||||
if version.parse(torch.__version__) >= version.parse(v):
|
||||
if is_torch_version(">=", v):
|
||||
_PYTORCH_DATALOADER_KWARGS.update(additional_kwargs)
|
||||
|
||||
|
||||
@ -288,6 +305,12 @@ class DataLoaderShard(DataLoader):
|
||||
A random number generator to keep synchronized across processes.
|
||||
kwargs:
|
||||
All other keyword arguments to pass to the regular `DataLoader` initialization.
|
||||
|
||||
**Available attributes:**
|
||||
|
||||
- **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
|
||||
Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
|
||||
number of processes
|
||||
"""
|
||||
|
||||
def __init__(self, dataset, device=None, rng_types=None, generator=None, **kwargs):
|
||||
@ -295,123 +318,176 @@ class DataLoaderShard(DataLoader):
|
||||
self.device = device
|
||||
self.rng_types = rng_types
|
||||
self.generator = generator
|
||||
self.gradient_state = GradientState()
|
||||
|
||||
def __iter__(self):
|
||||
if self.rng_types is not None:
|
||||
synchronize_rng_states(self.rng_types, self.generator)
|
||||
state = AcceleratorState()
|
||||
for batch in super().__iter__():
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
xm.mark_step()
|
||||
yield batch if self.device is None else send_to_device(batch, self.device)
|
||||
self.gradient_state._set_end_of_dataloader(False)
|
||||
dataloader_iter = super().__iter__()
|
||||
# We iterate one batch ahead to check when we are at the end
|
||||
try:
|
||||
current_batch = next(dataloader_iter)
|
||||
except StopIteration:
|
||||
yield
|
||||
while True:
|
||||
try:
|
||||
# But we still move it to the device so it is done before `StopIteration` is reached
|
||||
if self.device is not None:
|
||||
current_batch = send_to_device(current_batch, self.device)
|
||||
next_batch = next(dataloader_iter)
|
||||
yield current_batch
|
||||
current_batch = next_batch
|
||||
except StopIteration:
|
||||
self.gradient_state._set_end_of_dataloader(True)
|
||||
yield current_batch
|
||||
break
|
||||
|
||||
@property
|
||||
def total_batch_size(self):
|
||||
return (
|
||||
self.batch_sampler.batch_size
|
||||
if self.batch_sampler.split_batches
|
||||
else (self.batch_sampler.batch_size * self.batch_sampler.num_processes)
|
||||
)
|
||||
|
||||
|
||||
class DataLoaderDispatcher(DataLoader):
|
||||
"""
|
||||
Args:
|
||||
Subclass of a PyTorch `DataLoader` that will iterate and preprocess on process 0 only, then dispatch on each
|
||||
process their part of the batch.
|
||||
|
||||
Args:
|
||||
split_batches (`bool`, *optional*, defaults to `False`):
|
||||
Whether the resulting `DataLoader` should split the batches of the original data loader across devices or
|
||||
yield full batches (in which case it will yield batches starting at the `process_index`-th and advancing of
|
||||
`num_processes` batches at each iteration).
|
||||
`num_processes` batches at each iteration). Another way to see this is that the observed batch size will be
|
||||
the same as the initial `dataloader` if this option is set to `True`, the batch size of the initial
|
||||
`dataloader` multiplied by `num_processes` otherwise. Setting this option to `True` requires that the batch
|
||||
size of the `dataloader` is a round multiple of `batch_size`.
|
||||
|
||||
Another way to see this is that the observed batch size will be the same as the initial `dataloader` if
|
||||
this option is set to `True`, the batch size of the initial `dataloader` multiplied by `num_processes`
|
||||
otherwise.
|
||||
**Available attributes:**
|
||||
|
||||
Setting this option to `True` requires that the batch size of the `dataloader` is a round multiple of
|
||||
`batch_size`.
|
||||
- **total_batch_size** (`int`) -- Total batch size of the dataloader across all processes.
|
||||
Equal to the original batch size when `split_batches=True`; otherwise the original batch size * the total
|
||||
number of processes
|
||||
"""
|
||||
|
||||
def __init__(self, dataset, split_batches: bool = False, **kwargs):
|
||||
shuffle = False
|
||||
if is_torch_version(">=", "1.11.0"):
|
||||
from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe
|
||||
|
||||
# We need to save the shuffling state of the DataPipe
|
||||
if isinstance(dataset, ShufflerIterDataPipe):
|
||||
shuffle = dataset._shuffle_enabled
|
||||
super().__init__(dataset, **kwargs)
|
||||
self.split_batches = split_batches
|
||||
if version.parse(torch.__version__) < version.parse("1.8.0"):
|
||||
if is_torch_version("<", "1.8.0"):
|
||||
raise ImportError(
|
||||
"Using `DataLoaderDispatcher` requires PyTorch 1.8.0 minimum. You have {torch.__version__}."
|
||||
)
|
||||
if shuffle:
|
||||
torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)
|
||||
|
||||
self.gradient_state = GradientState()
|
||||
self.state = AcceleratorState()
|
||||
|
||||
def _fetch_batches(self, iterator):
|
||||
batches, batch = None, None
|
||||
# On process 0, we gather the batch to dispatch.
|
||||
if self.state.process_index == 0:
|
||||
try:
|
||||
if self.split_batches:
|
||||
# One batch of the main iterator is dispatched and split.
|
||||
batch = next(iterator)
|
||||
else:
|
||||
# num_processes batches of the main iterator are concatenated then dispatched and split.
|
||||
# We add the batches one by one so we have the remainder available when drop_last=False.
|
||||
batches = []
|
||||
for _ in range(self.state.num_processes):
|
||||
batches.append(next(iterator))
|
||||
batch = concatenate(batches, dim=0)
|
||||
# In both cases, we need to get the structure of the batch that we will broadcast on other
|
||||
# processes to initialize the tensors with the right shape.
|
||||
# data_structure, stop_iteration
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
except StopIteration:
|
||||
batch_info = [None, True]
|
||||
else:
|
||||
batch_info = [None, self._stop_iteration]
|
||||
# This is inplace, so after this instruction, every process has the same `batch_info` as process 0.
|
||||
broadcast_object_list(batch_info)
|
||||
self._stop_iteration = batch_info[1]
|
||||
if self._stop_iteration:
|
||||
# If drop_last is False and split_batches is False, we may have a remainder to take care of.
|
||||
if not self.split_batches and not self.drop_last:
|
||||
if self.state.process_index == 0 and len(batches) > 0:
|
||||
batch = concatenate(batches, dim=0)
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
else:
|
||||
batch_info = [None, True]
|
||||
broadcast_object_list(batch_info)
|
||||
if batch_info[1]:
|
||||
return batch, batch_info, True
|
||||
else:
|
||||
return batch, batch_info, True
|
||||
return batch, batch_info, False
|
||||
|
||||
def __iter__(self):
|
||||
state = AcceleratorState()
|
||||
if state.process_index == 0:
|
||||
self.gradient_state._set_end_of_dataloader(False)
|
||||
main_iterator = None
|
||||
if self.state.process_index == 0:
|
||||
# We only iterate through the DataLoader on process 0.
|
||||
main_iterator = super().__iter__()
|
||||
stop_iteration = False
|
||||
self._stop_iteration = False
|
||||
first_batch = None
|
||||
while not stop_iteration:
|
||||
# On process 0, we gather the batch to dispatch.
|
||||
if state.process_index == 0:
|
||||
try:
|
||||
if self.split_batches:
|
||||
# One batch of the main iterator is dispatched and split.
|
||||
batch = next(main_iterator)
|
||||
else:
|
||||
# num_processes batches of the main iterator are concatenated then dispatched and split.
|
||||
# We add the batches one by one so we have the remainder available when drop_last=False.
|
||||
batches = []
|
||||
for _ in range(state.num_processes):
|
||||
batches.append(next(main_iterator))
|
||||
batch = concatenate(batches, dim=0)
|
||||
# In both cases, we need to get the structure of the batch that we will broadcast on other
|
||||
# processes to initialize the tensors with the right shape.
|
||||
# data_structure, stop_iteration
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
except StopIteration:
|
||||
batch_info = [None, True]
|
||||
else:
|
||||
batch_info = [None, stop_iteration]
|
||||
|
||||
# This is inplace, so after this instruction, every process has the same `batch_info` as process 0.
|
||||
broadcast_object_list(batch_info)
|
||||
stop_iteration = batch_info[1]
|
||||
if stop_iteration:
|
||||
# If drop_last is False and split_batches is False, we may have a remainder to take care of.
|
||||
if not self.split_batches and not self.drop_last:
|
||||
if state.process_index == 0 and len(batches) > 0:
|
||||
batch = concatenate(batches, dim=0)
|
||||
batch_info = [get_data_structure(batch), False]
|
||||
else:
|
||||
batch_info = [None, True]
|
||||
broadcast_object_list(batch_info)
|
||||
if batch_info[1]:
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
if state.process_index != 0:
|
||||
batch, batch_info, skip = self._fetch_batches(main_iterator)
|
||||
while True:
|
||||
if skip:
|
||||
continue
|
||||
if self.state.process_index != 0:
|
||||
# Initialize tensors on other processes than process 0.
|
||||
batch = initialize_tensors(batch_info[0])
|
||||
batch = send_to_device(batch, state.device)
|
||||
batch = send_to_device(batch, self.state.device)
|
||||
# Broadcast the batch before splitting it.
|
||||
batch = broadcast(batch, from_process=0)
|
||||
|
||||
if not self.drop_last and first_batch is None:
|
||||
# We keep at least num processes elements of the first batch to be able to complete the last batch
|
||||
first_batch = slice_tensors(batch, slice(0, state.num_processes))
|
||||
first_batch = slice_tensors(batch, slice(0, self.state.num_processes))
|
||||
|
||||
observed_batch_size = find_batch_size(batch)
|
||||
batch_size = observed_batch_size // state.num_processes
|
||||
batch_size = observed_batch_size // self.state.num_processes
|
||||
|
||||
if not self.drop_last and stop_iteration and observed_batch_size % state.num_processes != 0:
|
||||
if not self.drop_last and self._stop_iteration and observed_batch_size % self.state.num_processes != 0:
|
||||
# If the last batch is not complete, let's add the first batch to it.
|
||||
batch = concatenate([batch, first_batch], dim=0)
|
||||
batch_size += 1
|
||||
|
||||
data_slice = slice(state.process_index * batch_size, (state.process_index + 1) * batch_size)
|
||||
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
xm.mark_step()
|
||||
yield slice_tensors(batch, data_slice)
|
||||
data_slice = slice(self.state.process_index * batch_size, (self.state.process_index + 1) * batch_size)
|
||||
next_batch, next_batch_info, next_skip = self._fetch_batches(main_iterator)
|
||||
if not self._stop_iteration:
|
||||
yield slice_tensors(batch, data_slice)
|
||||
batch, batch_info, skip = next_batch, next_batch_info, next_skip
|
||||
else:
|
||||
self.gradient_state._set_end_of_dataloader(True)
|
||||
yield slice_tensors(batch, data_slice)
|
||||
break
|
||||
|
||||
def __len__(self):
|
||||
state = AcceleratorState()
|
||||
whole_length = super().__len__()
|
||||
if self.drop_last:
|
||||
return whole_length // state.num_processes
|
||||
if self.split_batches:
|
||||
return whole_length
|
||||
elif self.drop_last:
|
||||
return whole_length // self.state.num_processes
|
||||
else:
|
||||
return math.ceil(whole_length / state.num_processes)
|
||||
return math.ceil(whole_length / self.state.num_processes)
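The `__len__` rules above reduce to simple integer arithmetic; for example, with 10 batches in the wrapped loader and 4 processes (illustrative numbers):

```python
import math

# Length of the dispatched dataloader under the three branches above.
whole_length, num_processes = 10, 4
print(whole_length)                             # split_batches=True            -> 10
print(whole_length // num_processes)            # drop_last=True                -> 2
print(math.ceil(whole_length / num_processes))  # drop_last=False, no splitting -> 3
```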
|
||||
|
||||
@property
|
||||
def total_batch_size(self):
|
||||
return (
|
||||
self.dataset.batch_size if self.split_batches else (self.dataset.batch_size * self.dataset.num_processes)
|
||||
)
|
||||
|
||||
|
||||
def prepare_data_loader(
|
||||
@ -478,7 +554,7 @@ def prepare_data_loader(
|
||||
|
||||
</Tip>"""
|
||||
if dispatch_batches is None:
|
||||
if version.parse(torch.__version__) < version.parse("1.8.0") or not put_on_device:
|
||||
if is_torch_version("<", "1.8.0") or not put_on_device:
|
||||
dispatch_batches = False
|
||||
else:
|
||||
dispatch_batches = isinstance(dataloader.dataset, IterableDataset)
|
||||
@ -557,15 +633,22 @@ def prepare_data_loader(
|
||||
kwargs["batch_size"] = dataloader.batch_size // num_processes if split_batches else dataloader.batch_size
|
||||
|
||||
if dispatch_batches:
|
||||
return DataLoaderDispatcher(
|
||||
new_dataset, split_batches=split_batches, batch_sampler=new_batch_sampler, **kwargs
|
||||
dataloader = DataLoaderDispatcher(
|
||||
new_dataset,
|
||||
split_batches=split_batches,
|
||||
batch_sampler=new_batch_sampler,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
dataloader = DataLoaderShard(
|
||||
new_dataset,
|
||||
device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
|
||||
batch_sampler=new_batch_sampler,
|
||||
rng_types=rng_types,
|
||||
generator=generator,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return DataLoaderShard(
|
||||
new_dataset,
|
||||
device=device if put_on_device else None,
|
||||
batch_sampler=new_batch_sampler,
|
||||
rng_types=rng_types,
|
||||
generator=generator,
|
||||
**kwargs,
|
||||
)
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
return MpDeviceLoaderWrapper(dataloader, device)
|
||||
return dataloader
|
||||
|
||||
@ -1,96 +0,0 @@
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .optimizer import AcceleratedOptimizer
|
||||
from .state import is_apex_available, is_deepspeed_available
|
||||
|
||||
|
||||
if is_deepspeed_available():
|
||||
from deepspeed import DeepSpeedEngine
|
||||
|
||||
if is_apex_available():
|
||||
from apex import amp
|
||||
|
||||
|
||||
class DeepSpeedEngineWrapper(DeepSpeedEngine):
|
||||
"""
|
||||
Wrapper over deepspeed.DeepSpeedEngine object
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# overwriting micro_steps for user's gradient_accumulation
|
||||
self.micro_steps = -1
|
||||
|
||||
def step(self, lr_kwargs=None):
|
||||
"""DeepSpeedEngine.step() without `micro_steps` update & no profiling"""
|
||||
if self.is_gradient_accumulation_boundary(): # it shouldn't matter whether we keep this line or not
|
||||
if self.progressive_layer_drop:
|
||||
self.progressive_layer_drop.update_state(self.global_steps)
|
||||
|
||||
self._take_model_step(lr_kwargs)
|
||||
|
||||
def backward(self, loss):
|
||||
"""DeepSpeedEngine.backward() with with no loss scaling; no profiling but with `micro_steps` update"""
|
||||
|
||||
if self.zero_optimization():
|
||||
self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary()
|
||||
self.optimizer.backward(loss)
|
||||
elif self.amp_enabled():
|
||||
# AMP requires delaying unscale when inside gradient accumulation boundaries
|
||||
# https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations
|
||||
delay_unscale = not self.is_gradient_accumulation_boundary()
|
||||
with amp.scale_loss(loss, self.optimizer, delay_unscale=delay_unscale) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
elif self.fp16_enabled():
|
||||
self.optimizer.backward(loss)
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
if self.enable_backward_allreduce:
|
||||
self.allreduce_gradients()
|
||||
|
||||
# this will ensure deepspeed gradient_accumulation matches user's accumulation
|
||||
self.micro_steps += 1
|
||||
|
||||
|
||||
class DeepSpeedOptimizerWrapper(AcceleratedOptimizer):
|
||||
"""
|
||||
Internal wrapper around a deepspeed optimizer.
|
||||
|
||||
Args:
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, model: DeepSpeedEngineWrapper):
|
||||
super().__init__(optimizer, device_placement=False, scaler=None)
|
||||
|
||||
self.model = model
|
||||
|
||||
def zero_grad(self, set_to_none=None):
|
||||
pass # `model.step()` is doing that automatically. Therefore, its implementation is not needed
|
||||
|
||||
def step(self):
|
||||
"""This will handle optimizer.step() & optimizer.zero_grad() with gradient_accumulation"""
|
||||
self.model.step()
|
||||
|
||||
@property
|
||||
def is_overflow(self):
|
||||
"""Whether or not the optimizer step was done, or skipped because of gradient overflow."""
|
||||
overflow = False
|
||||
if hasattr(self.optimizer, "overflow"):
|
||||
overflow = self.optimizer.overflow
|
||||
return overflow
|
||||
480
src/accelerate/hooks.py
Normal file
@ -0,0 +1,480 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
from typing import Dict, List, Mapping, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .utils import PrefixedDataset, find_device, named_module_tensors, send_to_device, set_module_tensor_to_device
|
||||
|
||||
|
||||
class ModelHook:
|
||||
"""
|
||||
A hook that contains callbacks to be executed just before and after the forward method of a model. The difference
|
||||
with PyTorch existing hooks is that they get passed along the kwargs.
|
||||
|
||||
Class attribute:
|
||||
- **no_grad** (`bool`, *optional*, defaults to `False`) -- Whether or not to execute the actual forward pass under
|
||||
the `torch.no_grad()` context manager.
|
||||
"""
|
||||
|
||||
no_grad = False
|
||||
|
||||
def init_hook(self, module):
|
||||
"""
|
||||
To be executed when the hook is attached to the module.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module attached to this hook.
|
||||
"""
|
||||
return module
|
||||
|
||||
def pre_forward(self, module, *args, **kwargs):
|
||||
"""
|
||||
To be executed just before the forward method of the model.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module whose forward pass will be executed just after this event.
|
||||
args (`Tuple[Any]`): The positional arguments passed to the module.
|
||||
kwargs (`Dict[Str, Any]`): The keyword arguments passed to the module.
|
||||
|
||||
Returns:
|
||||
`Tuple[Tuple[Any], Dict[Str, Any]]`: A tuple with the treated `args` and `kwargs`.
|
||||
"""
|
||||
return args, kwargs
|
||||
|
||||
def post_forward(self, module, output):
|
||||
"""
|
||||
To be executed just after the forward method of the model.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module whose forward pass has been executed just before this event.
|
||||
output (`Any`): The output of the module.
|
||||
|
||||
Returns:
|
||||
`Any`: The processed `output`.
|
||||
"""
|
||||
return output
|
||||
|
||||
def detach_hook(self, module):
|
||||
"""
|
||||
To be executed when the hook is detached from a module.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module detached from this hook.
|
||||
"""
|
||||
return module
|
||||
|
||||
|
||||
class SequentialHook(ModelHook):
|
||||
"""
|
||||
A hook that can contain several hooks and iterates through them at each event.
|
||||
"""
|
||||
|
||||
def __init__(self, *hooks):
|
||||
self.hooks = hooks
|
||||
|
||||
def init_hook(self, module):
|
||||
for hook in self.hooks:
|
||||
module = hook.init_hook(module)
|
||||
return module
|
||||
|
||||
def pre_forward(self, module, *args, **kwargs):
|
||||
for hook in self.hooks:
|
||||
args, kwargs = hook.pre_forward(module, *args, **kwargs)
|
||||
return args, kwargs
|
||||
|
||||
def post_forward(self, module, output):
|
||||
for hook in self.hooks:
|
||||
output = hook.post_forward(module, output)
|
||||
return output
|
||||
|
||||
def detach_hook(self, module):
|
||||
for hook in self.hooks:
|
||||
module = hook.detach_hook(module)
|
||||
return module
|
||||
|
||||
|
||||
def add_hook_to_module(module: nn.Module, hook: ModelHook):
|
||||
"""
|
||||
Adds a hook to a given module. This will rewrite the `forward` method of the module to include the hook; to remove
|
||||
this behavior and restore the original `forward` method, use `remove_hook_from_module`.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
If the module already contains a hook, this will replace it with the new hook passed. To chain two hooks together,
|
||||
use the `SequentialHook` class.
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module to attach a hook to.
|
||||
hook (`ModelHook`): The hook to attach.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The same module, with the hook attached (the module is modified in place, so the result can
|
||||
be discarded).
|
||||
"""
|
||||
if hasattr(module, "_hf_hook") and hasattr(module, "_old_forward"):
|
||||
# If we already put some hook on this module, we replace it with the new one.
|
||||
old_forward = module._old_forward
|
||||
else:
|
||||
old_forward = module.forward
|
||||
module._old_forward = old_forward
|
||||
|
||||
module = hook.init_hook(module)
|
||||
module._hf_hook = hook
|
||||
|
||||
@functools.wraps(old_forward)
|
||||
def new_forward(*args, **kwargs):
|
||||
args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
|
||||
if module._hf_hook.no_grad:
|
||||
with torch.no_grad():
|
||||
output = old_forward(*args, **kwargs)
|
||||
else:
|
||||
output = old_forward(*args, **kwargs)
|
||||
return module._hf_hook.post_forward(module, output)
|
||||
|
||||
module.forward = new_forward
|
||||
return module
|
||||
|
||||
|
||||
def remove_hook_from_module(module: nn.Module):
|
||||
"""
|
||||
Removes any hook attached to a module via `add_hook_to_module`.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module to attach a hook to.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The same module, with the hook detached (the module is modified in place, so the result can
|
||||
be discarded).
|
||||
"""
|
||||
if hasattr(module, "_hf_hook"):
|
||||
module._hf_hook.detach_hook(module)
|
||||
delattr(module, "_hf_hook")
|
||||
|
||||
if hasattr(module, "_old_forward"):
|
||||
module.forward = module._old_forward
|
||||
delattr(module, "_old_forward")
|
||||
|
||||
return module
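To make the mechanics above concrete, here is a hedged usage sketch of the new hook API: a custom `ModelHook` that logs tensor shapes, attached to a module and then removed again. `ShapeLoggingHook` is an illustrative name, not part of the library.

```python
# A ModelHook that prints input/output shapes around a module's forward pass.
import torch
import torch.nn as nn
from accelerate.hooks import ModelHook, add_hook_to_module, remove_hook_from_module


class ShapeLoggingHook(ModelHook):
    def pre_forward(self, module, *args, **kwargs):
        print("input shape:", args[0].shape)
        return args, kwargs

    def post_forward(self, module, output):
        print("output shape:", output.shape)
        return output


layer = nn.Linear(4, 2)
add_hook_to_module(layer, ShapeLoggingHook())
layer(torch.randn(3, 4))        # prints (3, 4) then (3, 2)
remove_hook_from_module(layer)  # restores the original forward
```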
|
||||
|
||||
|
||||
class AlignDevicesHook(ModelHook):
|
||||
"""
|
||||
A generic `ModelHook` that ensures inputs and model weights are on the same device for the forward pass of the
|
||||
associated module, potentially offloading the weights after the forward pass.
|
||||
|
||||
Args:
|
||||
execution_device (`torch.device`, *optional*):
|
||||
The device on which inputs and model weights should be placed before the forward pass.
|
||||
offload (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the weights should be offloaded after the forward pass.
|
||||
io_same_device (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the output should be placed on the same device as the input was.
|
||||
weights_map (`Mapping[str, torch.Tensor]`, *optional*):
|
||||
When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to include the associated module's buffers when offloading.
|
||||
place_submodules (`bool`, *optional*, defaults to `False`):
|
||||
Whether to place the submodules on `execution_device` during the `init_hook` event.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
execution_device: Optional[Union[int, str, torch.device]] = None,
|
||||
offload: bool = False,
|
||||
io_same_device: bool = False,
|
||||
weights_map: Optional[Mapping] = None,
|
||||
offload_buffers: bool = False,
|
||||
place_submodules: bool = False,
|
||||
):
|
||||
self.execution_device = execution_device
|
||||
self.offload = offload
|
||||
self.io_same_device = io_same_device
|
||||
self.weights_map = weights_map
|
||||
self.offload_buffers = offload_buffers
|
||||
self.place_submodules = place_submodules
|
||||
|
||||
# Will contain the input device when `io_same_device=True`.
|
||||
self.input_device = None
|
||||
self.param_original_devices = {}
|
||||
self.buffer_original_devices = {}
|
||||
|
||||
def init_hook(self, module):
|
||||
if not self.offload and self.execution_device is not None:
|
||||
for name, _ in named_module_tensors(module, recurse=self.place_submodules):
|
||||
set_module_tensor_to_device(module, name, self.execution_device)
|
||||
elif self.offload:
|
||||
self.original_devices = {
|
||||
name: param.device for name, param in named_module_tensors(module, recurse=self.place_submodules)
|
||||
}
|
||||
if self.weights_map is None:
|
||||
self.weights_map = {
|
||||
name: param.to("cpu")
|
||||
for name, param in named_module_tensors(
|
||||
module, include_buffers=self.offload_buffers, recurse=self.place_submodules
|
||||
)
|
||||
}
|
||||
|
||||
for name, _ in named_module_tensors(
|
||||
module, include_buffers=self.offload_buffers, recurse=self.place_submodules
|
||||
):
|
||||
set_module_tensor_to_device(module, name, "meta")
|
||||
if not self.offload_buffers and self.execution_device is not None:
|
||||
for name, _ in module.named_buffers(recurse=self.place_submodules):
|
||||
set_module_tensor_to_device(module, name, self.execution_device)
|
||||
return module
|
||||
|
||||
def pre_forward(self, module, *args, **kwargs):
|
||||
if self.io_same_device:
|
||||
self.input_device = find_device([args, kwargs])
|
||||
if self.offload:
|
||||
for name, _ in named_module_tensors(
|
||||
module, include_buffers=self.offload_buffers, recurse=self.place_submodules
|
||||
):
|
||||
set_module_tensor_to_device(module, name, self.execution_device, value=self.weights_map[name])
|
||||
|
||||
return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)
|
||||
|
||||
def post_forward(self, module, output):
|
||||
if self.offload:
|
||||
for name, _ in named_module_tensors(
|
||||
module, include_buffers=self.offload_buffers, recurse=self.place_submodules
|
||||
):
|
||||
set_module_tensor_to_device(module, name, "meta")
|
||||
|
||||
if self.io_same_device and self.input_device is not None:
|
||||
output = send_to_device(output, self.input_device)
|
||||
|
||||
return output
|
||||
|
||||
def detach_hook(self, module):
|
||||
if self.offload:
|
||||
for name, device in self.original_devices.items():
|
||||
if device != torch.device("meta"):
|
||||
set_module_tensor_to_device(module, name, device, value=self.weights_map.get(name, None))
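A hedged, CPU-only sketch of how `AlignDevicesHook` is used with offloading. The plain dict standing in for `weights_map` and the choice of `"cpu"` as the execution device are assumptions made so the snippet runs anywhere:

```python
# With offload=True the parameters live outside the module (here: a plain dict),
# are materialized on the execution device for the forward pass, and are set
# back to the "meta" device afterwards.
import torch
import torch.nn as nn
from accelerate.hooks import AlignDevicesHook, add_hook_to_module

layer = nn.Linear(4, 2)
weights_map = {name: param.detach().clone() for name, param in layer.named_parameters()}
hook = AlignDevicesHook(execution_device="cpu", offload=True, weights_map=weights_map)
add_hook_to_module(layer, hook)   # init_hook replaces the parameters with "meta" tensors

out = layer(torch.randn(3, 4))    # pre_forward restores real weights, post_forward re-offloads
print(out.shape, layer.weight.device)  # torch.Size([3, 2]) meta
```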
|
||||
|
||||
|
||||
def attach_execution_device_hook(
|
||||
module: torch.nn.Module,
|
||||
execution_device: Union[int, str, torch.device],
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Recursively attaches `AlignDevicesHook` to all submodules of a given model to make sure they have the right
|
||||
execution device
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`):
|
||||
The module where we want to attach the hooks.
|
||||
execution_device (`int`, `str` or `torch.device`):
|
||||
The device on which inputs and model weights should be placed before the forward pass.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
if not hasattr(module, "_hf_hook") and len(module.state_dict()) > 0:
|
||||
add_hook_to_module(module, AlignDevicesHook(execution_device))
|
||||
|
||||
# Break the recursion if we get to a preload module.
|
||||
if preload_module_classes is not None and module.__class__.__name__ in preload_module_classes:
|
||||
return
|
||||
|
||||
for child in module.children():
|
||||
attach_execution_device_hook(child, execution_device)
|
||||
|
||||
|
||||
def attach_align_device_hook(
|
||||
module: torch.nn.Module,
|
||||
execution_device: Optional[torch.device] = None,
|
||||
offload: bool = False,
|
||||
weights_map: Optional[Mapping] = None,
|
||||
offload_buffers: bool = False,
|
||||
module_name: str = "",
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Recursively attaches `AlignDevicesHook` to all submodules of a given model that have direct parameters and/or
|
||||
buffers.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`):
|
||||
The module where we want to attach the hooks.
|
||||
execution_device (`torch.device`, *optional*):
|
||||
The device on which inputs and model weights should be placed before the forward pass.
|
||||
offload (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the weights should be offloaded after the forward pass.
|
||||
weights_map (`Mapping[str, torch.Tensor]`, *optional*):
|
||||
When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to include the associated module's buffers when offloading.
|
||||
module_name (`str`, *optional*, defaults to `""`):
|
||||
The name of the module.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
# Attach the hook on this module if it has any direct tensor.
|
||||
directs = named_module_tensors(module)
|
||||
full_offload = (
|
||||
offload and preload_module_classes is not None and module.__class__.__name__ in preload_module_classes
|
||||
)
|
||||
|
||||
if len(list(directs)) > 0 or full_offload:
|
||||
if weights_map is not None:
|
||||
prefix = f"{module_name}." if len(module_name) > 0 else ""
|
||||
prefixed_weights_map = PrefixedDataset(weights_map, prefix)
|
||||
else:
|
||||
prefixed_weights_map = None
|
||||
hook = AlignDevicesHook(
|
||||
execution_device=execution_device,
|
||||
offload=offload,
|
||||
weights_map=prefixed_weights_map,
|
||||
offload_buffers=offload_buffers,
|
||||
place_submodules=full_offload,
|
||||
)
|
||||
add_hook_to_module(module, hook)
|
||||
|
||||
# We stop the recursion in case we hit the full offload.
|
||||
if full_offload:
|
||||
return
|
||||
|
||||
# Recurse on all children of the module.
|
||||
for child_name, child in module.named_children():
|
||||
child_name = f"{module_name}.{child_name}" if len(module_name) > 0 else child_name
|
||||
attach_align_device_hook(
|
||||
child,
|
||||
execution_device=execution_device,
|
||||
offload=offload,
|
||||
weights_map=weights_map,
|
||||
offload_buffers=offload_buffers,
|
||||
module_name=child_name,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
|
||||
|
||||
def remove_hook_from_submodules(module: nn.Module):
|
||||
"""
|
||||
Recursively removes all hooks attached on the submodules of a given model.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module on which to remove all hooks.
|
||||
"""
|
||||
remove_hook_from_module(module)
|
||||
for child in module.children():
|
||||
remove_hook_from_submodules(child)
|
||||
|
||||
|
||||
def attach_align_device_hook_on_blocks(
|
||||
module: nn.Module,
|
||||
execution_device: Optional[Union[torch.device, Dict[str, torch.device]]] = None,
|
||||
offload: Union[bool, Dict[str, bool]] = False,
|
||||
weights_map: Mapping = None,
|
||||
offload_buffers: bool = False,
|
||||
module_name: str = "",
|
||||
preload_module_classes: Optional[List[str]] = None,
|
||||
):
|
||||
"""
|
||||
Attaches `AlignDevicesHook` to all blocks of a given model as needed.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`):
|
||||
The module where we want to attach the hooks.
|
||||
execution_device (`torch.device` or `Dict[str, torch.device]`, *optional*):
|
||||
The device on which inputs and model weights should be placed before the forward pass. It can be one device
|
||||
for the whole module, or a dictionary mapping module name to device.
|
||||
offload (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the weights should be offloaded after the forward pass. It can be one boolean for the whole
|
||||
module, or a dictionary mapping module name to boolean.
|
||||
weights_map (`Mapping[str, torch.Tensor]`, *optional*):
|
||||
When the model weights are offloaded, a (potentially lazy) map from param names to the tensor values.
|
||||
offload_buffers (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to include the associated module's buffers when offloading.
|
||||
module_name (`str`, *optional*, defaults to `""`):
|
||||
The name of the module.
|
||||
preload_module_classes (`List[str]`, *optional*):
|
||||
A list of classes whose instances should load all their weights (even in the submodules) at the beginning
|
||||
of the forward. This should only be used for classes that have submodules which are registered but not
|
||||
called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
|
||||
`dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
|
||||
"""
|
||||
# If one device and one offload, we've got one hook.
|
||||
if not isinstance(execution_device, Mapping) and not isinstance(offload, dict):
|
||||
if not offload:
|
||||
hook = AlignDevicesHook(execution_device=execution_device, io_same_device=True, place_submodules=True)
|
||||
add_hook_to_module(module, hook)
|
||||
else:
|
||||
attach_align_device_hook(
|
||||
module,
|
||||
execution_device=execution_device,
|
||||
offload=True,
|
||||
weights_map=weights_map,
|
||||
offload_buffers=offload_buffers,
|
||||
module_name=module_name,
|
||||
)
|
||||
return
|
||||
|
||||
if not isinstance(execution_device, Mapping):
|
||||
execution_device = {key: execution_device for key in offload.keys()}
|
||||
if not isinstance(offload, Mapping):
|
||||
offload = {key: offload for key in execution_device.keys()}
|
||||
|
||||
if module_name in execution_device and not offload[module_name]:
|
||||
hook = AlignDevicesHook(
|
||||
execution_device=execution_device[module_name],
|
||||
offload_buffers=offload_buffers,
|
||||
io_same_device=(module_name == ""),
|
||||
place_submodules=True,
|
||||
)
|
||||
add_hook_to_module(module, hook)
|
||||
attach_execution_device_hook(module, execution_device[module_name])
|
||||
elif module_name in execution_device:
|
||||
attach_align_device_hook(
|
||||
module,
|
||||
execution_device=execution_device[module_name],
|
||||
offload=True,
|
||||
weights_map=weights_map,
|
||||
offload_buffers=offload_buffers,
|
||||
module_name=module_name,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
if not hasattr(module, "_hf_hook"):
|
||||
hook = AlignDevicesHook(execution_device=execution_device[module_name], io_same_device=(module_name == ""))
|
||||
add_hook_to_module(module, hook)
|
||||
attach_execution_device_hook(
|
||||
module, execution_device[module_name], preload_module_classes=preload_module_classes
|
||||
)
|
||||
elif module_name == "":
|
||||
hook = AlignDevicesHook(io_same_device=True)
|
||||
add_hook_to_module(module, hook)
|
||||
|
||||
for child_name, child in module.named_children():
|
||||
child_name = f"{module_name}.{child_name}" if len(module_name) > 0 else child_name
|
||||
attach_align_device_hook_on_blocks(
|
||||
child,
|
||||
execution_device=execution_device,
|
||||
offload=offload,
|
||||
weights_map=weights_map,
|
||||
offload_buffers=offload_buffers,
|
||||
module_name=child_name,
|
||||
preload_module_classes=preload_module_classes,
|
||||
)
|
||||
@ -1,90 +0,0 @@
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
from dataclasses import dataclass
|
||||
from datetime import timedelta
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class KwargsHandler:
|
||||
"""
|
||||
Internal mixin that implements a `to_kwargs()` method for a dataclass.
|
||||
"""
|
||||
|
||||
def to_dict(self):
|
||||
return copy.deepcopy(self.__dict__)
|
||||
|
||||
def to_kwargs(self):
|
||||
"""
|
||||
Returns a dictionary containing the attributes with values different from the default of this class.
|
||||
"""
|
||||
default_dict = self.__class__().to_dict()
|
||||
this_dict = self.to_dict()
|
||||
return {k: v for k, v in this_dict.items() if default_dict[k] != v}
|
||||
|
||||
|
||||
@dataclass
|
||||
class DistributedDataParallelKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize how your model is wrapped in a
|
||||
`torch.nn.parallel.DistributedDataParallel`. Please refer to the documentation of this
|
||||
[wrapper](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) for more
|
||||
information on each argument.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
dim: int = 0
|
||||
broadcast_buffers: bool = True
|
||||
bucket_cap_mb: int = 25
|
||||
find_unused_parameters: bool = False
|
||||
check_reduction: bool = False
|
||||
gradient_as_bucket_view: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class GradScalerKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
|
||||
`torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
|
||||
[scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`GradScaler` is only available in PyTorch 1.5.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
init_scale: float = 65536.0
|
||||
growth_factor: float = 2.0
|
||||
backoff_factor: float = 0.5
|
||||
growth_interval: int = 2000
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class InitProcessGroupKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize the initialization of the distributed processes. Please refer
|
||||
to the documentation of this
|
||||
[method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
|
||||
information on each argument.
|
||||
"""
|
||||
|
||||
init_method: Optional[str] = None
|
||||
timeout: timedelta = timedelta(seconds=1800)
|
||||
@ -19,10 +19,8 @@ import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState
|
||||
from .utils import PrecisionType, PrepareForLaunch, patch_environment
|
||||
from .utils import PrecisionType, PrepareForLaunch, is_torch_version, patch_environment
|
||||
|
||||
|
||||
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mixed_precision="no", use_port="29500"):
|
||||
@ -52,6 +50,13 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
|
||||
else:
|
||||
in_colab_or_kaggle = False
|
||||
|
||||
try:
|
||||
mixed_precision = PrecisionType(mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if in_colab_or_kaggle:
|
||||
if os.environ.get("TPU_NAME", None) is not None:
|
||||
# TPU launch
|
||||
@ -74,18 +79,18 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
|
||||
if torch.cuda.is_available():
|
||||
print("Launching training on one GPU.")
|
||||
else:
|
||||
print("Launching training on CPU.")
|
||||
print("Launching training on one CPU.")
|
||||
function(*args)
|
||||
|
||||
else:
|
||||
if num_processes is None:
|
||||
raise ValueError(
|
||||
"You have to specify the number of GPUs you would like to use, add `num_process=...` to your call."
|
||||
"You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
|
||||
)
|
||||
|
||||
if num_processes > 1:
|
||||
# Multi-GPU launch
|
||||
if version.parse(torch.__version__) < version.parse("1.5.0"):
|
||||
if is_torch_version("<", "1.5.0"):
|
||||
raise ImportError(
|
||||
"Using `notebook_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
|
||||
f"{torch.__version__}."
|
||||
@ -107,13 +112,6 @@ def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, mix
|
||||
"function."
|
||||
)
|
||||
|
||||
try:
|
||||
mixed_precision = PrecisionType(mixed_precision.lower())
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
|
||||
)
|
||||
|
||||
if use_fp16:
|
||||
warnings.warn('use_fp16=True is deprecated. Use mixed_precision="fp16" instead.', DeprecationWarning)
|
||||
mixed_precision = "fp16"
|
||||
@ -156,7 +154,7 @@ def debug_launcher(function, args=(), num_processes=2):
|
||||
num_processes (`int`, *optional*, defaults to 2):
|
||||
The number of processes to use for training.
|
||||
"""
|
||||
if version.parse(torch.__version__) < version.parse("1.5.0"):
|
||||
if is_torch_version("<", "1.5.0"):
|
||||
raise ImportError(
|
||||
"Using `debug_launcher` for distributed training on GPUs require torch >= 1.5.0, got "
|
||||
f"{torch.__version__}."
|
||||
|
||||
63
src/accelerate/logging.py
Normal file
@ -0,0 +1,63 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
|
||||
from .state import AcceleratorState
|
||||
|
||||
|
||||
class MultiProcessAdapter(logging.LoggerAdapter):
|
||||
"""
|
||||
An adapter to assist with logging in multiprocess.
|
||||
|
||||
`log` takes in an additional `main_process_only` kwarg, which dictates whether it should be called on all processes
|
||||
or only on the main process. Default is `main_process_only=True`.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _should_log(main_process_only):
|
||||
"Check if log should be performed"
|
||||
return not main_process_only or (main_process_only and AcceleratorState().local_process_index == 0)
|
||||
|
||||
def log(self, level, msg, *args, **kwargs):
|
||||
"""
|
||||
Delegates logger call after checking if we should log.
|
||||
|
||||
Accepts a new kwarg of `main_process_only`, which will dictate whether it will be logged across all processes
|
||||
or only on the main process. Default is `True` if not passed.
|
||||
"""
|
||||
main_process_only = kwargs.pop("main_process_only", True)
|
||||
if self.isEnabledFor(level) and self._should_log(main_process_only):
|
||||
msg, kwargs = self.process(msg, kwargs)
|
||||
self.logger.log(level, msg, *args, **kwargs)
|
||||
|
||||
|
||||
def get_logger(name: str):
|
||||
"""
|
||||
Returns a `logging.Logger` for `name` that can handle multiprocessing.
|
||||
|
||||
If a log should be called on all processes, pass `main_process_only=False`
|
||||
|
||||
E.g.
|
||||
```python
|
||||
logger.info("My log", main_process_only=False)
|
||||
logger.debug("My log", main_process_only=False)
|
||||
```
|
||||
|
||||
Args:
|
||||
name (`str`):
|
||||
The name for the logger, such as `__file__`
|
||||
"""
|
||||
logger = logging.getLogger(name)
|
||||
return MultiProcessAdapter(logger, {})
|
||||
@ -12,75 +12,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
A collection of utilities for ensuring that training can always occur. Heavily influenced by the
|
||||
[toma](https://github.com/BlackHC/toma) library.
|
||||
"""
|
||||
|
||||
import functools
|
||||
import gc
|
||||
import inspect
|
||||
|
||||
import torch
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all
|
||||
|
||||
|
||||
def should_reduce_batch_size(exception: Exception) -> bool:
|
||||
"""
|
||||
Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory
|
||||
|
||||
Args:
|
||||
exception (`Exception`):
|
||||
An exception
|
||||
"""
|
||||
_statements = [
|
||||
"CUDA out of memory.", # CUDA OOM
|
||||
"cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.", # CUDNN SNAFU
|
||||
"DefaultCPUAllocator: can't allocate memory", # CPU OOM
|
||||
]
|
||||
if isinstance(exception, RuntimeError) and len(exception.args) == 1:
|
||||
return any(err in exception.args[0] for err in _statements)
|
||||
return False
|
||||
import warnings
|
||||
|
||||
|
||||
def find_executable_batch_size(function: callable = None, starting_batch_size: int = 128):
|
||||
"""
|
||||
A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
|
||||
CUDNN, the batch size is cut in half and passed to `function`
|
||||
warnings.warn(
|
||||
"memory_utils has been reorganized to utils.memory. Import `find_executable_batchsize` from the main `__init__`: "
|
||||
"`from accelerate import find_executable_batch_size` to avoid this warning.",
|
||||
FutureWarning,
|
||||
)
|
||||
|
||||
`function` must take in a `batch_size` parameter as its first argument.
|
||||
|
||||
Args:
|
||||
function (`callable`, *optional*):
|
||||
A function to wrap
|
||||
starting_batch_size (`int`, *optional*):
|
||||
The batch size to try and fit into memory
|
||||
"""
|
||||
if function is None:
|
||||
return functools.partial(find_executable_batch_size, starting_batch_size=starting_batch_size)
|
||||
|
||||
batch_size = starting_batch_size
|
||||
|
||||
def decorator(*args, **kwargs):
|
||||
nonlocal batch_size
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
params = list(inspect.signature(function).parameters.keys())
|
||||
# Guard against user error
|
||||
if len(params) < (len(args) + 1):
|
||||
arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
|
||||
raise TypeError(
|
||||
f"Batch size was passed into `{function.__name__}` as the first argument when called."
|
||||
f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
return function(batch_size, *args, **kwargs)
|
||||
except Exception as e:
|
||||
if should_reduce_batch_size(e):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
batch_size //= 2
|
||||
else:
|
||||
raise
|
||||
|
||||
return decorator
|
||||
from .utils.memory import find_executable_batch_size
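A hedged usage sketch of the decorator whose implementation moved above; as the new deprecation warning suggests, it is imported from the top-level package. The training body is a placeholder.

```python
# The wrapped function receives `batch_size` as its first argument; on CUDA/CPU
# out-of-memory style errors the call is retried with the batch size halved.
from accelerate import find_executable_batch_size


@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    print(f"trying batch_size={batch_size}")
    # ... build the dataloaders and model with `batch_size`, then train ...


train()  # called without arguments; the decorator injects the batch size
```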
|
||||
|
||||
@ -17,13 +17,11 @@ import warnings
|
||||
|
||||
import torch
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState, DistributedType, is_tpu_available
|
||||
from .utils import honor_type
|
||||
from .state import AcceleratorState, GradientState
|
||||
from .utils import DistributedType, honor_type, is_torch_version, is_tpu_available
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
@ -41,6 +39,9 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
"""
|
||||
Internal wrapper around a torch optimizer.
|
||||
|
||||
Conditionally will perform `step` and `zero_grad` if gradients should be synchronized when performing gradient
|
||||
accumulation.
|
||||
|
||||
Args:
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
@ -55,6 +56,7 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
self.optimizer = optimizer
|
||||
self.scaler = scaler
|
||||
self.accelerator_state = AcceleratorState()
|
||||
self.gradient_state = GradientState()
|
||||
self.device_placement = device_placement
|
||||
self._is_overflow = False
|
||||
|
||||
@ -103,37 +105,39 @@ class AcceleratedOptimizer(torch.optim.Optimizer):
|
||||
return self.optimizer.state_dict()
|
||||
|
||||
def zero_grad(self, set_to_none=None):
|
||||
if version.parse(torch.__version__) < version.parse("1.7.0"):
|
||||
if set_to_none is not None:
|
||||
raise ValueError(
|
||||
"`set_to_none` for Optimizer.zero_grad` was introduced in PyTorch 1.7.0 and can't be used for "
|
||||
f"earlier versions (found version {torch.__version__})."
|
||||
)
|
||||
self.optimizer.zero_grad()
|
||||
else:
|
||||
accept_arg = "set_to_none" in inspect.signature(self.optimizer.zero_grad).parameters
|
||||
if accept_arg:
|
||||
if set_to_none is None:
|
||||
set_to_none = False
|
||||
self.optimizer.zero_grad(set_to_none=set_to_none)
|
||||
else:
|
||||
if self.gradient_state.sync_gradients:
|
||||
if is_torch_version("<", "1.7.0"):
|
||||
if set_to_none is not None:
|
||||
raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
|
||||
raise ValueError(
|
||||
"`set_to_none` for Optimizer.zero_grad` was introduced in PyTorch 1.7.0 and can't be used for "
|
||||
f"earlier versions (found version {torch.__version__})."
|
||||
)
|
||||
self.optimizer.zero_grad()
|
||||
else:
|
||||
accept_arg = "set_to_none" in inspect.signature(self.optimizer.zero_grad).parameters
|
||||
if accept_arg:
|
||||
if set_to_none is None:
|
||||
set_to_none = False
|
||||
self.optimizer.zero_grad(set_to_none=set_to_none)
|
||||
else:
|
||||
if set_to_none is not None:
|
||||
raise ValueError("`set_to_none` for Optimizer.zero_grad` is not supported by this optimizer.")
|
||||
self.optimizer.zero_grad()
|
||||
|
||||
def step(self, closure=None):
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
optimizer_args = {"closure": closure} if closure is not None else {}
|
||||
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args)
|
||||
elif self.scaler is not None:
|
||||
scale_before = self.scaler.get_scale()
|
||||
self.scaler.step(self.optimizer, closure)
|
||||
self.scaler.update()
|
||||
scale_after = self.scaler.get_scale()
|
||||
# If we reduced the loss scale, it means the optimizer step was skipped because of gradient overflow.
|
||||
self._is_overflow = scale_after < scale_before
|
||||
else:
|
||||
self.optimizer.step(closure)
|
||||
if self.gradient_state.sync_gradients:
|
||||
if self.accelerator_state.distributed_type == DistributedType.TPU:
|
||||
optimizer_args = {"closure": closure} if closure is not None else {}
|
||||
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args)
|
||||
elif self.scaler is not None:
|
||||
scale_before = self.scaler.get_scale()
|
||||
self.scaler.step(self.optimizer, closure)
|
||||
self.scaler.update()
|
||||
scale_after = self.scaler.get_scale()
|
||||
# If we reduced the loss scale, it means the optimizer step was skipped because of gradient overflow.
|
||||
self._is_overflow = scale_after < scale_before
|
||||
else:
|
||||
self.optimizer.step(closure)
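The `sync_gradients` gating above is what makes gradient accumulation transparent to user code. A hedged end-to-end sketch, assuming a recent Accelerate version that exposes `gradient_accumulation_steps` and `Accelerator.accumulate`:

```python
# optimizer.step()/zero_grad() only take effect on synchronization steps,
# thanks to the gating shown above.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator(gradient_accumulation_steps=4)
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataloader = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=8)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for inputs, targets in dataloader:
    with accelerator.accumulate(model):
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        accelerator.backward(loss)
        optimizer.step()       # skipped internally on non-sync steps
        optimizer.zero_grad()  # likewise gated until gradients are synced
```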
|
||||
|
||||
def _switch_parameters(self, parameters_map):
|
||||
for param_group in self.optimizer.param_groups:
|
||||
|
||||
@ -12,16 +12,24 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# We ignore warnings about stepping the scheduler since we step it ourselves during gradient accumulation
|
||||
|
||||
import warnings
|
||||
|
||||
from .state import AcceleratorState
|
||||
|
||||
|
||||
warnings.filterwarnings("ignore", category=UserWarning, module="torch.optim.lr_scheduler")
|
||||
|
||||
|
||||
class AcceleratedScheduler:
|
||||
"""
|
||||
A wrapper around a learning rate scheduler that will only step when the optimizer(s) have a training step. Useful
|
||||
to avoid making a scheduler step too fast when:
|
||||
to avoid making a scheduler step too fast when gradients overflowed and there was no training step (in mixed
|
||||
precision training)
|
||||
|
||||
- gradients went overflow and there was no training step (in mixed precision training)
|
||||
- step was skipped because of gradient accumulation
|
||||
When performing gradient accumulation, scheduler lengths should not be changed accordingly; Accelerate will always
|
||||
step the scheduler to account for it.
|
||||
|
||||
Args:
|
||||
scheduler (`torch.optim.lr_scheduler._LRScheduler`):
|
||||
@ -52,7 +60,6 @@ class AcceleratedScheduler:
|
||||
for opt in self.optimizers:
|
||||
if opt.step_was_skipped:
|
||||
return
|
||||
|
||||
if self.split_batches:
|
||||
# Split batches -> the training dataloader batch size is not changed so one step per training step
|
||||
self.scheduler.step(*args, **kwargs)
|
||||
@ -61,7 +68,9 @@ class AcceleratedScheduler:
|
||||
# num_processes steps per training step
|
||||
num_processes = AcceleratorState().num_processes
|
||||
for _ in range(num_processes):
|
||||
self.scheduler.step(*args, **kwargs)
|
||||
# Special case when using OneCycle and `drop_last` was not used
|
||||
if getattr(self.scheduler, "total_steps", 0) <= self.scheduler.last_epoch:
|
||||
self.scheduler.step(*args, **kwargs)
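As a worked example of the stepping rule above (a common way to size the schedule; the numbers are assumptions, not library defaults):

```python
# With split_batches=False the scheduler advances num_processes times per
# optimizer step, so a schedule meant to span 1000 optimizer steps per process
# is typically created with 1000 * num_processes total steps.
num_processes = 4
optimizer_steps_per_process = 1000
scheduler_total_steps = optimizer_steps_per_process * num_processes
print(scheduler_total_steps)  # 4000
```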
|
||||
|
||||
# Passthroughs
|
||||
def get_last_lr(self):
|
||||
|
||||
@ -12,29 +12,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import importlib
|
||||
import os
|
||||
from distutils.util import strtobool
|
||||
from enum import Enum
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
try:
|
||||
import torch_ccl # noqa: F401
|
||||
|
||||
_ccl_available = True
|
||||
except ImportError:
|
||||
_ccl_available = False
|
||||
from .utils import DistributedType, is_ccl_available, is_deepspeed_available, is_tpu_available
|
||||
from .utils.dataclasses import SageMakerDistributedType
|
||||
|
||||
|
||||
try:
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
_tpu_available = True
|
||||
except ImportError:
|
||||
_tpu_available = False
|
||||
|
||||
|
||||
def get_int_from_env(env_keys, default):
|
||||
"""Returns the first positive env value found in the `env_keys` list or the default."""
|
||||
@ -45,22 +34,6 @@ def get_int_from_env(env_keys, default):
|
||||
return default
|
||||
|
||||
|
||||
def is_ccl_available():
|
||||
return _ccl_available
|
||||
|
||||
|
||||
def is_apex_available():
|
||||
return importlib.util.find_spec("apex") is not None
|
||||
|
||||
|
||||
def is_tpu_available():
|
||||
return _tpu_available
|
||||
|
||||
|
||||
def is_deepspeed_available():
|
||||
return importlib.util.find_spec("deepspeed") is not None
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
value = os.environ.get(key, str(default))
|
||||
return strtobool(value) == 1 # As its name indicates `strtobool` actually returns an int...
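Side note, not part of the diff: `strtobool` returns an int rather than a bool and raises `ValueError` on unrecognised strings, which is why the result is compared to 1. A tiny illustration, on Python versions where `distutils` is still available:

from distutils.util import strtobool

for raw in ("1", "true", "YES", "0", "no", "False"):
    print(raw, strtobool(raw) == 1)  # True for truthy strings, False otherwise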
|
||||
@ -71,60 +44,6 @@ def parse_choice_from_env(key, default="no"):
|
||||
return value
|
||||
|
||||
|
||||
class DistributedType(str, Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **NO** -- Not a distributed environment, just a single process.
|
||||
- **MULTI_CPU** -- Distributed on multiple CPU nodes.
|
||||
- **MULTI_GPU** -- Distributed on multiple GPUs.
|
||||
- **DEEPSPEED** -- Using DeepSpeed.
|
||||
- **TPU** -- Distributed on TPUs.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `DistributedType` to be JSON-serializable out of the box.
|
||||
NO = "NO"
|
||||
MULTI_CPU = "MULTI_CPU"
|
||||
MULTI_GPU = "MULTI_GPU"
|
||||
DEEPSPEED = "DEEPSPEED"
|
||||
FSDP = "FSDP"
|
||||
TPU = "TPU"
|
||||
|
||||
|
||||
class SageMakerDistributedType(str, Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **NO** -- Not a distributed environment, just a single process.
|
||||
- **DATA_PARALLEL** -- using sagemaker distributed data parallelism.
|
||||
- **MODEL_PARALLEL** -- using sagemaker distributed model parallelism.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `SageMakerDistributedType` to be JSON-serializable out of the box.
|
||||
NO = "NO"
|
||||
DATA_PARALLEL = "DATA_PARALLEL"
|
||||
MODEL_PARALLEL = "MODEL_PARALLEL"
|
||||
|
||||
|
||||
class ComputeEnvironment(str, Enum):
|
||||
"""
|
||||
Represents a type of the compute environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **LOCAL_MACHINE** -- private/custom cluster hardware.
|
||||
- **AMAZON_SAGEMAKER** -- Amazon SageMaker as compute environment.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `ComputeEnvironment` to be JSON-serializable out of the box.
|
||||
LOCAL_MACHINE = "LOCAL_MACHINE"
|
||||
AMAZON_SAGEMAKER = "AMAZON_SAGEMAKER"
|
||||
|
||||
|
||||
# Inspired by Alex Martelli's 'Borg'.
|
||||
class AcceleratorState:
|
||||
"""
|
||||
@ -134,6 +53,7 @@ class AcceleratorState:
|
||||
Attributes:
|
||||
|
||||
- **device** (`torch.device`) -- The device to use.
|
||||
- **sync_gradients** (`bool`) -- Whether to sync the gradients or not
|
||||
- **distributed_type** (`~accelerate.state.DistributedType`) -- The type of distributed environment currently
|
||||
in use.
|
||||
- **num_processes** (`int`) -- The number of processes currently launched in parallel.
|
||||
@ -157,22 +77,46 @@ class AcceleratorState:
|
||||
self.__dict__ = self._shared_state
|
||||
if parse_flag_from_env("USE_CPU"):
|
||||
cpu = True
|
||||
self._check_initialized(mixed_precision, cpu)
|
||||
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)
|
||||
if not getattr(self, "initialized", False):
|
||||
self.backend = None
|
||||
self.deepspeed_plugin = None
|
||||
mixed_precision = mixed_precision.lower() if mixed_precision else None
|
||||
mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision.lower()
|
||||
)
|
||||
if not _from_accelerator:
|
||||
raise ValueError(
|
||||
"Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
|
||||
"before using any functionality from the `accelerate` library."
|
||||
)
|
||||
if (
|
||||
os.environ.get("USE_SAGEMAKER", "false") == "true"
|
||||
and os.environ.get("SAGEMAKER_DISTRIBUTED_TYPE") != SageMakerDistributedType.NO
|
||||
and not cpu
|
||||
):
|
||||
if os.environ.get("SAGEMAKER_DISTRIBUTED_TYPE") == SageMakerDistributedType.DATA_PARALLEL:
|
||||
self.distributed_type = DistributedType.MULTI_GPU
|
||||
import smdistributed.dataparallel.torch.torch_smddp # noqa
|
||||
|
||||
if not torch.distributed.is_initialized():
|
||||
torch.distributed.init_process_group(backend="smddp")
|
||||
self.backend = "smddp"
|
||||
self.num_processes = torch.distributed.get_world_size()
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
self.mixed_precision = mixed_precision
|
||||
elif is_tpu_available() and not cpu:
|
||||
self.distributed_type = DistributedType.TPU
|
||||
self.num_processes = xm.xrt_world_size()
|
||||
self.process_index = xm.get_ordinal()
|
||||
self.local_process_index = xm.get_local_ordinal()
|
||||
self.device = xm.xla_device()
|
||||
self.mixed_precision = "no"
|
||||
if mixed_precision == "bf16":
|
||||
os.environ["XLA_USE_BF16"] = str(1)
|
||||
self.mixed_precision = mixed_precision
|
||||
elif os.environ.get("USE_DEEPSPEED", "false") == "true" and not cpu:
|
||||
assert (
|
||||
is_deepspeed_available()
|
||||
@ -187,13 +131,6 @@ class AcceleratorState:
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
self.mixed_precision = "no" # deepspeed handles mixed_precision using deepspeed_config
|
||||
mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
if mixed_precision == "fp16":
|
||||
deepspeed_plugin.deepspeed_config.update({"fp16": {"enabled": True}})
|
||||
elif mixed_precision == "bf16":
|
||||
deepspeed_plugin.deepspeed_config.update({"bfloat16": {"enabled": True}})
|
||||
self.deepspeed_plugin = deepspeed_plugin
|
||||
elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
|
||||
self.distributed_type = DistributedType.MULTI_GPU
|
||||
@ -205,15 +142,11 @@ class AcceleratorState:
|
||||
self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
|
||||
self.device = torch.device("cuda", self.local_process_index)
|
||||
torch.cuda.set_device(self.device)
|
||||
self.mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
self.mixed_precision = mixed_precision
|
||||
if os.environ.get("USE_FSDP", "false") == "true":
|
||||
self.distributed_type = DistributedType.FSDP
|
||||
if self.mixed_precision != "no":
|
||||
raise ValueError(
|
||||
"Mixed precision is currently not supported for FSDP. Please set `mixed_precision` to `no`."
|
||||
)
|
||||
fsdp_plugin.set_mixed_precision(self.mixed_precision)
|
||||
self.fsdp_plugin = fsdp_plugin
|
||||
elif get_int_from_env(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"], 1) > 1:
|
||||
self.distributed_type = DistributedType.MULTI_CPU
|
||||
@ -251,15 +184,13 @@ class AcceleratorState:
|
||||
self.process_index = torch.distributed.get_rank()
|
||||
self.local_process_index = local_rank
|
||||
self.device = torch.device("cpu")
|
||||
self.mixed_precision = "no"
|
||||
self.mixed_precision = mixed_precision
|
||||
else:
|
||||
self.distributed_type = DistributedType.NO
|
||||
self.num_processes = 1
|
||||
self.process_index = self.local_process_index = 0
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() and not cpu else "cpu")
|
||||
self.mixed_precision = (
|
||||
parse_choice_from_env("MIXED_PRECISION", "no") if mixed_precision is None else mixed_precision
|
||||
)
|
||||
self.mixed_precision = mixed_precision
|
||||
self.initialized = True
|
||||
|
||||
def __repr__(self):
|
||||
@ -271,13 +202,61 @@ class AcceleratorState:
|
||||
f"Process index: {self.process_index}\n"
|
||||
f"Local process index: {self.local_process_index}\n"
|
||||
f"Device: {self.device}\n"
|
||||
f"Mixed precision type: {mixed_precision}\n"
|
||||
)
|
||||
if self.distributed_type == DistributedType.DEEPSPEED:
|
||||
repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n"
|
||||
else:
|
||||
f"Mixed precision type: {mixed_precision}\n"
|
||||
return repr
|
||||
|
||||
# For backward compatibility
|
||||
@property
|
||||
def use_fp16(self):
|
||||
return self.mixed_precision != "no"
|
||||
|
||||
@staticmethod
|
||||
def _reset_state():
|
||||
"Resets `_shared_state`, is used internally and should not be called"
|
||||
AcceleratorState._shared_state = {}
|
||||
|
||||
def _check_initialized(self, mixed_precision=None, cpu=None):
|
||||
"Checks if a modification is trying to be made and the `AcceleratorState` has already been initialized"
|
||||
if getattr(self, "initialized", False):
|
||||
err = "AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `{flag}` to `Accelerate()`."
|
||||
if cpu and self.device.type != "cpu":
|
||||
raise ValueError(err.format(flag="cpu=True"))
|
||||
if mixed_precision is not None and mixed_precision != self.mixed_precision:
|
||||
raise ValueError(err.format(flag=f"mixed_precision='{mixed_precision}'"))
|
||||
|
||||
|
||||
class GradientState:
|
||||
"""
|
||||
This is a variation of a [singleton class](https://en.wikipedia.org/wiki/Singleton_pattern) in the sense that all
|
||||
instances of `GradientState` share the same state, which is initialized on the first instantiation.
|
||||
|
||||
This specific state revolves around whether gradients should be synced and if we have reached the end of a prepared
dataloader.

Attributes:
|
||||
|
||||
- **sync_gradients** (`bool`) -- Whether the gradients should be synced
|
||||
- **end_of_dataloader** (`bool`) -- Whether we have reached the end the current dataloader
|
||||
"""
|
||||
|
||||
_shared_state = {}
|
||||
|
||||
def __init__(self):
|
||||
self.__dict__ = self._shared_state
|
||||
if not getattr(self, "initialized", False):
|
||||
self.sync_gradients = True
|
||||
self.end_of_dataloader = False
|
||||
self.initialized = True
|
||||
|
||||
def __repr__(self):
|
||||
return f"Sync Gradients: {self.sync_gradients}\n" f"At end of current dataloader: {self.end_of_dataloader}\n"
|
||||
|
||||
def _set_sync_gradients(self, sync_gradients):
|
||||
"Private function that sets whether gradients should be synchronized. Users should not have to call this."
|
||||
self.sync_gradients = sync_gradients
|
||||
|
||||
def _set_end_of_dataloader(self, end_of_dataloader):
|
||||
"Private function that sets whether the end of the current dataloader has been reached. Users should not have to call this."
|
||||
self.end_of_dataloader = end_of_dataloader
|
||||
|
||||
@ -2,5 +2,17 @@
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
from .testing import are_the_same_tensors, execute_subprocess_async, require_cuda, require_multi_gpu, require_tpu
|
||||
from .testing import (
|
||||
are_the_same_tensors,
|
||||
execute_subprocess_async,
|
||||
require_cpu,
|
||||
require_cuda,
|
||||
require_multi_gpu,
|
||||
require_single_gpu,
|
||||
require_tpu,
|
||||
slow,
|
||||
)
|
||||
from .training import RegressionDataset, RegressionModel
|
||||
|
||||
|
||||
from .scripts import test_script, test_sync # isort:skip
|
||||
|
||||
src/accelerate/test_utils/scripts/__init__.py (new file, 0 lines)
@ -19,10 +19,16 @@ from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator
|
||||
from accelerate.data_loader import prepare_data_loader
|
||||
from accelerate.state import AcceleratorState, DistributedType
|
||||
from accelerate.state import AcceleratorState
|
||||
from accelerate.test_utils import RegressionDataset, RegressionModel, are_the_same_tensors
|
||||
from accelerate.utils import gather, set_seed, synchronize_rng_states
|
||||
from packaging import version
|
||||
from accelerate.utils import (
|
||||
DistributedType,
|
||||
gather,
|
||||
is_bf16_available,
|
||||
is_torch_version,
|
||||
set_seed,
|
||||
synchronize_rng_states,
|
||||
)
|
||||
|
||||
|
||||
def init_state_check():
|
||||
@ -40,7 +46,7 @@ def rng_sync_check():
|
||||
if state.distributed_type == DistributedType.MULTI_GPU:
|
||||
synchronize_rng_states(["cuda"])
|
||||
assert are_the_same_tensors(torch.cuda.get_rng_state()), "RNG states improperly synchronized on GPU."
|
||||
if version.parse(torch.__version__) >= version.parse("1.6.0"):
|
||||
if is_torch_version(">=", "1.6.0"):
|
||||
generator = torch.Generator()
|
||||
synchronize_rng_states(["generator"], generator=generator)
|
||||
assert are_the_same_tensors(generator.get_state()), "RNG states improperly synchronized in generator."
|
||||
@ -246,71 +252,77 @@ def training_check():
|
||||
|
||||
accelerator.print("Training yielded the same results on one CPU or distributes setup with batch split.")
|
||||
|
||||
# Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
|
||||
print("FP16 training check.")
|
||||
accelerator = Accelerator(mixed_precision="fp16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
if torch.cuda.is_available():
|
||||
# Mostly a test that FP16 doesn't crash as the operation inside the model is not converted to FP16
|
||||
print("FP16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
accelerator = Accelerator(mixed_precision="fp16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
# TEST that previous fp16 flag still works
|
||||
print("Legacy FP16 training check.")
|
||||
accelerator = Accelerator(fp16=True)
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
# TEST that previous fp16 flag still works
|
||||
print("Legacy FP16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
accelerator = Accelerator(fp16=True)
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
# Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
|
||||
print("BF16 training check.")
|
||||
accelerator = Accelerator(mixed_precision="bf16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
# BF16 support is only for CPU + TPU, and some GPU
|
||||
if is_bf16_available():
|
||||
# Mostly a test that BF16 doesn't crash as the operation inside the model is not converted to BF16
|
||||
print("BF16 training check.")
|
||||
AcceleratorState._reset_state()
|
||||
accelerator = Accelerator(mixed_precision="bf16")
|
||||
train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True, generator=generator)
|
||||
model = RegressionModel()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
|
||||
set_seed(42)
|
||||
generator.manual_seed(42)
|
||||
for _ in range(3):
|
||||
for batch in train_dl:
|
||||
model.zero_grad()
|
||||
output = model(batch["x"])
|
||||
loss = torch.nn.functional.mse_loss(output, batch["y"])
|
||||
accelerator.backward(loss)
|
||||
optimizer.step()
|
||||
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
model = accelerator.unwrap_model(model).cpu()
|
||||
assert torch.allclose(old_model.a, model.a), "Did not obtain the same model on CPU or distributed training."
|
||||
assert torch.allclose(old_model.b, model.b), "Did not obtain the same model on CPU or distributed training."
|
||||
|
||||
|
||||
def main():
|
||||
@ -327,7 +339,8 @@ def main():
|
||||
if state.local_process_index == 0:
|
||||
print("\n**DataLoader integration test**")
|
||||
dl_preparation_check()
|
||||
central_dl_preparation_check()
|
||||
if state.distributed_type != DistributedType.TPU:
|
||||
central_dl_preparation_check()
|
||||
|
||||
# Trainings are not exactly the same in DeepSpeed and CPU mode
|
||||
if state.distributed_type == DistributedType.DEEPSPEED:
|
||||
@ -338,5 +351,10 @@ def main():
|
||||
training_check()
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
src/accelerate/test_utils/scripts/test_sync.py (new file, 274 lines)
@ -0,0 +1,274 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.optim import AdamW
|
||||
from torch.optim.lr_scheduler import LambdaLR
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate import Accelerator
|
||||
from accelerate.test_utils import RegressionDataset, RegressionModel
|
||||
from accelerate.utils import DistributedType, set_seed
|
||||
|
||||
|
||||
def check_model_parameters(model_a, model_b, did_step, iteration):
|
||||
for param, grad_param in zip(model_a.parameters(), model_b.parameters()):
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
if not did_step:
|
||||
# Grads should not be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, grad_param.grad) is False
|
||||
), f"Gradients in sync when they should not be at iteration {iteration}:\nmodel_a grad ({param.grad}) == model_b grad ({grad_param.grad})"
|
||||
else:
|
||||
# Grads should be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, grad_param.grad) is True
|
||||
), f"Gradients not in sync when they should be at iteration {iteration}:\nmodel_a grad ({param.grad}) != model_b grad ({grad_param.grad})"
|
||||
|
||||
|
||||
def step_model(model, input, target, accelerator, do_backward=True):
|
||||
model.train()
|
||||
output = model(input)
|
||||
loss = F.mse_loss(output, target.to(output.device))
|
||||
if not do_backward:
|
||||
loss /= accelerator.gradient_accumulation_steps
|
||||
loss.backward()
|
||||
else:
|
||||
accelerator.backward(loss)
|
||||
|
||||
|
||||
def get_training_setup(accelerator, sched=False):
|
||||
"Returns everything needed to perform basic training"
|
||||
set_seed(42)
|
||||
model = RegressionModel()
|
||||
ddp_model = deepcopy(model)
|
||||
dset = RegressionDataset(length=80)
|
||||
dataloader = DataLoader(dset, batch_size=16)
|
||||
model.to(accelerator.device)
|
||||
if sched:
|
||||
opt = AdamW(params=model.parameters(), lr=1e-3)
|
||||
ddp_opt = AdamW(params=ddp_model.parameters(), lr=1e-3)
|
||||
sched = LambdaLR(opt, lr_lambda=lambda epoch: epoch**0.65)
|
||||
ddp_sched = LambdaLR(ddp_opt, lr_lambda=lambda epoch: epoch**0.65)
|
||||
# Make a copy of `model`
|
||||
if sched:
|
||||
ddp_model, ddp_opt, ddp_sched, dataloader = accelerator.prepare(ddp_model, ddp_opt, ddp_sched, dataloader)
|
||||
else:
|
||||
ddp_model, dataloader = accelerator.prepare(ddp_model, dataloader)
|
||||
if sched:
|
||||
return (model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched)
|
||||
return model, ddp_model, dataloader
|
||||
|
||||
|
||||
def test_noop_sync(accelerator):
|
||||
# Test when on a single CPU or GPU that the context manager does nothing
|
||||
model, ddp_model, dataloader = get_training_setup(accelerator)
|
||||
# Use a single batch
|
||||
ddp_input, ddp_target = next(iter(dataloader)).values()
|
||||
for iteration in range(3):
|
||||
# Gather the distributed inputs and targs for the base model
|
||||
input, target = accelerator.gather((ddp_input, ddp_target))
|
||||
input, target = input.to(accelerator.device), target.to(accelerator.device)
|
||||
# Perform our initial ground truth step in non "DDP"
|
||||
step_model(model, input, target, accelerator)
|
||||
# Do "gradient accumulation" (noop)
|
||||
if iteration % 2 == 0:
|
||||
# Accumulate grads locally
|
||||
with accelerator.no_sync(ddp_model):
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
else:
|
||||
# Sync grads
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
|
||||
# Since `no_sync` is a noop, `ddp_model` and `model` grads should always be in sync
|
||||
check_model_parameters(model, ddp_model, True, iteration)
|
||||
for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
assert torch.allclose(
|
||||
param.grad, ddp_param.grad
|
||||
), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
|
||||
|
||||
# Shuffle ddp_input on each iteration
|
||||
torch.manual_seed(1337 + iteration)
|
||||
ddp_input = ddp_input[torch.randperm(len(ddp_input))]
|
||||
|
||||
|
||||
def test_distributed_sync(accelerator):
|
||||
# Test on distributed setup that context manager behaves properly
|
||||
model, ddp_model, dataloader = get_training_setup(accelerator)
|
||||
# Use a single batch
|
||||
ddp_input, ddp_target = next(iter(dataloader)).values()
|
||||
for iteration in range(3):
|
||||
# Gather the distributed inputs and targs for the base model
|
||||
input, target = accelerator.gather((ddp_input, ddp_target))
|
||||
input, target = input.to(accelerator.device), target.to(accelerator.device)
|
||||
# Perform our initial ground truth step in non "DDP"
|
||||
step_model(model, input, target, accelerator)
|
||||
# Do "gradient accumulation" (noop)
|
||||
if iteration % 2 == 0:
|
||||
# Accumulate grads locally
|
||||
with accelerator.no_sync(ddp_model):
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
else:
|
||||
# Sync grads
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
|
||||
# DDP model and model should only be in sync when not (iteration % 2 == 0)
|
||||
for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
if iteration % 2 == 0:
|
||||
# Grads should not be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, ddp_param.grad) is False
|
||||
), f"Gradients in sync when they should not be:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
|
||||
else:
|
||||
# Grads should be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, ddp_param.grad) is True
|
||||
), f"Gradients not in sync when they should be:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
|
||||
|
||||
# Shuffle ddp_input on each iteration
|
||||
torch.manual_seed(1337 + iteration)
|
||||
ddp_input = ddp_input[torch.randperm(len(ddp_input))]
|
||||
|
||||
|
||||
def test_gradient_accumulation(split_batches=False, dispatch_batches=False):
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=2, split_batches=split_batches, dispatch_batches=dispatch_batches
|
||||
)
|
||||
# Test that context manager behaves properly
|
||||
model, ddp_model, dataloader = get_training_setup(accelerator)
|
||||
for iteration, batch in enumerate(dataloader):
|
||||
ddp_input, ddp_target = batch.values()
|
||||
# Gather the distributed inputs and targs for the base model
|
||||
input, target = accelerator.gather((ddp_input, ddp_target))
|
||||
input, target = input.to(accelerator.device), target.to(accelerator.device)
|
||||
# Perform our initial ground truth step in non "DDP"
|
||||
step_model(model, input, target, accelerator, False)
|
||||
# Do "gradient accumulation" (noop)
|
||||
with accelerator.accumulate(ddp_model):
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
|
||||
# DDP model and model should only be in sync when not (iteration % 2 == 0)
|
||||
for param, ddp_param in zip(model.parameters(), ddp_model.parameters()):
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
if ((iteration + 1) % 2 == 0) or (iteration == len(dataloader) - 1):
|
||||
# Grads should be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, ddp_param.grad) is True
|
||||
), f"Gradients not in sync when they should be at iteration {iteration}:\nModel grad ({param.grad}) != DDP grad ({ddp_param.grad})"
|
||||
else:
|
||||
# Grads should not be in sync
|
||||
assert (
|
||||
torch.allclose(param.grad, ddp_param.grad) is False
|
||||
), f"Gradients in sync when they should not be at iteration {iteration}:\nModel grad ({param.grad}) == DDP grad ({ddp_param.grad})"
|
||||
|
||||
# Shuffle ddp_input on each iteration
|
||||
torch.manual_seed(1337 + iteration)
|
||||
ddp_input = ddp_input[torch.randperm(len(ddp_input))]
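Side note, not part of the diff: the user-facing loop shape that `test_gradient_accumulation` exercises, sketched under the assumption that the `gradient_accumulation_steps` argument and the `accumulate` context manager introduced by this PR are available in the installed `accelerate`.

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator(gradient_accumulation_steps=2)
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(16, 4), torch.randn(16, 1))
dataloader = DataLoader(dataset, batch_size=4)

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
for x, y in dataloader:
    with accelerator.accumulate(model):
        loss = torch.nn.functional.mse_loss(model(x), y)
        accelerator.backward(loss)
        optimizer.step()       # a real optimizer update only happens every 2 batches
        optimizer.zero_grad()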
|
||||
|
||||
|
||||
def test_gradient_accumulation_with_opt_and_scheduler(split_batches=False, dispatch_batches=False):
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=2, split_batches=split_batches, dispatch_batches=dispatch_batches
|
||||
)
|
||||
# Test that context manager behaves properly
|
||||
model, opt, sched, dataloader, ddp_model, ddp_opt, ddp_sched = get_training_setup(accelerator, True)
|
||||
for iteration, batch in enumerate(dataloader):
|
||||
ddp_input, ddp_target = batch.values()
|
||||
# Gather the distributed inputs and targs for the base model
|
||||
input, target = accelerator.gather((ddp_input, ddp_target))
|
||||
input, target = input.to(accelerator.device), target.to(accelerator.device)
|
||||
# Perform our initial ground truth step in non "DDP"
|
||||
model.train()
|
||||
ddp_model.train()
|
||||
step_model(model, input, target, accelerator, False)
|
||||
opt.step()
|
||||
if split_batches:
|
||||
sched.step()
|
||||
else:
|
||||
for _ in range(accelerator.num_processes):
|
||||
sched.step()
|
||||
opt.zero_grad()
|
||||
# Perform gradient accumulation under wrapper
|
||||
with accelerator.accumulate(ddp_model):
|
||||
step_model(ddp_model, ddp_input, ddp_target, accelerator)
|
||||
ddp_opt.step()
|
||||
ddp_sched.step()
|
||||
ddp_opt.zero_grad()
|
||||
|
||||
# Learning rates should be the same
|
||||
assert (
|
||||
opt.param_groups[0]["lr"] == ddp_opt.param_groups[0]["lr"]
|
||||
), f'Learning rates found in each optimizer did not align\nopt: {opt.param_groups[0]["lr"]}\nDDP opt: {ddp_opt.param_groups[0]["lr"]}\n'
|
||||
did_step = (((iteration + 1) % 2) == 0) or ((iteration + 1) == len(dataloader))
|
||||
if accelerator.num_processes > 1:
|
||||
check_model_parameters(model, ddp_model, did_step, iteration)
|
||||
# Shuffle ddp_input on each iteration
|
||||
torch.manual_seed(1337 + iteration)
|
||||
|
||||
|
||||
def main():
|
||||
accelerator = Accelerator()
|
||||
state = accelerator.state
|
||||
if state.distributed_type == DistributedType.NO:
|
||||
if state.local_process_index == 0:
|
||||
print("**Test NOOP `no_sync` context manager**")
|
||||
test_noop_sync(accelerator)
|
||||
if state.distributed_type in (DistributedType.MULTI_GPU, DistributedType.MULTI_CPU):
|
||||
if state.local_process_index == 0:
|
||||
print("**Test Distributed `no_sync` context manager**")
|
||||
test_distributed_sync(accelerator)
|
||||
if state.distributed_type == DistributedType.MULTI_GPU:
|
||||
for split_batch in [True, False]:
|
||||
for dispatch_batches in [True, False]:
|
||||
if state.local_process_index == 0:
|
||||
print(
|
||||
"**Test `accumulate` gradient accumulation, ",
|
||||
f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}`**",
|
||||
)
|
||||
test_gradient_accumulation(split_batch)
|
||||
if state.local_process_index == 0:
|
||||
print(
|
||||
"**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
|
||||
"`split_batches=False`, `dispatch_batches=False`**",
|
||||
)
|
||||
test_gradient_accumulation_with_opt_and_scheduler()
|
||||
if state.distributed_type == DistributedType.MULTI_GPU:
|
||||
for split_batch in [True, False]:
|
||||
for dispatch_batches in [True, False]:
|
||||
if not split_batch and not dispatch_batches:
|
||||
continue
|
||||
if state.local_process_index == 0:
|
||||
print(
|
||||
"**Test `accumulate` gradient accumulation with optimizer and scheduler, ",
|
||||
f"`split_batches={split_batch}` and `dispatch_batches={dispatch_batches}`**",
|
||||
)
|
||||
test_gradient_accumulation_with_opt_and_scheduler(split_batch, dispatch_batches)
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -15,6 +15,7 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
@ -25,8 +26,15 @@ from unittest import mock
|
||||
|
||||
import torch
|
||||
|
||||
from ..state import AcceleratorState, is_tpu_available
|
||||
from ..utils import gather, is_tensorflow_available
|
||||
from ..state import AcceleratorState
|
||||
from ..utils import (
|
||||
gather,
|
||||
is_comet_ml_available,
|
||||
is_deepspeed_available,
|
||||
is_tensorboard_available,
|
||||
is_tpu_available,
|
||||
is_wandb_available,
|
||||
)
|
||||
|
||||
|
||||
def parse_flag_from_env(key, default=False):
|
||||
@ -53,10 +61,89 @@ def slow(test_case):
|
||||
Decorator marking a test as slow. Slow tests are skipped by default. Set the RUN_SLOW environment variable to a
|
||||
truthy value to run them.
|
||||
"""
|
||||
if not _run_slow_tests:
|
||||
return unittest.skip("test is slow")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
|
||||
|
||||
|
||||
def require_cpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that must only be run on the CPU. These tests are skipped when a GPU is available.
|
||||
"""
|
||||
return unittest.skipUnless(not torch.cuda.is_available(), "test requires only a CPU")(test_case)
|
||||
|
||||
|
||||
def require_cuda(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires CUDA. These tests are skipped when there is no GPU available.
|
||||
"""
|
||||
return unittest.skipUnless(torch.cuda.is_available(), "test requires a GPU")(test_case)
|
||||
|
||||
|
||||
def require_tpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TPUs. These tests are skipped when there are no TPUs available.
|
||||
"""
|
||||
return unittest.skipUnless(is_tpu_available(), "test requires TPU")(test_case)
|
||||
|
||||
|
||||
def require_single_gpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires CUDA on a single GPU. These tests are skipped when there is no GPU
available or when more than one GPU is present.
|
||||
"""
|
||||
return unittest.skipUnless(torch.cuda.device_count() == 1, "test requires a GPU")(test_case)
|
||||
|
||||
|
||||
def require_multi_gpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires a multi-GPU setup. These tests are skipped on a machine without multiple
|
||||
GPUs.
|
||||
"""
|
||||
return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
|
||||
|
||||
|
||||
def require_deepspeed(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires DeepSpeed installed. These tests are skipped when DeepSpeed isn't installed
|
||||
"""
|
||||
return unittest.skipUnless(is_deepspeed_available(), "test requires DeepSpeed")(test_case)
|
||||
|
||||
|
||||
def require_tensorboard(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires tensorboard installed. These tests are skipped when tensorboard isn't
|
||||
installed
|
||||
"""
|
||||
return unittest.skipUnless(is_tensorboard_available(), "test requires Tensorboard")(test_case)
|
||||
|
||||
|
||||
def require_wandb(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires wandb installed. These tests are skipped when wandb isn't installed
|
||||
"""
|
||||
return unittest.skipUnless(is_wandb_available(), "test requires wandb")(test_case)
|
||||
|
||||
|
||||
def require_comet_ml(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires comet_ml installed. These tests are skipped when comet_ml isn't installed
|
||||
"""
|
||||
return unittest.skipUnless(is_comet_ml_available(), "test requires comet_ml")(test_case)
|
||||
|
||||
|
||||
_atleast_one_tracker_available = (
|
||||
any([is_wandb_available(), is_tensorboard_available()]) and not is_comet_ml_available()
|
||||
)
|
||||
|
||||
|
||||
def require_trackers(test_case):
|
||||
"""
|
||||
Decorator marking that a test requires at least one tracking library installed. These tests are skipped when none
|
||||
are installed
|
||||
"""
|
||||
return unittest.skipUnless(
|
||||
_atleast_one_tracker_available,
|
||||
"test requires at least one tracker to be available and for `comet_ml` to not be installed",
|
||||
)(test_case)
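Side note, not part of the diff: these helpers are thin `unittest.skipUnless` wrappers, so they compose like any other test decorator. A standalone equivalent with a made-up name, for illustration only:

import unittest

import torch


def require_cuda_demo(test_case):
    # Same pattern as the decorators above, under an illustrative name.
    return unittest.skipUnless(torch.cuda.is_available(), "test requires a GPU")(test_case)


class DemoTest(unittest.TestCase):
    @require_cuda_demo
    def test_matmul_on_gpu(self):
        x = torch.ones(2, 2, device="cuda")
        self.assertEqual((x @ x).sum().item(), 8.0)


if __name__ == "__main__":
    unittest.main()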
|
||||
|
||||
|
||||
class TempDirTestCase(unittest.TestCase):
|
||||
@ -136,48 +223,6 @@ def are_the_same_tensors(tensor):
|
||||
return True
|
||||
|
||||
|
||||
def require_cuda(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires CUDA. These tests are skipped when there are no GPU available.
|
||||
"""
|
||||
if not torch.cuda.is_available():
|
||||
return unittest.skip("test requires a GPU")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
def require_tpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TPUs. These tests are skipped when there are no TPUs available.
|
||||
"""
|
||||
if not is_tpu_available():
|
||||
return unittest.skip("test requires TPU")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
def require_multi_gpu(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires a multi-GPU setup. These tests are skipped on a machine without multiple
|
||||
GPUs.
|
||||
"""
|
||||
if torch.cuda.device_count() < 2:
|
||||
return unittest.skip("test requires multiple GPUs")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
def require_tensorflow(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TensorFlow installed. These tests are skipped when TensorFlow isn't
|
||||
installed
|
||||
"""
|
||||
if not is_tensorflow_available():
|
||||
return unittest.skip("test requires TensorFlow")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
class _RunOutput:
|
||||
def __init__(self, returncode, stdout, stderr):
|
||||
self.returncode = returncode
|
||||
@ -251,3 +296,24 @@ def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class SubprocessCallException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def run_command(command: List[str], return_stdout=False):
|
||||
"""
|
||||
Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
|
||||
if an error occurred while running `command`
|
||||
"""
|
||||
try:
|
||||
output = subprocess.check_output(command, stderr=subprocess.STDOUT)
|
||||
if return_stdout:
|
||||
if hasattr(output, "decode"):
|
||||
output = output.decode("utf-8")
|
||||
return output
|
||||
except subprocess.CalledProcessError as e:
|
||||
raise SubprocessCallException(
|
||||
f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
|
||||
) from e
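Side note, not part of the diff: a usage sketch that assumes the `run_command` helper and `SubprocessCallException` defined just above are importable.

import sys

# Successful command: stdout comes back decoded when return_stdout=True.
out = run_command([sys.executable, "-c", "print('hello')"], return_stdout=True)
print(out)

# Failing command: the child's output is surfaced in the exception message.
try:
    run_command([sys.executable, "-c", "raise SystemExit('boom')"])
except SubprocessCallException as err:
    print(err)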
|
||||
|
||||
@ -14,6 +14,9 @@
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate.utils.dataclasses import DistributedType
|
||||
|
||||
|
||||
class RegressionDataset:
|
||||
@ -43,3 +46,43 @@ class RegressionModel(torch.nn.Module):
|
||||
print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
|
||||
self.first_batch = False
|
||||
return x * self.a + self.b
|
||||
|
||||
|
||||
def mocked_dataloaders(accelerator, batch_size: int = 16):
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
|
||||
datasets = load_dataset("csv", data_files=data_files)
|
||||
label_list = datasets["train"].unique("label")
|
||||
|
||||
label_to_id = {v: i for i, v in enumerate(label_list)}
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(
|
||||
examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
|
||||
)
|
||||
if "label" in examples:
|
||||
outputs["labels"] = [label_to_id[l] for l in examples["label"]]
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["sentence1", "sentence2", "label"],
|
||||
)
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
|
||||
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
@ -15,11 +15,11 @@
|
||||
# Expectation:
|
||||
# Provide a project dir name, then each type of logger gets stored in project/{`logging_dir`}
|
||||
|
||||
import logging
|
||||
import os
|
||||
from abc import ABCMeta, abstractmethod, abstractproperty
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .logging import get_logger
|
||||
from .utils import LoggerType, is_comet_ml_available, is_tensorboard_available, is_wandb_available
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ if is_comet_ml_available():
|
||||
_available_trackers.append(LoggerType.COMETML)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def get_available_trackers():
|
||||
@ -136,8 +136,9 @@ class TensorBoardTracker(GeneralTracker):
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
|
||||
`str` to `float`/`int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
@ -146,6 +147,8 @@ class TensorBoardTracker(GeneralTracker):
|
||||
self.writer.add_scalar(k, v, global_step=step)
|
||||
elif isinstance(v, str):
|
||||
self.writer.add_text(k, v, global_step=step)
|
||||
elif isinstance(v, dict):
|
||||
self.writer.add_scalars(k, v, global_step=step)
|
||||
self.writer.flush()
|
||||
logger.info("Successfully logged to TensorBoard")
|
||||
|
||||
@ -170,7 +173,7 @@ class WandBTracker(GeneralTracker):
|
||||
|
||||
def __init__(self, run_name: str):
|
||||
self.run_name = run_name
|
||||
self.run = wandb.init(self.run_name)
|
||||
self.run = wandb.init(project=self.run_name)
|
||||
logger.info(f"Initialized WandB project {self.run_name}")
|
||||
logger.info(
|
||||
"Make sure to log any initial configurations with `self.store_init_configuration` before training!"
|
||||
@ -193,8 +196,9 @@ class WandBTracker(GeneralTracker):
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
|
||||
`str` to `float`/`int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
@ -247,14 +251,21 @@ class CometMLTracker(GeneralTracker):
|
||||
Logs `values` to the current run.
|
||||
|
||||
Args:
|
||||
values (Dictionary `str` to `str`, `float`, or `int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
|
||||
values (Dictionary `str` to `str`, `float`, `int` or `dict` of `str` to `float`/`int`):
|
||||
Values to be logged as key-value pairs. The values need to have type `str`, `float`, `int` or `dict` of
|
||||
`str` to `float`/`int`.
|
||||
step (`int`, *optional*):
|
||||
The run step. If included, the log will be affiliated with this step.
|
||||
"""
|
||||
if step is not None:
|
||||
self.writer.set_step(step)
|
||||
self.writer.log_others(values)
|
||||
for k, v in values.items():
|
||||
if isinstance(v, (int, float)):
|
||||
self.writer.log_metric(k, v, step=step)
|
||||
elif isinstance(v, str):
|
||||
self.writer.log_other(k, v)
|
||||
elif isinstance(v, dict):
|
||||
self.writer.log_metrics(v, step=step)
|
||||
logger.info("Successfully logged to CometML")
|
||||
|
||||
def finish(self):
|
||||
|
||||
src/accelerate/utils/__init__.py (new file, 101 lines)
@ -0,0 +1,101 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all
|
||||
|
||||
from .constants import MODEL_NAME, OPTIMIZER_NAME, RNG_STATE_NAME, SCALER_NAME, SCHEDULER_NAME
|
||||
from .dataclasses import (
|
||||
ComputeEnvironment,
|
||||
DeepSpeedPlugin,
|
||||
DistributedDataParallelKwargs,
|
||||
DistributedType,
|
||||
FullyShardedDataParallelPlugin,
|
||||
GradScalerKwargs,
|
||||
InitProcessGroupKwargs,
|
||||
KwargsHandler,
|
||||
LoggerType,
|
||||
PrecisionType,
|
||||
RNGType,
|
||||
SageMakerDistributedType,
|
||||
TensorInformation,
|
||||
)
|
||||
from .imports import (
|
||||
is_apex_available,
|
||||
is_bf16_available,
|
||||
is_boto3_available,
|
||||
is_ccl_available,
|
||||
is_comet_ml_available,
|
||||
is_deepspeed_available,
|
||||
is_sagemaker_available,
|
||||
is_tensorboard_available,
|
||||
is_tpu_available,
|
||||
is_transformers_available,
|
||||
is_wandb_available,
|
||||
)
|
||||
from .modeling import (
|
||||
check_device_map,
|
||||
compute_module_sizes,
|
||||
convert_file_size_to_int,
|
||||
dtype_byte_size,
|
||||
find_tied_parameters,
|
||||
get_max_layer_size,
|
||||
get_max_memory,
|
||||
infer_auto_device_map,
|
||||
load_checkpoint_in_model,
|
||||
load_offloaded_weights,
|
||||
named_module_tensors,
|
||||
set_module_tensor_to_device,
|
||||
)
|
||||
from .offload import (
|
||||
OffloadedWeightsLoader,
|
||||
PrefixedDataset,
|
||||
extract_submodules_state_dict,
|
||||
load_offloaded_weight,
|
||||
offload_state_dict,
|
||||
offload_weight,
|
||||
save_offload_index,
|
||||
)
|
||||
from .operations import (
|
||||
broadcast,
|
||||
broadcast_object_list,
|
||||
concatenate,
|
||||
convert_outputs_to_fp32,
|
||||
convert_to_fp32,
|
||||
find_batch_size,
|
||||
find_device,
|
||||
gather,
|
||||
gather_object,
|
||||
get_data_structure,
|
||||
honor_type,
|
||||
initialize_tensors,
|
||||
is_tensor_information,
|
||||
is_torch_tensor,
|
||||
pad_across_processes,
|
||||
recursively_apply,
|
||||
reduce,
|
||||
send_to_device,
|
||||
slice_tensors,
|
||||
)
|
||||
from .versions import compare_versions, is_torch_version
|
||||
|
||||
|
||||
if is_deepspeed_available():
|
||||
from .deepspeed import (
|
||||
DeepSpeedEngineWrapper,
|
||||
DeepSpeedOptimizerWrapper,
|
||||
DeepSpeedSchedulerWrapper,
|
||||
DummyOptim,
|
||||
DummyScheduler,
|
||||
HfDeepSpeedConfig,
|
||||
)
|
||||
|
||||
from .launch import PrepareForLaunch, get_launch_prefix
|
||||
from .memory import find_executable_batch_size
|
||||
from .other import (
|
||||
extract_model_from_parallel,
|
||||
get_pretty_name,
|
||||
patch_environment,
|
||||
save,
|
||||
wait_for_everyone,
|
||||
write_basic_config,
|
||||
)
|
||||
from .random import set_seed, synchronize_rng_state, synchronize_rng_states
|
||||
src/accelerate/utils/constants.py (new file, 32 lines)
@ -0,0 +1,32 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import operator as op
|
||||
|
||||
|
||||
SCALER_NAME = "scaler.pt"
|
||||
MODEL_NAME = "pytorch_model"
|
||||
RNG_STATE_NAME = "random_states"
|
||||
OPTIMIZER_NAME = "optimizer"
|
||||
SCHEDULER_NAME = "scheduler"
|
||||
SAGEMAKER_PYTORCH_VERSION = "1.10.2"
|
||||
SAGEMAKER_PYTHON_VERSION = "py38"
|
||||
SAGEMAKER_TRANSFORMERS_VERSION = "4.17.0"
|
||||
SAGEMAKER_PARALLEL_EC2_INSTANCES = ["ml.p3.16xlarge", "ml.p3dn.24xlarge", "ml.p4dn.24xlarge"]
|
||||
FSDP_SHARDING_STRATEGY = ["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD"]
|
||||
FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"]
|
||||
FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"]
|
||||
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich"]
|
||||
|
||||
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
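Side note, not part of the diff: this operator table is what lets string comparators such as `is_torch_version(">=", "1.6.0")` (used earlier in this diff) be resolved. A self-contained sketch of the idea, not the library function itself:

import operator as op

from packaging import version

STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}


def compare_versions_demo(found, operation, requirement):
    # Illustrative re-implementation for this sketch only.
    return STR_OPERATION_TO_FUNC[operation](version.parse(found), version.parse(requirement))


print(compare_versions_demo("1.12.0", ">=", "1.6.0"))  # True
print(compare_versions_demo("1.5.1", ">=", "1.6.0"))   # False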
|
||||
src/accelerate/utils/dataclasses.py (new file, 529 lines)
@ -0,0 +1,529 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
General namespace and dataclass related classes
|
||||
"""
|
||||
|
||||
import copy
|
||||
import enum
|
||||
import functools
|
||||
import os
|
||||
import typing
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import timedelta
|
||||
from typing import Any, Callable, Iterable, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from .constants import FSDP_AUTO_WRAP_POLICY, FSDP_BACKWARD_PREFETCH
|
||||
|
||||
|
||||
class KwargsHandler:
|
||||
"""
|
||||
Internal mixin that implements a `to_kwargs()` method for a dataclass.
|
||||
"""
|
||||
|
||||
def to_dict(self):
|
||||
return copy.deepcopy(self.__dict__)
|
||||
|
||||
def to_kwargs(self):
|
||||
"""
|
||||
Returns a dictionary containing the attributes with values different from the default of this class.
|
||||
"""
|
||||
default_dict = self.__class__().to_dict()
|
||||
this_dict = self.to_dict()
|
||||
return {k: v for k, v in this_dict.items() if default_dict[k] != v}
|
||||
|
||||
|
||||
@dataclass
|
||||
class DistributedDataParallelKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize how your model is wrapped in a
|
||||
`torch.nn.parallel.DistributedDataParallel`. Please refer to the documentation of this
|
||||
[wrapper](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) for more
|
||||
information on each argument.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`gradient_as_bucket_view` is only available in PyTorch 1.7.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
dim: int = 0
|
||||
broadcast_buffers: bool = True
|
||||
bucket_cap_mb: int = 25
|
||||
find_unused_parameters: bool = False
|
||||
check_reduction: bool = False
|
||||
gradient_as_bucket_view: bool = False
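Side note, not part of the diff: a hedged sketch of how a `KwargsHandler` dataclass surfaces only non-default values through `to_kwargs()`; the class names below are made up for illustration.

import copy
from dataclasses import dataclass


class KwargsHandlerDemo:
    def to_dict(self):
        return copy.deepcopy(self.__dict__)

    def to_kwargs(self):
        # Only report fields whose value differs from the dataclass default.
        default_dict = self.__class__().to_dict()
        return {k: v for k, v in self.to_dict().items() if default_dict[k] != v}


@dataclass
class DDPKwargsDemo(KwargsHandlerDemo):
    find_unused_parameters: bool = False
    bucket_cap_mb: int = 25


print(DDPKwargsDemo(find_unused_parameters=True).to_kwargs())  # {'find_unused_parameters': True}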
|
||||
|
||||
|
||||
@dataclass
|
||||
class GradScalerKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize the behavior of mixed precision, specifically how the
|
||||
`torch.cuda.amp.GradScaler` used is created. Please refer to the documentation of this
|
||||
[scaler](https://pytorch.org/docs/stable/amp.html?highlight=gradscaler) for more information on each argument.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
`GradScaler` is only available in PyTorch 1.5.0 and later versions.
|
||||
|
||||
</Tip>"""
|
||||
|
||||
init_scale: float = 65536.0
|
||||
growth_factor: float = 2.0
|
||||
backoff_factor: float = 0.5
|
||||
growth_interval: int = 2000
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class InitProcessGroupKwargs(KwargsHandler):
|
||||
"""
|
||||
Use this object in your [`Accelerator`] to customize the initialization of the distributed processes. Please refer
|
||||
to the documentation of this
|
||||
[method](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more
|
||||
information on each argument.
|
||||
"""
|
||||
|
||||
init_method: Optional[str] = None
|
||||
timeout: timedelta = timedelta(seconds=1800)
|
||||
|
||||
|
||||
class DistributedType(str, enum.Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **NO** -- Not a distributed environment, just a single process.
|
||||
- **MULTI_CPU** -- Distributed on multiple CPU nodes.
|
||||
- **MULTI_GPU** -- Distributed on multiple GPUs.
|
||||
- **DEEPSPEED** -- Using DeepSpeed.
|
||||
- **TPU** -- Distributed on TPUs.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `DistributedType` to be JSON-serializable out of the box.
|
||||
NO = "NO"
|
||||
MULTI_CPU = "MULTI_CPU"
|
||||
MULTI_GPU = "MULTI_GPU"
|
||||
DEEPSPEED = "DEEPSPEED"
|
||||
FSDP = "FSDP"
|
||||
TPU = "TPU"
|
||||
|
||||
|
||||
class SageMakerDistributedType(str, enum.Enum):
|
||||
"""
|
||||
Represents a type of distributed environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **NO** -- Not a distributed environment, just a single process.
|
||||
- **DATA_PARALLEL** -- using sagemaker distributed data parallelism.
|
||||
- **MODEL_PARALLEL** -- using sagemaker distributed model parallelism.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `SageMakerDistributedType` to be JSON-serializable out of the box.
|
||||
NO = "NO"
|
||||
DATA_PARALLEL = "DATA_PARALLEL"
|
||||
MODEL_PARALLEL = "MODEL_PARALLEL"
|
||||
|
||||
|
||||
class ComputeEnvironment(str, enum.Enum):
|
||||
"""
|
||||
Represents a type of the compute environment.
|
||||
|
||||
Values:
|
||||
|
||||
- **LOCAL_MACHINE** -- private/custom cluster hardware.
|
||||
- **AMAZON_SAGEMAKER** -- Amazon SageMaker as compute environment.
|
||||
"""
|
||||
|
||||
# Subclassing str as well as Enum allows the `ComputeEnvironment` to be JSON-serializable out of the box.
|
||||
LOCAL_MACHINE = "LOCAL_MACHINE"
|
||||
AMAZON_SAGEMAKER = "AMAZON_SAGEMAKER"
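Side note, not part of the diff: what the recurring "Subclassing str as well as Enum" comment buys. The enum name below is made up for illustration.

import enum
import json


class DistributedTypeDemo(str, enum.Enum):
    NO = "NO"
    MULTI_GPU = "MULTI_GPU"


# Members behave as plain strings, so they serialize and compare without special handling.
print(json.dumps({"distributed_type": DistributedTypeDemo.MULTI_GPU}))  # {"distributed_type": "MULTI_GPU"}
print(DistributedTypeDemo.MULTI_GPU == "MULTI_GPU")                     # True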
|
||||
|
||||
|
||||
class EnumWithContains(enum.EnumMeta):
|
||||
"A metaclass that adds the ability to check if `self` contains an item with the `in` operator"
|
||||
|
||||
def __contains__(cls, item):
|
||||
try:
|
||||
cls(item)
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class BaseEnum(enum.Enum, metaclass=EnumWithContains):
|
||||
"An enum class that can get the value of an item with `str(Enum.key)`"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
@classmethod
|
||||
def list(cls):
|
||||
"Method to list all the possible items in `cls`"
|
||||
return list(map(lambda item: str(item), cls))
|
||||
|
||||
|
||||
class LoggerType(BaseEnum):
|
||||
"""Represents a type of supported experiment tracker
|
||||
|
||||
Values:
|
||||
|
||||
- **ALL** -- all available trackers in the environment that are supported
|
||||
- **TENSORBOARD** -- TensorBoard as an experiment tracker
|
||||
- **WANDB** -- wandb as an experiment tracker
|
||||
- **COMETML** -- comet_ml as an experiment tracker
|
||||
"""
|
||||
|
||||
ALL = "all"
|
||||
TENSORBOARD = "tensorboard"
|
||||
WANDB = "wandb"
|
||||
COMETML = "comet_ml"
|
||||
|
||||
|
||||
class PrecisionType(BaseEnum):
|
||||
"""Represents a type of precision used on floating point values
|
||||
|
||||
Values:
|
||||
|
||||
- **NO** -- using full precision (FP32)
|
||||
- **FP16** -- using half precision
|
||||
- **BF16** -- using brain floating point precision
|
||||
"""
|
||||
|
||||
NO = "no"
|
||||
FP16 = "fp16"
|
||||
BF16 = "bf16"
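# Example (not part of this module): because of `EnumWithContains` and `BaseEnum`, membership
# tests and listing work directly on the string values, which is convenient when validating
# user input. A small sketch using the enums defined above:
from accelerate.utils.dataclasses import PrecisionType

assert "fp16" in PrecisionType                         # EnumWithContains.__contains__
assert str(PrecisionType.BF16) == "bf16"               # BaseEnum.__str__
assert PrecisionType.list() == ["no", "fp16", "bf16"]  # BaseEnum.list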
|
||||
|
||||
|
||||
class RNGType(BaseEnum):
|
||||
TORCH = "torch"
|
||||
CUDA = "cuda"
|
||||
XLA = "xla"
|
||||
GENERATOR = "generator"
|
||||
|
||||
|
||||
# data classes
|
||||
|
||||
|
||||
@dataclass
|
||||
class TensorInformation:
|
||||
shape: torch.Size
|
||||
dtype: torch.dtype
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeepSpeedPlugin:
|
||||
"""
|
||||
This plugin is used to integrate DeepSpeed.
|
||||
"""
|
||||
|
||||
hf_ds_config: Any = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "path to DeepSpeed config file or dict or an object of class `accelerate.utils.deepspeed.HfDeepSpeedConfig`."
|
||||
},
|
||||
)
|
||||
gradient_accumulation_steps: int = field(
|
||||
default=None, metadata={"help": "Number of steps to accumulate gradients before updating optimizer states"}
|
||||
)
|
||||
gradient_clipping: float = field(default=None, metadata={"help": "Enable gradient clipping with value"})
|
||||
zero_stage: int = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
|
||||
)
|
||||
is_train_batch_min: bool = field(
|
||||
default=True,
|
||||
metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
|
||||
)
|
||||
offload_optimizer_device: str = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3."},
|
||||
)
|
||||
offload_param_device: str = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3."},
|
||||
)
|
||||
zero3_init_flag: bool = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models."
|
||||
"Only applicable with ZeRO Stage-3."
|
||||
},
|
||||
)
|
||||
zero3_save_16bit_model: bool = field(
|
||||
default=None,
|
||||
metadata={"help": "Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
from .deepspeed import HfDeepSpeedConfig
|
||||
|
||||
if self.hf_ds_config is None:
|
||||
self.hf_ds_config = os.environ.get("DEEPSPEED_CONFIG_FILE", "none")
|
||||
if (
|
||||
isinstance(self.hf_ds_config, dict)
|
||||
or (isinstance(self.hf_ds_config, str) and self.hf_ds_config != "none")
|
||||
or isinstance(self.hf_ds_config, HfDeepSpeedConfig)
|
||||
):
|
||||
if not isinstance(self.hf_ds_config, HfDeepSpeedConfig):
|
||||
self.hf_ds_config = HfDeepSpeedConfig(self.hf_ds_config)
|
||||
if "gradient_accumulation_steps" not in self.hf_ds_config.config:
|
||||
self.hf_ds_config.config["gradient_accumulation_steps"] = 1
|
||||
elif self.hf_ds_config.config["gradient_accumulation_steps"] == "auto":
|
||||
raise ValueError("gradient_accumulation_steps cannot be set to 'auto' in the DeepSpeed config.")
|
||||
if "zero_optimization" not in self.hf_ds_config.config:
|
||||
raise ValueError("Please specify the ZeRO optimization config in the DeepSpeed config.")
|
||||
else:
|
||||
if self.gradient_accumulation_steps is None:
|
||||
self.gradient_accumulation_steps = int(os.environ.get("GRADIENT_ACCUMULATION_STEPS", 1))
|
||||
|
||||
if self.gradient_clipping is None:
|
||||
gradient_clipping = os.environ.get("GRADIENT_CLIPPING", "none")
|
||||
if gradient_clipping != "none":
|
||||
self.gradient_clipping = float(gradient_clipping)
|
||||
|
||||
if self.zero_stage is None:
|
||||
self.zero_stage = int(os.environ.get("DEEPSPEED_ZERO_STAGE", 2))
|
||||
|
||||
if self.offload_optimizer_device is None:
|
||||
self.offload_optimizer_device = os.environ.get("DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE", "none")
|
||||
|
||||
if self.offload_param_device is None:
|
||||
self.offload_param_device = os.environ.get("DEEPSPEED_OFFLOAD_PARAM_DEVICE", "none")
|
||||
|
||||
if self.zero3_save_16bit_model is None:
|
||||
self.zero3_save_16bit_model = os.environ.get("DEEPSPEED_ZERO3_SAVE_16BIT_MODEL", "false") == "true"
|
||||
|
||||
config = {
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"gradient_accumulation_steps": self.gradient_accumulation_steps,
|
||||
"zero_optimization": {
|
||||
"stage": self.zero_stage,
|
||||
"offload_optimizer": {
|
||||
"device": self.offload_optimizer_device,
|
||||
},
|
||||
"offload_param": {
|
||||
"device": self.offload_param_device,
|
||||
},
|
||||
"stage3_gather_16bit_weights_on_model_save": self.zero3_save_16bit_model,
|
||||
},
|
||||
}
|
||||
if self.gradient_clipping:
|
||||
config["gradient_clipping"] = self.gradient_clipping
|
||||
self.hf_ds_config = HfDeepSpeedConfig(config)
|
||||
self.deepspeed_config = self.hf_ds_config.config
|
||||
self.deepspeed_config["steps_per_print"] = float("inf") # this will stop deepspeed from logging @ stdout
|
||||
if self.zero3_init_flag is None:
|
||||
self.zero3_init_flag = os.environ.get("DEEPSPEED_ZERO3_INIT", "false") == "true"
|
||||
if self.zero3_init_flag and not self.hf_ds_config.is_zero3():
|
||||
warnings.warn("DeepSpeed Zero3 Init flag is only applicable for ZeRO Stage 3. Setting it to False.")
|
||||
self.zero3_init_flag = False
|
||||
|
||||
def fill_match(self, ds_key_long, mismatches, must_match=True, **kwargs):
|
||||
config, ds_key = self.hf_ds_config.find_config_node(ds_key_long)
|
||||
if config is None:
|
||||
return
|
||||
|
||||
if config.get(ds_key) == "auto":
|
||||
if ds_key_long in kwargs:
|
||||
config[ds_key] = kwargs[ds_key_long]
|
||||
return
|
||||
else:
|
||||
raise ValueError(
|
||||
f"`{ds_key_long}` not found in kwargs. "
|
||||
f"Please specify `{ds_key_long}` without `auto`(set to correct value) in the DeepSpeed config file or "
|
||||
"pass it in kwargs."
|
||||
)
|
||||
|
||||
if not must_match:
|
||||
return
|
||||
|
||||
ds_val = config.get(ds_key)
|
||||
if ds_val is not None and ds_key_long in kwargs:
|
||||
if ds_val != kwargs[ds_key_long]:
|
||||
mismatches.append(f"- ds {ds_key_long}={ds_val} vs arg {ds_key_long}={kwargs[ds_key_long]}")
|
||||
|
||||
def deepspeed_config_process(self, prefix="", mismatches=None, config=None, must_match=True, **kwargs):
|
||||
"""Process the DeepSpeed config with the values from the kwargs."""
|
||||
mismatches = [] if mismatches is None else mismatches
|
||||
if config is None:
|
||||
config = self.deepspeed_config
|
||||
for key, value in config.items():
|
||||
if isinstance(value, dict):
|
||||
self.deepspeed_config_process(
|
||||
prefix=prefix + key + ".", mismatches=mismatches, config=value, must_match=must_match, **kwargs
|
||||
)
|
||||
else:
|
||||
self.fill_match(prefix + key, mismatches, must_match=must_match, **kwargs)
|
||||
if len(mismatches) > 0 and prefix == "":
|
||||
mismatches_msg = "\n".join(mismatches)
|
||||
raise ValueError(
|
||||
"Please correct the following DeepSpeed config values that mismatch kwargs "
|
||||
f" values:\n{mismatches_msg}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
|
||||
)
|
||||
|
||||
def set_mixed_precision(self, mixed_precision):
|
||||
ds_config = self.deepspeed_config
|
||||
if mixed_precision == "fp16" and "fp16" not in ds_config and "bf16" not in ds_config:
|
||||
ds_config.update({"fp16": {"enabled": True}})
|
||||
elif mixed_precision == "bf16" and "fp16" not in ds_config and "bf16" not in ds_config:
|
||||
ds_config.update({"bf16": {"enabled": True}})
|
||||
|
||||
def set_deepspeed_weakref(self):
|
||||
from .imports import is_transformers_available
|
||||
|
||||
if self.zero3_init_flag:
|
||||
if not is_transformers_available():
|
||||
raise Exception(
|
||||
"When `zero3_init_flag` is set, it requires Transformers to be installed. "
|
||||
"Please run `pip install transformers`."
|
||||
)
|
||||
ds_config = copy.deepcopy(self.deepspeed_config)
|
||||
if "gradient_accumulation_steps" not in ds_config or ds_config["gradient_accumulation_steps"] == "auto":
|
||||
ds_config["gradient_accumulation_steps"] = 1
|
||||
if (
|
||||
"train_micro_batch_size_per_gpu" not in ds_config
|
||||
or ds_config["train_micro_batch_size_per_gpu"] == "auto"
|
||||
):
|
||||
ds_config["train_micro_batch_size_per_gpu"] = 1
|
||||
if ds_config["train_batch_size"] == "auto":
|
||||
del ds_config["train_batch_size"]
|
||||
|
||||
from transformers.deepspeed import HfDeepSpeedConfig
|
||||
|
||||
self.dschf = HfDeepSpeedConfig(ds_config) # keep this object alive # noqa
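# Example usage (not part of this module): a minimal sketch of building the plugin by hand.
# Constructing it only requires this package; actually training with it would happen under an
# `accelerate launch` run configured for DeepSpeed, roughly as in the commented line below.
from accelerate.utils import DeepSpeedPlugin

ds_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=1, gradient_clipping=1.0)
# accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=ds_plugin)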
|
||||
|
||||
|
||||
@dataclass
|
||||
class FullyShardedDataParallelPlugin:
|
||||
"""
|
||||
This plugin is used to enable fully sharded data parallelism.
|
||||
"""
|
||||
|
||||
sharding_strategy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "FSDP Sharding Strategy of type `torch.distributed.fsdp.fully_sharded_data_parallel.ShardingStrategy`"
|
||||
},
|
||||
)
|
||||
backward_prefetch: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "FSDP Backward Prefetch of type `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`"
|
||||
},
|
||||
)
|
||||
mixed_precision_policy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "A config to enable mixed precision training with FullyShardedDataParallel. "
|
||||
"The 3 flags that are set are `param_dtype`, `reduce_dtype`, `buffer_dtype`. "
|
||||
"Each flag expects `torch.dtype` as the value. "
|
||||
"It is of type `torch.distributed.fsdp.fully_sharded_data_parallel.MixedPrecision`."
|
||||
},
|
||||
)
|
||||
auto_wrap_policy: Optional[Callable] = field(
|
||||
default=None,
|
||||
metadata={"help": "A callable specifying a policy to recursively wrap layers with FSDP"},
|
||||
)
|
||||
cpu_offload: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "Decides Whether to offload parameters and gradients to CPU. "
|
||||
"It is of type `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload`."
|
||||
},
|
||||
)
|
||||
ignored_modules: Optional[Iterable[torch.nn.Module]] = field(
|
||||
default=None,
|
||||
metadata={"help": "A list of modules to ignore for FSDP."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, CPUOffload, ShardingStrategy
|
||||
|
||||
if self.sharding_strategy is None:
|
||||
self.sharding_strategy = ShardingStrategy(int(os.environ.get("FSDP_SHARDING_STRATEGY", 1)))
|
||||
|
||||
if self.cpu_offload is None:
|
||||
if os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true":
|
||||
self.cpu_offload = CPUOffload(offload_params=True)
|
||||
else:
|
||||
self.cpu_offload = CPUOffload(offload_params=False)
|
||||
|
||||
if self.backward_prefetch is None:
|
||||
prefetch_policy = os.environ.get("FSDP_BACKWARD_PREFETCH", FSDP_BACKWARD_PREFETCH[-1])
|
||||
if prefetch_policy != FSDP_BACKWARD_PREFETCH[-1]:
|
||||
self.backward_prefetch = BackwardPrefetch(FSDP_BACKWARD_PREFETCH.index(prefetch_policy) + 1)
|
||||
|
||||
@staticmethod
|
||||
def get_module_class_from_name(module, name):
|
||||
"""
|
||||
Gets a class from a module by its name.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module to get the class from.
|
||||
name (`str`): The name of the class.
|
||||
"""
|
||||
modules_children = list(module.children())
|
||||
if module.__class__.__name__ == name:
|
||||
return module.__class__
|
||||
elif len(modules_children) == 0:
|
||||
return
|
||||
else:
|
||||
for child_module in modules_children:
|
||||
module_class = FullyShardedDataParallelPlugin.get_module_class_from_name(child_module, name)
|
||||
if module_class is not None:
|
||||
return module_class
|
||||
|
||||
def set_auto_wrap_policy(self, model):
|
||||
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
|
||||
|
||||
if self.auto_wrap_policy is None:
|
||||
auto_wrap_policy = os.environ.get("FSDP_AUTO_WRAP_POLICY", FSDP_AUTO_WRAP_POLICY[-1])
|
||||
if auto_wrap_policy == FSDP_AUTO_WRAP_POLICY[0]:
|
||||
transformer_cls_to_wrap = os.environ.get("FSDP_TRANSFORMER_CLS_TO_WRAP", "")
|
||||
transformer_cls_to_wrap = FullyShardedDataParallelPlugin.get_module_class_from_name(
|
||||
model, transformer_cls_to_wrap
|
||||
)
|
||||
if transformer_cls_to_wrap is None:
|
||||
raise Exception("Could not find the transformer layer class to wrap in the model.")
|
||||
self.auto_wrap_policy = functools.partial(
|
||||
transformer_auto_wrap_policy,
|
||||
# Transformer layer class to wrap
|
||||
transformer_layer_cls={transformer_cls_to_wrap},
|
||||
)
|
||||
elif auto_wrap_policy == FSDP_AUTO_WRAP_POLICY[1]:
|
||||
min_num_params = int(os.environ.get("FSDP_MIN_NUM_PARAMS", 0))
|
||||
if min_num_params > 0:
|
||||
self.auto_wrap_policy = functools.partial(
|
||||
size_based_auto_wrap_policy, min_num_params=min_num_params
|
||||
)
|
||||
|
||||
def set_mixed_precision(self, mixed_precision):
|
||||
if mixed_precision == "fp16":
|
||||
dtype = torch.float16
|
||||
elif mixed_precision == "bf16":
|
||||
dtype = torch.bfloat16
|
||||
else:
|
||||
raise ValueError(f"Unknown mixed precision value: {mixed_precision}")
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
|
||||
|
||||
if self.mixed_precision_policy is None:
|
||||
self.mixed_precision_policy = MixedPrecision(param_dtype=dtype, reduce_dtype=dtype, buffer_dtype=dtype)
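# Example usage (not part of this module): a sketch enabling CPU offload of parameters through
# the FSDP plugin. It needs a PyTorch build that ships `torch.distributed.fsdp`, and the
# commented `Accelerator` call would be issued under a multi-GPU `accelerate launch` run.
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload

from accelerate.utils import FullyShardedDataParallelPlugin

fsdp_plugin = FullyShardedDataParallelPlugin(cpu_offload=CPUOffload(offload_params=True))
# accelerator = Accelerator(fsdp_plugin=fsdp_plugin)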
|
||||
src/accelerate/utils/deepspeed.py (new file, +252 lines)
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import io
|
||||
import json
|
||||
from copy import deepcopy
|
||||
|
||||
from ..optimizer import AcceleratedOptimizer
|
||||
from ..scheduler import AcceleratedScheduler
|
||||
|
||||
|
||||
class HfDeepSpeedConfig:
|
||||
"""
|
||||
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
|
||||
|
||||
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
|
||||
things like the Trainer object are not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
|
||||
it's important that this object remains alive while the program is still running.
|
||||
|
||||
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
|
||||
with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
|
||||
the DeepSpeed configuration is not modified in any way.
|
||||
|
||||
Args:
|
||||
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config_file_or_dict):
|
||||
|
||||
if isinstance(config_file_or_dict, dict):
|
||||
# Don't modify user's data should they want to reuse it (e.g. in tests), because once we
|
||||
# modified it, it will not be accepted here again, since `auto` values would have been overridden
|
||||
config = deepcopy(config_file_or_dict)
|
||||
elif isinstance(config_file_or_dict, str):
|
||||
with io.open(config_file_or_dict, "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
else:
|
||||
raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict")
|
||||
self.config = config
|
||||
|
||||
# zero stage - this is done as early as possible, before model is created, to allow
|
||||
# ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object
|
||||
# during ``zero.Init()`` which needs to know the dtype, and some other hparams.
|
||||
self._stage = self.get_value("zero_optimization.stage", -1)
|
||||
|
||||
# offload
|
||||
self._offload = False
|
||||
if self.is_zero2() or self.is_zero3():
|
||||
offload_devices_valid = set(["cpu", "nvme"])
|
||||
offload_devices = set(
|
||||
[
|
||||
self.get_value("zero_optimization.offload_optimizer.device"),
|
||||
self.get_value("zero_optimization.offload_param.device"),
|
||||
]
|
||||
)
|
||||
if len(offload_devices & offload_devices_valid) > 0:
|
||||
self._offload = True
|
||||
|
||||
def find_config_node(self, ds_key_long):
|
||||
config = self.config
|
||||
|
||||
# find the config node of interest if it exists
|
||||
nodes = ds_key_long.split(".")
|
||||
ds_key = nodes.pop()
|
||||
for node in nodes:
|
||||
config = config.get(node)
|
||||
if config is None:
|
||||
return None, ds_key
|
||||
|
||||
return config, ds_key
|
||||
|
||||
def get_value(self, ds_key_long, default=None):
|
||||
"""
|
||||
Returns the set value or `default` if no value is set
|
||||
"""
|
||||
config, ds_key = self.find_config_node(ds_key_long)
|
||||
if config is None:
|
||||
return default
|
||||
return config.get(ds_key, default)
|
||||
|
||||
def del_config_sub_tree(self, ds_key_long, must_exist=False):
|
||||
"""
|
||||
Deletes a sub-section of the config file if it's found.
|
||||
|
||||
Unless `must_exist` is `True` the section doesn't have to exist.
|
||||
"""
|
||||
config = self.config
|
||||
|
||||
# find the config node of interest if it exists
|
||||
nodes = ds_key_long.split(".")
|
||||
for node in nodes:
|
||||
parent_config = config
|
||||
config = config.get(node)
|
||||
if config is None:
|
||||
if must_exist:
|
||||
raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}")
|
||||
else:
|
||||
return
|
||||
|
||||
# if found remove it
|
||||
if parent_config is not None:
|
||||
parent_config.pop(node)
|
||||
|
||||
def is_true(self, ds_key_long):
|
||||
"""
|
||||
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very
|
||||
specific question of whether the value is set to `True` (and it's not set to `False` or isn't set).
|
||||
|
||||
"""
|
||||
value = self.get_value(ds_key_long)
|
||||
return False if value is None else bool(value)
|
||||
|
||||
def is_false(self, ds_key_long):
|
||||
"""
|
||||
Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very
|
||||
specific question of whether the value is set to `False` (and it's not set to `True` or isn't set).
|
||||
"""
|
||||
value = self.get_value(ds_key_long)
|
||||
return False if value is None else not bool(value)
|
||||
|
||||
def is_zero2(self):
|
||||
return self._stage == 2
|
||||
|
||||
def is_zero3(self):
|
||||
return self._stage == 3
|
||||
|
||||
def is_offload(self):
|
||||
return self._offload
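# Example (not part of this module): a quick sketch of querying a config dict through the
# dotted-path helpers defined above.
from accelerate.utils.deepspeed import HfDeepSpeedConfig

ds_config = {
    "train_micro_batch_size_per_gpu": 8,
    "zero_optimization": {"stage": 3, "offload_param": {"device": "cpu"}},
}
hf_ds_config = HfDeepSpeedConfig(ds_config)
assert hf_ds_config.is_zero3() and hf_ds_config.is_offload()
assert hf_ds_config.get_value("zero_optimization.offload_param.device") == "cpu"
assert hf_ds_config.get_value("gradient_clipping", default=1.0) == 1.0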
|
||||
|
||||
|
||||
class DeepSpeedEngineWrapper:
|
||||
"""
|
||||
Internal wrapper for deepspeed.runtime.engine.DeepSpeedEngine. This is used to follow the conventional training loop.
|
||||
|
||||
Args:
|
||||
engine (deepspeed.runtime.engine.DeepSpeedEngine): deepspeed engine to wrap
|
||||
"""
|
||||
|
||||
def __init__(self, engine):
|
||||
self.engine = engine
|
||||
|
||||
def backward(self, loss):
|
||||
# runs backpropagation and handles mixed precision
|
||||
self.engine.backward(loss)
|
||||
|
||||
# deepspeed `engine.step` performs following operations:
|
||||
# gradient accumulation check
|
||||
# gradient clipping
|
||||
# optimizer step
|
||||
# zero grad
|
||||
# checking overflow
|
||||
# lr_scheduler step
|
||||
self.engine.step()
|
||||
|
||||
|
||||
class DeepSpeedOptimizerWrapper(AcceleratedOptimizer):
|
||||
"""
|
||||
Internal wrapper around a deepspeed optimizer.
|
||||
|
||||
Args:
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer):
|
||||
super().__init__(optimizer, device_placement=False, scaler=None)
|
||||
|
||||
def zero_grad(self, set_to_none=None):
|
||||
pass  # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
|
||||
|
||||
def step(self):
|
||||
pass  # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
|
||||
|
||||
@property
|
||||
def step_was_skipped(self):
|
||||
"""Whether or not the optimizer step was done, or skipped because of gradient overflow."""
|
||||
return self.optimizer.overflow
|
||||
|
||||
|
||||
class DeepSpeedSchedulerWrapper(AcceleratedScheduler):
|
||||
"""
|
||||
Internal wrapper around a deepspeed scheduler.
|
||||
|
||||
Args:
|
||||
scheduler (`torch.optim.lr_scheduler.LambdaLR`):
|
||||
The scheduler to wrap.
|
||||
optimizers (one or a list of `torch.optim.Optimizer`):
|
||||
"""
|
||||
|
||||
def __init__(self, scheduler, optimizers):
|
||||
super().__init__(scheduler, optimizers)
|
||||
|
||||
def step(self):
|
||||
pass  # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
|
||||
|
||||
|
||||
class DummyOptim:
|
||||
"""
|
||||
Dummy optimizer that holds model parameters or param groups; it is primarily used to follow the conventional training
|
||||
loop when the optimizer config is specified in the DeepSpeed config file.
|
||||
|
||||
Args:
|
||||
lr (float):
|
||||
Learning rate.
|
||||
params (iterable): iterable of parameters to optimize or dicts defining
|
||||
parameter groups
|
||||
weight_decay (float):
|
||||
Weight decay.
|
||||
**kwargs:
|
||||
Other arguments.
|
||||
"""
|
||||
|
||||
def __init__(self, params, lr=0.001, weight_decay=0, **kwargs):
|
||||
self.params = params
|
||||
self.lr = lr
|
||||
self.weight_decay = weight_decay
|
||||
self.kwargs = kwargs
|
||||
|
||||
|
||||
class DummyScheduler:
|
||||
"""
|
||||
Dummy scheduler that wraps an optimizer; it is primarily used to follow the conventional training
|
||||
loop when the scheduler config is specified in the DeepSpeed config file.
|
||||
|
||||
Args:
|
||||
optimizer (`torch.optim.optimizer.Optimizer`):
|
||||
The optimizer to wrap.
|
||||
total_num_steps (int):
|
||||
Total number of steps.
|
||||
warmup_num_steps (int):
|
||||
Number of steps for warmup.
|
||||
**kwargs:
|
||||
Other arguments.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, total_num_steps=None, warmup_num_steps=0, **kwargs):
|
||||
self.optimizer = optimizer
|
||||
self.total_num_steps = total_num_steps
|
||||
self.warmup_num_steps = warmup_num_steps
|
||||
self.kwargs = kwargs
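# Example (not part of this module): when the optimizer and scheduler are defined inside the
# DeepSpeed config file, these placeholders keep the usual `accelerator.prepare(...)` call
# shape intact. The model below is a hypothetical stand-in.
import torch.nn as nn

from accelerate.utils import DummyOptim, DummyScheduler

model = nn.Linear(16, 16)
optimizer = DummyOptim(model.parameters(), lr=3e-4)
lr_scheduler = DummyScheduler(optimizer, total_num_steps=1000, warmup_num_steps=100)
# model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)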
|
||||
src/accelerate/utils/imports.py (new file, +110 lines)
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
from .versions import is_torch_version
|
||||
|
||||
|
||||
# The package importlib_metadata is in a different place, depending on the Python version.
|
||||
if sys.version_info < (3, 8):
|
||||
import importlib_metadata
|
||||
else:
|
||||
import importlib.metadata as importlib_metadata
|
||||
|
||||
|
||||
try:
|
||||
import torch_ccl # noqa: F401
|
||||
|
||||
_ccl_available = True
|
||||
except ImportError:
|
||||
_ccl_available = False
|
||||
|
||||
|
||||
try:
|
||||
import torch_xla.core.xla_model as xm # noqa: F401
|
||||
|
||||
_tpu_available = True
|
||||
except ImportError:
|
||||
_tpu_available = False
|
||||
|
||||
|
||||
def is_ccl_available():
|
||||
return _ccl_available
|
||||
|
||||
|
||||
def is_apex_available():
|
||||
return importlib.util.find_spec("apex") is not None
|
||||
|
||||
|
||||
def is_tpu_available(check_device=True):
|
||||
"Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
|
||||
if _tpu_available and check_device:
|
||||
try:
|
||||
# Will raise a RuntimeError if no XLA configuration is found
|
||||
_ = xm.xla_device()
|
||||
return True
|
||||
except RuntimeError:
|
||||
return False
|
||||
return _tpu_available
|
||||
|
||||
|
||||
def is_deepspeed_available():
|
||||
package_exists = importlib.util.find_spec("deepspeed") is not None
|
||||
# Check we're not importing a "deepspeed" directory somewhere but the actual library, by verifying that its
|
||||
# package metadata can be found (a bare directory on the path would not have any).
|
||||
if package_exists:
|
||||
try:
|
||||
_ = importlib_metadata.metadata("deepspeed")
|
||||
return True
|
||||
except importlib_metadata.PackageNotFoundError:
|
||||
return False
|
||||
|
||||
|
||||
def is_bf16_available(ignore_tpu=False):
|
||||
"Checks if bf16 is supported, optionally ignoring the TPU"
|
||||
if is_tpu_available():
|
||||
return not ignore_tpu
|
||||
if is_torch_version(">=", "1.10"):
|
||||
if torch.cuda.is_available():
|
||||
return torch.cuda.is_bf16_supported()
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_transformers_available():
|
||||
return importlib.util.find_spec("transformers") is not None
|
||||
|
||||
|
||||
def is_tensorboard_available():
|
||||
return importlib.util.find_spec("tensorboard") is not None or importlib.util.find_spec("tensorboardX") is not None
|
||||
|
||||
|
||||
def is_wandb_available():
|
||||
return importlib.util.find_spec("wandb") is not None
|
||||
|
||||
|
||||
def is_comet_ml_available():
|
||||
return importlib.util.find_spec("comet_ml") is not None
|
||||
|
||||
|
||||
def is_boto3_available():
|
||||
return importlib.util.find_spec("boto3") is not None
|
||||
|
||||
|
||||
def is_sagemaker_available():
|
||||
return importlib.util.find_spec("sagemaker") is not None
|
||||
src/accelerate/utils/launch.py (new file, +72 lines)
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
from ..utils import is_torch_version
|
||||
from .dataclasses import DistributedType
|
||||
|
||||
|
||||
def get_launch_prefix():
|
||||
"""
|
||||
Grabs the correct launcher for starting a distributed command, such as either `torchrun`, `python -m
|
||||
torch.distributed.run`, etc
|
||||
"""
|
||||
if is_torch_version(">=", "1.10.0"):
|
||||
cmd = ["torchrun"]
|
||||
elif is_torch_version(">=", "1.9.0"):
|
||||
cmd = [sys.executable, "-m", "torch.distributed.run"]
|
||||
else:
|
||||
cmd = [sys.executable, "-m", "torch.distributed.launch", "--use_env"]
|
||||
return cmd
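# Example (not part of this module): a sketch of how a distributed command could be assembled
# from the prefix; the script name and flags below are purely illustrative.
from accelerate.utils.launch import get_launch_prefix

cmd = get_launch_prefix() + ["--nproc_per_node", "2", "train.py", "--epochs", "1"]
print(cmd)  # e.g. ['torchrun', '--nproc_per_node', '2', 'train.py', '--epochs', '1']
# subprocess.run(cmd) would then start the distributed job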
|
||||
|
||||
|
||||
class PrepareForLaunch:
|
||||
"""
|
||||
Prepare a function that will be launched in a distributed setup.
|
||||
|
||||
Args:
|
||||
launcher (`Callable`):
|
||||
The function to launch.
|
||||
distributed_type ([`~state.DistributedType`]):
|
||||
The distributed type to prepare for.
|
||||
debug (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not this is a debug launch.
|
||||
"""
|
||||
|
||||
def __init__(self, launcher, distributed_type="NO", debug=False):
|
||||
self.launcher = launcher
|
||||
self.distributed_type = DistributedType(distributed_type)
|
||||
self.debug = debug
|
||||
|
||||
def __call__(self, index, *args):
|
||||
if self.debug:
|
||||
world_size = int(os.environ.get("WORLD_SIZE"))
|
||||
rdv_file = os.environ.get("ACCELERATE_DEBUG_RDV_FILE")
|
||||
torch.distributed.init_process_group(
|
||||
"gloo",
|
||||
rank=index,
|
||||
store=torch.distributed.FileStore(rdv_file, world_size),
|
||||
world_size=world_size,
|
||||
)
|
||||
elif self.distributed_type == DistributedType.MULTI_GPU or self.distributed_type == DistributedType.MULTI_CPU:
|
||||
# Prepare the environment for torch.distributed
|
||||
os.environ["LOCAL_RANK"] = str(index)
|
||||
os.environ["RANK"] = str(index)
|
||||
|
||||
os.environ["FORK_LAUNCHED"] = str(1)
|
||||
self.launcher(*args)
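# Example (not part of this module): a sketch of wrapping a function for
# `torch.multiprocessing.start_processes`, similar in spirit to how the notebook launcher uses
# this class. Uses the fork start method, so it assumes a Unix system; no distributed
# environment variables are set up here.
import torch.multiprocessing as mp

from accelerate.utils.launch import PrepareForLaunch

def demo_fn():
    print("hello from a spawned process")

launcher = PrepareForLaunch(demo_fn, distributed_type="NO")
mp.start_processes(launcher, args=(), nprocs=2, start_method="fork")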
|
||||
src/accelerate/utils/memory.py (new file, +88 lines)
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
A collection of utilities for ensuring that training can always occur. Heavily influenced by the
|
||||
[toma](https://github.com/BlackHC/toma) library.
|
||||
"""
|
||||
|
||||
import functools
|
||||
import gc
|
||||
import inspect
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def should_reduce_batch_size(exception: Exception) -> bool:
|
||||
"""
|
||||
Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory
|
||||
|
||||
Args:
|
||||
exception (`Exception`):
|
||||
An exception
|
||||
"""
|
||||
_statements = [
|
||||
"CUDA out of memory.", # CUDA OOM
|
||||
"cuDNN error: CUDNN_STATUS_NOT_SUPPORTED.", # CUDNN SNAFU
|
||||
"DefaultCPUAllocator: can't allocate memory", # CPU OOM
|
||||
]
|
||||
if isinstance(exception, RuntimeError) and len(exception.args) == 1:
|
||||
return any(err in exception.args[0] for err in _statements)
|
||||
return False
|
||||
|
||||
|
||||
def find_executable_batch_size(function: callable = None, starting_batch_size: int = 128):
|
||||
"""
|
||||
A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or
|
||||
CUDNN, the batch size is cut in half and passed to `function`
|
||||
|
||||
`function` must take in a `batch_size` parameter as its first argument.
|
||||
|
||||
Args:
|
||||
function (`callable`, *optional*):
|
||||
A function to wrap
|
||||
starting_batch_size (`int`, *optional*):
|
||||
The batch size to try and fit into memory
|
||||
"""
|
||||
if function is None:
|
||||
return functools.partial(find_executable_batch_size, starting_batch_size=starting_batch_size)
|
||||
|
||||
batch_size = starting_batch_size
|
||||
|
||||
def decorator(*args, **kwargs):
|
||||
nonlocal batch_size
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
params = list(inspect.signature(function).parameters.keys())
|
||||
# Guard against user error
|
||||
if len(params) < (len(args) + 1):
|
||||
arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
|
||||
raise TypeError(
|
||||
f"Batch size was passed into `{function.__name__}` as the first argument when called."
|
||||
f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
|
||||
)
|
||||
while True:
|
||||
if batch_size == 0:
|
||||
raise RuntimeError("No executable batch size found, reached zero.")
|
||||
try:
|
||||
return function(batch_size, *args, **kwargs)
|
||||
except Exception as e:
|
||||
if should_reduce_batch_size(e):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
batch_size //= 2
|
||||
else:
|
||||
raise
|
||||
|
||||
return decorator
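# Example (not part of this module): a sketch of the decorator pattern; `train` below is a
# hypothetical training function whose body would normally build dataloaders from `batch_size`
# and run the loop, shrinking the batch automatically whenever a CUDA OOM error is raised.
from accelerate.utils.memory import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=128)
def train(batch_size):
    print(f"Trying batch_size={batch_size}")
    # ... build dataloaders and run the training loop here ...

train()  # retries with 64, 32, ... if an out-of-memory error escapes the body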
|
||||
src/accelerate/utils/modeling.py (new file, +614 lines)
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .offload import offload_weight, save_offload_index
|
||||
|
||||
|
||||
WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
|
||||
|
||||
|
||||
def convert_file_size_to_int(size: Union[int, str]):
|
||||
"""
|
||||
Converts a size expressed as a string with digits and unit (like `"5MB"`) to an integer (in bytes).
|
||||
|
||||
Args:
|
||||
size (`int` or `str`): The size to convert. Will be directly returned if an `int`.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> convert_file_size_to_int("1MiB")
|
||||
1048576
|
||||
```
|
||||
"""
|
||||
if isinstance(size, int):
|
||||
return size
|
||||
if size.upper().endswith("GIB"):
|
||||
return int(size[:-3]) * (2**30)
|
||||
if size.upper().endswith("MIB"):
|
||||
return int(size[:-3]) * (2**20)
|
||||
if size.upper().endswith("KIB"):
|
||||
return int(size[:-3]) * (2**10)
|
||||
if size.upper().endswith("GB"):
|
||||
int_size = int(size[:-2]) * (10**9)
|
||||
return int_size // 8 if size.endswith("b") else int_size
|
||||
if size.upper().endswith("MB"):
|
||||
int_size = int(size[:-2]) * (10**6)
|
||||
return int_size // 8 if size.endswith("b") else int_size
|
||||
if size.upper().endswith("KB"):
|
||||
int_size = int(size[:-2]) * (10**3)
|
||||
return int_size // 8 if size.endswith("b") else int_size
|
||||
raise ValueError("`size` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.")
|
||||
|
||||
|
||||
def dtype_byte_size(dtype: torch.dtype):
|
||||
"""
|
||||
Returns the size (in bytes) occupied by one parameter of type `dtype`.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
>>> dtype_byte_size(torch.float32)
|
||||
4
|
||||
```
|
||||
"""
|
||||
if dtype == torch.bool:
|
||||
return 1 / 8
|
||||
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
|
||||
if bit_search is None:
|
||||
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
|
||||
bit_size = int(bit_search.groups()[0])
|
||||
return bit_size // 8
|
||||
|
||||
|
||||
def set_module_tensor_to_device(
|
||||
module: nn.Module, tensor_name: str, device: Union[int, str, torch.device], value: Optional[torch.Tensor] = None
|
||||
):
|
||||
"""
|
||||
A helper function to set a given tensor (parameter or buffer) of a module on a specific device (note that doing
|
||||
`param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function).
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module in which the tensor we want to move lives.
|
||||
tensor_name (`str`): The full name of the parameter/buffer.
|
||||
device (`int`, `str` or `torch.device`): The device on which to set the tensor.
|
||||
value (`torch.Tensor`, *optional*): The value of the tensor (useful when going from the meta device to any
|
||||
other device).
|
||||
"""
|
||||
# Recurse if needed
|
||||
if "." in tensor_name:
|
||||
splits = tensor_name.split(".")
|
||||
for split in splits[:-1]:
|
||||
new_module = getattr(module, split)
|
||||
if new_module is None:
|
||||
raise ValueError(f"{module} has no attribute {split}.")
|
||||
module = new_module
|
||||
tensor_name = splits[-1]
|
||||
|
||||
if tensor_name not in module._parameters and tensor_name not in module._buffers:
|
||||
raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
|
||||
is_buffer = tensor_name in module._buffers
|
||||
old_value = getattr(module, tensor_name)
|
||||
|
||||
if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
|
||||
raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
|
||||
|
||||
with torch.no_grad():
|
||||
if value is None:
|
||||
new_value = old_value.to(device)
|
||||
elif isinstance(value, torch.Tensor):
|
||||
new_value = value.to(device)
|
||||
else:
|
||||
new_value = torch.tensor(value, device=device)
|
||||
if is_buffer:
|
||||
module._buffers[tensor_name] = new_value
|
||||
else:
|
||||
new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad)
|
||||
module._parameters[tensor_name] = new_value
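# Example (not part of this module): overwrite a parameter in place so it stays registered on
# the module, instead of creating a detached copy with `.to(device)`.
import torch
import torch.nn as nn

from accelerate.utils.modeling import set_module_tensor_to_device

layer = nn.Linear(2, 2)
set_module_tensor_to_device(layer, "weight", "cpu", value=torch.zeros(2, 2))
assert isinstance(layer.weight, nn.Parameter) and torch.all(layer.weight == 0)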
|
||||
|
||||
|
||||
def named_module_tensors(module: nn.Module, include_buffers: bool = True, recurse: bool = False):
|
||||
"""
|
||||
A helper function that gathers all the tensors (parameters + buffers) of a given module. If `include_buffers=True`
|
||||
it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`.
|
||||
|
||||
Args:
|
||||
module (`torch.nn.Module`): The module we want the tensors of.
|
||||
include_buffers (`bool`, *optional*, defaults to `True`): Whether or not to include the buffers in the result.
|
||||
recurse (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to go look in every submodule or just return the direct parameters and buffers.
|
||||
"""
|
||||
for named_parameter in module.named_parameters(recurse=recurse):
|
||||
yield named_parameter
|
||||
|
||||
if include_buffers:
|
||||
for named_buffer in module.named_buffers(recurse=recurse):
|
||||
yield named_buffer
|
||||
|
||||
|
||||
def find_tied_parameters(model: nn.Module, **kwargs):
|
||||
"""
|
||||
Find the tied parameters in a given model.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to inspect.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The signature accepts keyword arguments, but they are for the recursive part of this function and you should ignore
|
||||
them.
|
||||
|
||||
</Tip>
|
||||
|
||||
Example:
|
||||
|
||||
|
||||
```py
|
||||
>>> from collections import OrderedDict
|
||||
>>> import torch.nn as nn
|
||||
|
||||
>>> model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))]))
|
||||
>>> model.linear2.weight = test_model.linear1.weight
|
||||
>>> find_tied_parameters(test_model)
|
||||
{'linear1.weight': 'linear2.weight'}
|
||||
```
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: A dictionary mapping tied parameter names to the name of the parameter they are tied to.
|
||||
"""
|
||||
# Initialize result and named_parameters before recursing.
|
||||
named_parameters = kwargs.get("named_parameters", None)
|
||||
prefix = kwargs.get("prefix", "")
|
||||
result = kwargs.get("result", {})
|
||||
|
||||
if named_parameters is None:
|
||||
named_parameters = {n: p for n, p in model.named_parameters()}
|
||||
else:
|
||||
# A tied parameter will not be in the full `named_parameters` seen above but will be in the `named_parameters`
|
||||
# of the submodule it belongs to. So while recursing we track the names that are not in the initial
|
||||
# `named_parameters`.
|
||||
for name, parameter in model.named_parameters():
|
||||
full_name = name if prefix == "" else f"{prefix}.{name}"
|
||||
if full_name not in named_parameters:
|
||||
# When we find one, it has to be one of the existing parameters.
|
||||
for new_name, new_param in named_parameters.items():
|
||||
if new_param is parameter:
|
||||
result[new_name] = full_name
|
||||
|
||||
# Once we have treated direct parameters, we move to the child modules.
|
||||
for name, child in model.named_children():
|
||||
child_name = name if prefix == "" else f"{prefix}.{name}"
|
||||
find_tied_parameters(child, named_parameters=named_parameters, prefix=child_name, result=result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def compute_module_sizes(model: nn.Module, dtype: Optional[Union[str, torch.device]] = None):
|
||||
"""
|
||||
Compute the size of each submodule of a given model.
|
||||
"""
|
||||
if isinstance(dtype, str):
|
||||
# We accept "torch.float16" or just "float16"
|
||||
dtype = dtype.replace("torch.", "")
|
||||
dtype = getattr(torch, dtype)
|
||||
if dtype is not None:
|
||||
dtype_size = dtype_byte_size(dtype)
|
||||
module_sizes = defaultdict(int)
|
||||
for name, tensor in named_module_tensors(model, recurse=True):
|
||||
if dtype is None:
|
||||
size = tensor.numel() * dtype_byte_size(tensor.dtype)
|
||||
else:
|
||||
size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
|
||||
name_parts = name.split(".")
|
||||
for idx in range(len(name_parts) + 1):
|
||||
module_sizes[".".join(name_parts[:idx])] += size
|
||||
|
||||
return module_sizes
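# Example (not part of this module): sizes accumulate on every prefix of a tensor's name, so
# the empty-string key holds the total footprint of the model.
import torch.nn as nn

from accelerate.utils.modeling import compute_module_sizes

sizes = compute_module_sizes(nn.Linear(10, 10))
assert sizes[""] == 440      # 100 fp32 weights + 10 fp32 biases, 4 bytes each
assert sizes["weight"] == 400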
|
||||
|
||||
|
||||
def get_max_layer_size(
|
||||
modules: List[Tuple[str, torch.nn.Module]], module_sizes: Dict[str, int], no_split_module_classes: List[str]
|
||||
):
|
||||
"""
|
||||
Utility function that will scan a list of named modules and return the maximum size used by one full layer. The
|
||||
definition of a layer being:
|
||||
- a module with no direct children (just parameters and buffers)
|
||||
- a module whose class name is in the list `no_split_module_classes`
|
||||
|
||||
Args:
|
||||
modules (`List[Tuple[str, torch.nn.Module]]`):
|
||||
The list of named modules where we want to determine the maximum layer size.
|
||||
module_sizes (`Dict[str, int]`):
|
||||
A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`).
|
||||
no_split_module_classes (`List[str]`):
|
||||
A list of class names for layers we don't want to be split.
|
||||
|
||||
Returns:
|
||||
`Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names realizing that maximum size.
|
||||
"""
|
||||
max_size = 0
|
||||
layer_names = []
|
||||
modules_to_treat = modules.copy()
|
||||
while len(modules_to_treat) > 0:
|
||||
module_name, module = modules_to_treat.pop(0)
|
||||
modules_children = list(module.named_children())
|
||||
if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
|
||||
# No splitting this one so we compare to the max_size
|
||||
size = module_sizes[module_name]
|
||||
if size > max_size:
|
||||
max_size = size
|
||||
layer_names = [module_name]
|
||||
elif size == max_size:
|
||||
layer_names.append(module_name)
|
||||
else:
|
||||
modules_to_treat = [(f"{module_name}.{n}", v) for n, v in modules_children] + modules_to_treat
|
||||
return max_size, layer_names
|
||||
|
||||
|
||||
def get_max_memory(max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None):
|
||||
"""
|
||||
Get the maximum memory available if nothing is passed; otherwise, converts string values to integers.
|
||||
"""
|
||||
import psutil
|
||||
|
||||
if max_memory is None:
|
||||
if not torch.cuda.is_available():
|
||||
max_memory = {}
|
||||
else:
|
||||
# Make sure CUDA is initialized on each GPU to have the right memory info.
|
||||
for i in range(torch.cuda.device_count()):
|
||||
_ = torch.tensor([0], device=i)
|
||||
max_memory = {i: torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())}
|
||||
max_memory["cpu"] = psutil.virtual_memory().available
|
||||
return max_memory
|
||||
|
||||
for key in max_memory:
|
||||
if isinstance(max_memory[key], str):
|
||||
max_memory[key] = convert_file_size_to_int(max_memory[key])
|
||||
return max_memory
|
||||
|
||||
|
||||
def clean_device_map(device_map: Dict[str, Union[int, str, torch.device]], module_name: str = ""):
|
||||
"""
|
||||
Cleans a device_map by grouping all submodules that go on the same device together.
|
||||
"""
|
||||
# Get the value of the current module and if there is only one split across several keys, regroup it.
|
||||
prefix = "" if module_name == "" else f"{module_name}."
|
||||
values = [v for k, v in device_map.items() if k.startswith(prefix)]
|
||||
if len(set(values)) == 1 and len(values) > 1:
|
||||
for k in [k for k in device_map if k.startswith(prefix)]:
|
||||
del device_map[k]
|
||||
device_map[module_name] = values[0]
|
||||
|
||||
# Recurse over the children
|
||||
children_modules = [k for k in device_map.keys() if k.startswith(module_name) and len(k) > len(module_name)]
|
||||
idx = len(module_name.split(".")) + 1 if len(module_name) > 0 else 1
|
||||
children_modules = set(".".join(k.split(".")[:idx]) for k in children_modules)
|
||||
for child in children_modules:
|
||||
clean_device_map(device_map, module_name=child)
|
||||
|
||||
return device_map
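# Example (not part of this module): the two submodules of `block` sit on the same device, so
# they collapse into a single entry, while `head` keeps its own.
from accelerate.utils.modeling import clean_device_map

device_map = {"block.linear1": 0, "block.linear2": 0, "head": 1}
assert clean_device_map(device_map) == {"block": 0, "head": 1}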
|
||||
|
||||
|
||||
def load_offloaded_weights(model, index, offload_folder):
|
||||
if index is None or len(index) == 0:
|
||||
# Nothing to do
|
||||
return
|
||||
|
||||
for param_name, metadata in index.items():
|
||||
tensor_file = os.path.join(offload_folder, f"{param_name}.dat")
|
||||
shape = tuple(metadata["shape"])
|
||||
weight = np.memmap(tensor_file, dtype=metadata["dtype"], mode="r", shape=shape)
|
||||
set_module_tensor_to_device(model, param_name, "cpu", value=torch.tensor(weight))
|
||||
|
||||
|
||||
def infer_auto_device_map(
|
||||
model: nn.Module,
|
||||
max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
|
||||
no_split_module_classes: Optional[List[str]] = None,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
):
|
||||
"""
|
||||
Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk,
|
||||
such that:
|
||||
- we don't exceed the memory available on any of the GPUs.
|
||||
- if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that
|
||||
has the largest size.
|
||||
- if offload to the CPU is needed, we don't exceed the RAM available on the CPU.
|
||||
- if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk
|
||||
that has the largest size.
|
||||
|
||||
<Tip>
|
||||
|
||||
All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the
|
||||
meta device (as it would if initialized within the `init_empty_weights` context manager).
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to analyze.
|
||||
max_memory (`Dict`, *optional*):
|
||||
A dictionary mapping device identifiers to maximum memory. Will default to the maximum memory available if unset.
|
||||
no_split_module_classes (`List[str]`, *optional*):
|
||||
A list of layer class names that should never be split across devices (for instance any layer that has a
|
||||
residual connection).
|
||||
dtype (`str` or `torch.dtype`, *optional*):
|
||||
If provided, the weights will be converted to that type when loaded.
|
||||
"""
|
||||
# Get default / clean up max_memory
|
||||
max_memory = get_max_memory(max_memory)
|
||||
if no_split_module_classes is None:
|
||||
no_split_module_classes = []
|
||||
elif not isinstance(no_split_module_classes, (list, tuple)):
|
||||
no_split_module_classes = [no_split_module_classes]
|
||||
|
||||
devices = list(max_memory.keys())
|
||||
gpus = [device for device in devices if device != "cpu"]
|
||||
if "disk" not in devices:
|
||||
devices.append("disk")
|
||||
|
||||
# Devices that need to keep space for a potential offloaded layer.
|
||||
main_devices = [gpus[0], "cpu"] if len(gpus) > 0 else ["cpu"]
|
||||
|
||||
module_sizes = compute_module_sizes(model, dtype=dtype)
|
||||
tied_parameters = find_tied_parameters(model)
|
||||
|
||||
device_map = {}
|
||||
current_device = 0
|
||||
current_memory_used = 0
|
||||
|
||||
# Direct submodules and parameters
|
||||
modules_to_treat = list(model.named_parameters(recurse=False)) + list(model.named_children())
|
||||
# Initialize maximum largest layer, to know which space to keep in memory
|
||||
max_layer_size, max_layer_names = get_max_layer_size(modules_to_treat, module_sizes, no_split_module_classes)
|
||||
|
||||
# Ready ? This is going to be a bit messy.
|
||||
while len(modules_to_treat) > 0:
|
||||
name, module = modules_to_treat.pop(0)
|
||||
# Max size in the remaining layers may have changed since we took one, so we maybe update it.
|
||||
max_layer_names = [n for n in max_layer_names if not n.startswith(name)]
|
||||
if len(max_layer_names) == 0:
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
# Assess size needed
|
||||
module_size = module_sizes[name]
|
||||
tied_params = [v for k, v in tied_parameters.items() if name in k]
|
||||
# We only handle a tied parameter when it is tied to exactly one other module; otherwise we ignore it.
|
||||
tied_param = tied_params[0] if len(tied_params) == 1 else None
|
||||
|
||||
device = devices[current_device]
|
||||
current_max_size = max_memory[device] if device != "disk" else None
|
||||
# Reduce max size available by the largest layer.
|
||||
if devices[current_device] in main_devices:
|
||||
current_max_size = current_max_size - max_layer_size
|
||||
# Case 1 -> We're too big!
|
||||
if current_max_size is not None and current_memory_used + module_size > current_max_size:
|
||||
# Split or not split?
|
||||
modules_children = list(module.named_children())
|
||||
if len(modules_children) == 0 or module.__class__.__name__ in no_split_module_classes:
|
||||
# -> no split, we go to the next device
|
||||
current_device += 1
|
||||
modules_to_treat = [(name, module)] + modules_to_treat
|
||||
current_memory_used = 0
|
||||
else:
|
||||
# -> split, we replace the module studied by its children + parameters
|
||||
modules_children = list(module.named_parameters(recurse=False)) + modules_children
|
||||
modules_to_treat = [(f"{name}.{n}", v) for n, v in modules_children] + modules_to_treat
|
||||
# Update the max layer size.
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
|
||||
# Case 2, it fits! We're not entirely out of the wood though, because we may have some tied parameters.
|
||||
elif tied_param is not None:
|
||||
# Determine the size occupied by this module + the module containing the tied parameter
|
||||
tied_module_size = module_size
|
||||
tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if n in tied_param][0]
|
||||
tied_module_name, tied_module = modules_to_treat[tied_module_index]
|
||||
tied_module_size += module_sizes[tied_module_name] - module_sizes[tied_param]
|
||||
if current_max_size is not None and current_memory_used + tied_module_size > current_max_size:
|
||||
# Split or not split?
|
||||
tied_module_children = list(tied_module.named_children())
|
||||
if len(tied_module_children) == 0 or tied_module.__class__.__name__ in no_split_module_classes:
|
||||
# If the tied module is not split, we go to the next device
|
||||
current_device += 1
|
||||
modules_to_treat = [(name, module)] + modules_to_treat
|
||||
current_memory_used = 0
|
||||
else:
|
||||
# Otherwise, we replace the tied module by its children.
|
||||
tied_module_children = list(tied_module.named_parameters(recurse=False)) + tied_module_children
|
||||
tied_module_children = [(f"{tied_module_name}.{n}", v) for n, v in tied_module_children]
|
||||
modules_to_treat = (
|
||||
[(name, module)]
|
||||
+ modules_to_treat[:tied_module_index]
|
||||
+ tied_module_children
|
||||
+ modules_to_treat[tied_module_index + 1 :]
|
||||
)
|
||||
# Update the max layer size.
|
||||
max_layer_size, max_layer_names = get_max_layer_size(
|
||||
[(n, m) for n, m in modules_to_treat if isinstance(m, torch.nn.Module)],
|
||||
module_sizes,
|
||||
no_split_module_classes,
|
||||
)
|
||||
else:
|
||||
# We really really fit!
|
||||
current_memory_used += tied_module_size
|
||||
device_map[name] = devices[current_device]
|
||||
modules_to_treat.pop(tied_module_index)
|
||||
device_map[tied_module_name] = devices[current_device]
|
||||
else:
|
||||
current_memory_used += module_size
|
||||
device_map[name] = devices[current_device]
|
||||
|
||||
return clean_device_map(device_map)
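# Example (not part of this module): a sketch with a toy model and a fictitious memory budget.
# Each Linear(512, 512) weighs roughly 1.05 MB in fp32, so with a 3 MB budget only the first
# layer fits on device 0 once room is reserved for the largest layer; the rest spills to CPU.
import torch.nn as nn

from accelerate.utils.modeling import infer_auto_device_map

model = nn.Sequential(nn.Linear(512, 512), nn.Linear(512, 512), nn.Linear(512, 512))
device_map = infer_auto_device_map(model, max_memory={0: "3MB", "cpu": "10GB"})
print(device_map)  # expected along the lines of {'0': 0, '1': 'cpu', '2': 'cpu'}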
|
||||
|
||||
|
||||
def check_device_map(model: nn.Module, device_map: Dict[str, Union[int, str, torch.device]]):
|
||||
"""
|
||||
Checks that a device map covers everything in a given model.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to check the device map against.
|
||||
device_map (`Dict[str, Union[int, str, torch.device]]`): The device map to check.
|
||||
"""
|
||||
all_model_tensors = [name for name, _ in model.state_dict().items()]
|
||||
for module_name in device_map.keys():
|
||||
all_model_tensors = [name for name in all_model_tensors if not name.startswith(module_name)]
|
||||
if len(all_model_tensors) > 0:
|
||||
non_covered_params = ", ".join(all_model_tensors)
|
||||
raise ValueError(
|
||||
f"The device_map provided does not give any device for the following parameters: {non_covered_params}"
|
||||
)
|
||||
|
||||
|
||||
def load_checkpoint_in_model(
|
||||
model: nn.Module,
|
||||
checkpoint: Union[str, os.PathLike],
|
||||
device_map: Optional[Dict[str, Union[int, str, torch.device]]] = None,
|
||||
offload_folder: Optional[Union[str, os.PathLike]] = None,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
offload_state_dict: bool = False,
|
||||
):
|
||||
"""
|
||||
Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are
|
||||
loaded.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Once loaded across devices, you still need to call [`dispatch_model`] on your model to make it able to run. To
|
||||
group the checkpoint loading and dispatch in one single call, use [`load_checkpoint_and_dispatch`].
|
||||
|
||||
</Tip>
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model in which we want to load a checkpoint.
|
||||
checkpoint (`str` or `os.PathLike`):
|
||||
The checkpoint to load. It can be:
|
||||
- a path to a file containing a whole model state dict
|
||||
- a path to a `.json` file containing the index to a sharded checkpoint
|
||||
- a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
|
||||
device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
|
||||
A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
|
||||
name; once a given module name is inside, every submodule of it will be sent to the same device.
|
||||
offload_folder (`str` or `os.PathLike`, *optional*):
|
||||
If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
|
||||
dtype (`str` or `torch.dtype`, *optional*):
|
||||
If provided, the weights will be converted to that type when loaded.
|
||||
offload_state_dict (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, will temporarily offload the CPU state dict to the hard drive to avoid running out of CPU RAM if
the weight of the CPU state dict + the biggest shard does not fit.
|
||||
"""
|
||||
if offload_folder is None and device_map is not None and "disk" in device_map.values():
|
||||
raise ValueError(
|
||||
"At least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`."
|
||||
)
|
||||
elif offload_folder is not None and device_map is not None and "disk" in device_map.values():
|
||||
os.makedirs(offload_folder, exist_ok=True)
|
||||
|
||||
if isinstance(dtype, str):
|
||||
# We accept "torch.float16" or just "float16"
|
||||
dtype = dtype.replace("torch.", "")
|
||||
dtype = getattr(torch, dtype)
|
||||
|
||||
checkpoint_files = None
|
||||
index_filename = None
|
||||
if os.path.isfile(checkpoint):
|
||||
if str(checkpoint).endswith(".json"):
|
||||
index_filename = checkpoint
|
||||
else:
|
||||
checkpoint_files = [checkpoint]
|
||||
elif os.path.isdir(checkpoint):
|
||||
potential_index = [f for f in os.listdir(checkpoint) if f.endswith(".index.json")]
|
||||
if len(potential_index) == 0:
|
||||
raise ValueError(f"{checkpoint} is not a folder containing a `.index.json` file.")
|
||||
elif len(potential_index) == 1:
|
||||
index_filename = os.path.join(checkpoint, potential_index[0])
|
||||
else:
|
||||
raise ValueError(f"{checkpoint} containing mote than one `.index.json` file, delete the irrelevant ones.")
|
||||
else:
|
||||
raise ValueError(
|
||||
"`checkpoint` should be the path to a file containing a whole state dict, or the index of a sharded "
|
||||
f"checkpoint, or a folder containing a sharded checkpoint, but got {checkpoint}."
|
||||
)
|
||||
|
||||
if index_filename is not None:
|
||||
checkpoint_folder = os.path.split(index_filename)[0]
|
||||
with open(index_filename, "r") as f:
|
||||
index = json.loads(f.read())
|
||||
|
||||
if "weight_map" in index:
|
||||
index = index["weight_map"]
|
||||
checkpoint_files = sorted(list(set(index.values())))
|
||||
checkpoint_files = [os.path.join(checkpoint_folder, f) for f in checkpoint_files]
|
||||
|
||||
# Logic for missing/unexpected keys goes here.
|
||||
|
||||
offload_index = {}
|
||||
if offload_state_dict:
|
||||
state_dict_folder = tempfile.mkdtemp()
|
||||
state_dict_index = {}
|
||||
|
||||
for checkpoint_file in checkpoint_files:
|
||||
checkpoint = torch.load(checkpoint_file)
|
||||
if device_map is None:
|
||||
model.load_state_dict(checkpoint, strict=False)
|
||||
else:
|
||||
for param_name, param in checkpoint.items():
|
||||
module_name = param_name
|
||||
if dtype is not None:
|
||||
param = param.to(dtype)
|
||||
while len(module_name) > 0 and module_name not in device_map:
|
||||
module_name = ".".join(module_name.split(".")[:-1])
|
||||
if module_name == "" and "" not in device_map:
|
||||
# TODO: group all errors and raise at the end.
|
||||
raise ValueError(f"{param_name} doesn't have any device set.")
|
||||
param_device = device_map[module_name]
|
||||
|
||||
if param_device == "disk":
|
||||
set_module_tensor_to_device(model, param_name, "meta")
|
||||
offload_weight(param, param_name, offload_folder, index=offload_index)
|
||||
elif param_device == "cpu" and offload_state_dict:
|
||||
set_module_tensor_to_device(model, param_name, "meta")
|
||||
offload_weight(param, param_name, state_dict_folder, index=state_dict_index)
|
||||
else:
|
||||
set_module_tensor_to_device(model, param_name, param_device, value=param)
|
||||
|
||||
# Force Python to clean up.
|
||||
del checkpoint
|
||||
gc.collect()
|
||||
|
||||
save_offload_index(offload_index, offload_folder)
|
||||
|
||||
# Load back offloaded state dict on CPU
|
||||
if offload_state_dict:
|
||||
load_offloaded_weights(model, state_dict_index, state_dict_folder)
|
||||
shutil.rmtree(state_dict_folder)
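# Usage sketch (illustrative; toy model and throwaway checkpoint path): the device map only
# names top-level modules, and `dtype` casts the weights as they are loaded.
import tempfile
from collections import OrderedDict

_toy = nn.Sequential(OrderedDict(encoder=nn.Linear(4, 4), lm_head=nn.Linear(4, 2)))
_ckpt = os.path.join(tempfile.mkdtemp(), "toy_checkpoint.bin")
torch.save(_toy.state_dict(), _ckpt)
load_checkpoint_in_model(_toy, _ckpt, device_map={"encoder": "cpu", "lm_head": "cpu"}, dtype=torch.float16)
assert _toy.encoder.weight.dtype == torch.float16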
|
||||
182
src/accelerate/utils/offload.py
Normal file
@@ -0,0 +1,182 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections.abc import Mapping
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def offload_weight(weight, weight_name, offload_folder, index=None):
|
||||
dtype = None
|
||||
# Check the string instead of the dtype to be compatible with versions of PyTorch that don't have bfloat16.
|
||||
if str(weight.dtype) == "torch.bfloat16":
|
||||
# Need to reinterpret the underlying data as int16 since NumPy does not handle bfloat16s.
|
||||
weight = weight.view(torch.int16)
|
||||
dtype = "bfloat16"
|
||||
array = weight.numpy()
|
||||
tensor_file = os.path.join(offload_folder, f"{weight_name}.dat")
|
||||
if index is not None:
|
||||
if dtype is None:
|
||||
dtype = str(array.dtype)
|
||||
index[weight_name] = {"dtype": dtype, "shape": list(array.shape)}
|
||||
if array.ndim == 0:
|
||||
array = array[None]
|
||||
file_array = np.memmap(tensor_file, dtype=array.dtype, mode="w+", shape=array.shape)
|
||||
file_array[:] = array[:]
|
||||
file_array.flush()
|
||||
return index
|
||||
|
||||
|
||||
def load_offloaded_weight(weight_file, weight_info):
|
||||
shape = tuple(weight_info["shape"])
|
||||
if shape == ():
|
||||
# NumPy memory-mapped arrays can't have 0 dims so it was saved as a 1d tensor
|
||||
shape = (1,)
|
||||
|
||||
dtype = weight_info["dtype"]
|
||||
if dtype == "bfloat16":
|
||||
# NumPy does not support bfloat16 so this was saved as an int16
|
||||
dtype = "int16"
|
||||
|
||||
weight = np.memmap(weight_file, dtype=dtype, shape=shape, mode="r")
|
||||
|
||||
if len(weight_info["shape"]) == 0:
|
||||
weight = weight[0]
|
||||
weight = torch.tensor(weight)
|
||||
if weight_info["dtype"] == "bfloat16":
|
||||
weight = weight.view(torch.bfloat16)
|
||||
|
||||
return weight
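# Round-trip sketch (illustrative; throwaway temp folder): each weight is stored as a raw
# memory-mapped `.dat` file plus an index entry recording its dtype and shape.
import tempfile

_folder = tempfile.mkdtemp()
_index = offload_weight(torch.randn(2, 3), "layer.weight", _folder, index={})
_back = load_offloaded_weight(os.path.join(_folder, "layer.weight.dat"), _index["layer.weight"])
assert _back.shape == (2, 3)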
|
||||
|
||||
|
||||
def save_offload_index(index, offload_folder):
|
||||
if index is None or len(index) == 0:
|
||||
# Nothing to save
|
||||
return
|
||||
|
||||
offload_index_file = os.path.join(offload_folder, "index.json")
|
||||
if os.path.isfile(offload_index_file):
|
||||
with open(offload_index_file, "r", encoding="utf-8") as f:
|
||||
current_index = json.load(f)
|
||||
else:
|
||||
current_index = {}
|
||||
current_index.update(index)
|
||||
|
||||
with open(offload_index_file, "w", encoding="utf-8") as f:
|
||||
json.dump(current_index, f, indent=2)
|
||||
|
||||
|
||||
def offload_state_dict(save_dir: Union[str, os.PathLike], state_dict: Dict[str, torch.Tensor]):
|
||||
"""
|
||||
Offload a state dict in a given folder.
|
||||
|
||||
Args:
|
||||
save_dir (`str` or `os.PathLike`): The directory in which to offload the state dict.
|
||||
state_dict (`Dict[str, torch.Tensor]`): The dictionary of tensors to offload.
|
||||
"""
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
index = {}
|
||||
for name, parameter in state_dict.items():
|
||||
index = offload_weight(parameter, name, save_dir, index=index)
|
||||
|
||||
# Update index
|
||||
save_offload_index(index, save_dir)
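# Usage sketch (illustrative; throwaway temp folder): one `.dat` file per tensor plus an
# `index.json` describing dtypes and shapes.
import tempfile

_sd_dir = tempfile.mkdtemp()
offload_state_dict(_sd_dir, torch.nn.Linear(3, 3).state_dict())
assert sorted(os.listdir(_sd_dir)) == ["bias.dat", "index.json", "weight.dat"]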
|
||||
|
||||
|
||||
class PrefixedDataset(Mapping):
|
||||
"""
|
||||
Will access keys in a given dataset by adding a prefix.
|
||||
|
||||
Args:
|
||||
dataset (`Mapping`): Any map with string keys.
|
||||
prefix (`str`): A prefix to add when trying to access any element in the underlying dataset.
|
||||
"""
|
||||
|
||||
def __init__(self, dataset: Mapping, prefix: str):
|
||||
self.dataset = dataset
|
||||
self.prefix = prefix
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.dataset[f"{self.prefix}{key}"]
|
||||
|
||||
def __iter__(self):
|
||||
return iter([key for key in self.dataset if key.startswith(self.prefix)])
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dataset)
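# Quick sketch (illustrative): look up "weight" in a dict whose keys are actually
# "decoder.weight", "decoder.bias", ... by prepending the prefix on access.
_full = {"decoder.weight": torch.zeros(2), "decoder.bias": torch.zeros(2), "encoder.weight": torch.ones(2)}
_view = PrefixedDataset(_full, prefix="decoder.")
assert torch.equal(_view["weight"], torch.zeros(2))
assert sorted(_view) == ["decoder.bias", "decoder.weight"]  # iteration yields the full, prefixed keys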
|
||||
|
||||
|
||||
class OffloadedWeightsLoader(Mapping):
|
||||
"""
|
||||
A collection that loads weights stored in a given state dict or memory-mapped on disk.
|
||||
|
||||
Args:
|
||||
state_dict (`Dict[str, torch.Tensor]`, *optional*):
|
||||
A dictionary mapping parameter names to tensors.
|
||||
save_folder (`str` or `os.PathLike`, *optional*):
|
||||
The directory in which the weights are stored (by `offload_state_dict` for instance).
|
||||
index (`Dict`, *optional*):
|
||||
A dictionary from weight names to their information (`dtype` and `shape`). Will default to the index saved
|
||||
in `save_folder`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
state_dict: Dict[str, torch.Tensor] = None,
|
||||
save_folder: Optional[Union[str, os.PathLike]] = None,
|
||||
index: Mapping = None,
|
||||
):
|
||||
if state_dict is None and save_folder is None:
|
||||
raise ValueError("Need either a `state_dict` or a `save_folder` containing offloaded weights.")
|
||||
|
||||
self.state_dict = {} if state_dict is None else state_dict
|
||||
self.save_folder = save_folder
|
||||
if index is None and save_folder is not None:
|
||||
with open(os.path.join(save_folder, "index.json")) as f:
|
||||
index = json.load(f)
|
||||
self.index = {} if index is None else index
|
||||
self.all_keys = list(self.state_dict.keys())
|
||||
self.all_keys.extend([key for key in self.index if key not in self.all_keys])
|
||||
|
||||
def __getitem__(self, key: str):
|
||||
# State dict gets priority
|
||||
if key in self.state_dict:
|
||||
return self.state_dict[key]
|
||||
weight_info = self.index[key]
|
||||
weight_file = os.path.join(self.save_folder, f"{key}.dat")
|
||||
return load_offloaded_weight(weight_file, weight_info)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.all_keys)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.all_keys)
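# Usage sketch (illustrative; throwaway temp folder): combine weights kept in RAM with
# weights previously offloaded by `offload_state_dict`; in-memory entries take priority.
import tempfile

_disk_dir = tempfile.mkdtemp()
offload_state_dict(_disk_dir, {"decoder.weight": torch.ones(2, 2)})
_weights = OffloadedWeightsLoader(state_dict={"encoder.weight": torch.zeros(2, 2)}, save_folder=_disk_dir)
assert set(_weights.keys()) == {"encoder.weight", "decoder.weight"}
assert _weights["decoder.weight"].shape == (2, 2)  # lazily read from the memory-mapped file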
|
||||
|
||||
|
||||
def extract_submodules_state_dict(state_dict: Dict[str, torch.Tensor], submodule_names: List[str]):
|
||||
"""
|
||||
Extract the sub state-dict corresponding to a list of given submodules.
|
||||
|
||||
Args:
|
||||
state_dict (`Dict[str, torch.Tensor]`): The state dict to extract from.
|
||||
submodule_names (`List[str]`): The list of submodule names we want to extract.
|
||||
"""
|
||||
result = {}
|
||||
for module_name in submodule_names:
|
||||
result.update({key: param for key, param in state_dict.items() if key.startswith(module_name)})
|
||||
return result
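# Quick sketch (illustrative): keep only the tensors that belong to the listed submodules.
_sd = {"encoder.weight": torch.zeros(1), "encoder.bias": torch.zeros(1), "head.weight": torch.zeros(1)}
assert list(extract_submodules_state_dict(_sd, ["head"])) == ["head.weight"]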
|
||||
@@ -1,4 +1,4 @@
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -12,172 +12,33 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import importlib
|
||||
import os
|
||||
import random
|
||||
import typing
|
||||
from collections.abc import Mapping
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, EnumMeta
|
||||
from functools import update_wrapper
|
||||
from typing import Any, Callable, Iterable, List, Optional, Union
|
||||
"""
|
||||
A set of basic tensor ops compatible with tpu, gpu, and multigpu
|
||||
"""
|
||||
|
||||
|
||||
from functools import update_wrapper
|
||||
from typing import Any, Mapping
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.distributed import ReduceOp
|
||||
|
||||
from packaging import version
|
||||
|
||||
from .state import AcceleratorState, DistributedType, is_deepspeed_available, is_tpu_available
|
||||
from ..state import AcceleratorState
|
||||
from .dataclasses import DistributedType, TensorInformation
|
||||
from .imports import is_tpu_available
|
||||
from .versions import is_torch_version
|
||||
|
||||
|
||||
if is_tpu_available():
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
def is_tensorflow_available():
|
||||
return importlib.util.find_spec("tensorflow") is not None
|
||||
def is_torch_tensor(tensor):
|
||||
return isinstance(tensor, torch.Tensor)
|
||||
|
||||
|
||||
def is_tensorboard_available():
|
||||
return importlib.util.find_spec("tensorboard") is not None or importlib.util.find_spec("tensorboardX") is not None
|
||||
|
||||
|
||||
def is_wandb_available():
|
||||
return importlib.util.find_spec("wandb") is not None
|
||||
|
||||
|
||||
def is_comet_ml_available():
|
||||
return importlib.util.find_spec("comet_ml") is not None
|
||||
|
||||
|
||||
def is_boto3_available():
|
||||
return importlib.util.find_spec("boto3") is not None
|
||||
|
||||
|
||||
def is_sagemaker_available():
|
||||
return importlib.util.find_spec("sagemaker") is not None
|
||||
|
||||
|
||||
if is_deepspeed_available():
|
||||
from deepspeed import DeepSpeedEngine
|
||||
|
||||
SCALER_NAME = "scaler.pt"
|
||||
MODEL_NAME = "pytorch_model"
|
||||
RNG_STATE_NAME = "random_states"
|
||||
OPTIMIZER_NAME = "optimizer"
|
||||
SCHEDULER_NAME = "scheduler"
|
||||
|
||||
|
||||
class EnumWithContains(EnumMeta):
|
||||
"A metaclass that adds the ability to check if `self` contains an item with the `in` operator"
|
||||
|
||||
def __contains__(cls, item):
|
||||
try:
|
||||
cls(item)
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class BaseEnum(Enum, metaclass=EnumWithContains):
|
||||
"An enum class that can get the value of an item with `str(Enum.key)`"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
@classmethod
|
||||
def list(cls):
|
||||
"Method to list all the possible items in `cls`"
|
||||
return list(map(lambda item: str(item), cls))
|
||||
|
||||
|
||||
class LoggerType(BaseEnum):
|
||||
ALL = "all"
|
||||
TENSORBOARD = "tensorboard"
|
||||
WANDB = "wandb"
|
||||
COMETML = "comet_ml"
|
||||
|
||||
|
||||
class PrecisionType(BaseEnum):
|
||||
NO = "no"
|
||||
FP16 = "fp16"
|
||||
BF16 = "bf16"
|
||||
|
||||
|
||||
class RNGType(BaseEnum):
|
||||
TORCH = "torch"
|
||||
CUDA = "cuda"
|
||||
XLA = "xla"
|
||||
GENERATOR = "generator"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TensorInformation:
|
||||
shape: torch.Size
|
||||
dtype: torch.dtype
|
||||
|
||||
|
||||
def set_seed(seed: int, device_specific: bool = False):
|
||||
"""
|
||||
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
|
||||
|
||||
Args:
|
||||
seed (`int`): The seed to set.
|
||||
device_specific (`bool`, *optional*, defaults to `False`):
|
||||
Whether to slightly vary the seed on each device using `self.process_index`.
|
||||
"""
|
||||
if device_specific:
|
||||
seed += AcceleratorState().process_index
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_tpu_available():
|
||||
xm.set_rng_state(seed)
|
||||
|
||||
|
||||
def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optional[torch.Generator] = None):
|
||||
# Get the proper rng state
|
||||
if rng_type == RNGType.TORCH:
|
||||
rng_state = torch.get_rng_state()
|
||||
elif rng_type == RNGType.CUDA:
|
||||
rng_state = torch.cuda.get_rng_state()
|
||||
elif rng_type == RNGType.XLA:
|
||||
assert is_tpu_available(), "Can't synchronize XLA seeds on an environment without TPUs."
|
||||
rng_state = torch.tensor(xm.get_rng_state())
|
||||
elif rng_type == RNGType.GENERATOR:
|
||||
assert generator is not None, "Need a generator to synchronize its seed."
|
||||
rng_state = generator.get_state()
|
||||
|
||||
# Broadcast the rng state from device 0 to other devices
|
||||
state = AcceleratorState()
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
rng_state = xm.mesh_reduce("random_seed", rng_state, lambda x: x[0])
|
||||
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
rng_state = rng_state.to(state.device)
|
||||
torch.distributed.broadcast(rng_state, 0)
|
||||
rng_state = rng_state.cpu()
|
||||
elif state.distributed_type == DistributedType.MULTI_CPU:
|
||||
torch.distributed.broadcast(rng_state, 0)
|
||||
|
||||
# Set the broadcast rng state
|
||||
if rng_type == RNGType.TORCH:
|
||||
torch.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.CUDA:
|
||||
torch.cuda.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.XLA:
|
||||
xm.set_rng_state(rng_state.item())
|
||||
elif rng_type == RNGType.GENERATOR:
|
||||
generator.set_state(rng_state)
|
||||
|
||||
|
||||
def synchronize_rng_states(rng_types: List[Union[str, RNGType]], generator: Optional[torch.Generator] = None):
|
||||
for rng_type in rng_types:
|
||||
synchronize_rng_state(RNGType(rng_type), generator=generator)
|
||||
def is_tensor_information(tensor_info):
|
||||
return isinstance(tensor_info, TensorInformation)
|
||||
|
||||
|
||||
def honor_type(obj, generator):
|
||||
@@ -191,14 +52,6 @@ def honor_type(obj, generator):
|
||||
return type(obj)(*list(generator))
|
||||
|
||||
|
||||
def is_torch_tensor(tensor):
|
||||
return isinstance(tensor, torch.Tensor)
|
||||
|
||||
|
||||
def is_tensor_information(tensor_info):
|
||||
return isinstance(tensor_info, TensorInformation)
|
||||
|
||||
|
||||
def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs):
|
||||
"""
|
||||
Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.
|
||||
@@ -305,73 +158,24 @@ def initialize_tensors(data_structure):
|
||||
return recursively_apply(_initialize_tensor, data_structure, test_type=is_tensor_information)
|
||||
|
||||
|
||||
def convert_to_fp32(tensor):
|
||||
def find_batch_size(data):
|
||||
"""
|
||||
Recursively converts the elements of a nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.
|
||||
Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.
|
||||
|
||||
Args:
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to convert from FP16/BF16 to FP32.
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.
|
||||
|
||||
Returns:
|
||||
The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
|
||||
`int`: The batch size.
|
||||
"""
|
||||
|
||||
def _convert_to_fp32(tensor):
|
||||
return tensor.float()
|
||||
|
||||
def _is_fp16_bf16_tensor(tensor):
|
||||
return hasattr(tensor, "dtype") and (
|
||||
tensor.dtype == torch.float16
|
||||
or (version.parse(torch.__version__) >= version.parse("1.10") and tensor.dtype == torch.bfloat16)
|
||||
)
|
||||
|
||||
return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)
|
||||
|
||||
|
||||
class ConvertOutputsToFp32:
|
||||
"""
|
||||
Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
precision will be converted back to FP32.
|
||||
|
||||
Use a class instead of a decorator because otherwise, the prepared model can no longer be pickled (issue #273).
|
||||
|
||||
Args:
|
||||
model_forward (`Callable`):
|
||||
The function which outputs we want to treat.
|
||||
|
||||
Returns:
|
||||
The same function as `model_forward` but with converted outputs.
|
||||
"""
|
||||
|
||||
def __init__(self, model_forward):
|
||||
self.model_forward = model_forward
|
||||
update_wrapper(self, model_forward)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return convert_to_fp32(self.model_forward(*args, **kwargs))
|
||||
|
||||
|
||||
convert_outputs_to_fp32 = ConvertOutputsToFp32
|
||||
|
||||
|
||||
def extract_model_from_parallel(model):
|
||||
"""
|
||||
Extract a model from its distributed containers.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to extract.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The extracted model.
|
||||
"""
|
||||
options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel)
|
||||
if is_deepspeed_available():
|
||||
options += (DeepSpeedEngine,)
|
||||
|
||||
while isinstance(model, options):
|
||||
model = model.module
|
||||
return model
|
||||
if isinstance(data, (tuple, list)):
|
||||
return find_batch_size(data[0])
|
||||
elif isinstance(data, Mapping):
|
||||
for k in data.keys():
|
||||
return find_batch_size(data[k])
|
||||
elif not isinstance(data, torch.Tensor):
|
||||
raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
|
||||
return data.shape[0]
|
||||
|
||||
|
||||
def _tpu_gather(tensor, name="gather tensor"):
|
||||
@@ -413,7 +217,11 @@ def gather(tensor):
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
return _tpu_gather(tensor, name="accelerate.utils.gather")
|
||||
elif AcceleratorState().distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
elif AcceleratorState().distributed_type in [
|
||||
DistributedType.DEEPSPEED,
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.FSDP,
|
||||
]:
|
||||
return _gpu_gather(tensor)
|
||||
elif AcceleratorState().distributed_type == DistributedType.MULTI_CPU:
|
||||
return _cpu_gather(tensor)
|
||||
@@ -536,26 +344,6 @@ def slice_tensors(data, tensor_slice):
|
||||
return recursively_apply(_slice_tensor, data, tensor_slice)
|
||||
|
||||
|
||||
def find_batch_size(data):
|
||||
"""
|
||||
Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.
|
||||
|
||||
Returns:
|
||||
`int`: The batch size.
|
||||
"""
|
||||
if isinstance(data, (tuple, list)):
|
||||
return find_batch_size(data[0])
|
||||
elif isinstance(data, Mapping):
|
||||
for k in data.keys():
|
||||
return find_batch_size(data[k])
|
||||
elif not isinstance(data, torch.Tensor):
|
||||
raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
|
||||
return data.shape[0]
|
||||
|
||||
|
||||
def concatenate(data, dim=0):
|
||||
"""
|
||||
Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.
|
||||
@@ -646,7 +434,7 @@ def reduce(tensor, reduction="mean"):
|
||||
xm.all_reduce("sum", cloned_tensor)
|
||||
return cloned_tensor
|
||||
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
torch.distributed.reduce(cloned_tensor, ReduceOp.SUM)
|
||||
torch.distributed.all_reduce(cloned_tensor, ReduceOp.SUM)
|
||||
return cloned_tensor
|
||||
else:
|
||||
if reduction == "sum":
|
||||
@@ -657,198 +445,71 @@ def reduce(tensor, reduction="mean"):
|
||||
return recursively_apply(_reduce_across_processes, tensor, error_on_other_type=True, reduction=reduction)
|
||||
|
||||
|
||||
def wait_for_everyone():
|
||||
def convert_to_fp32(tensor):
|
||||
"""
|
||||
Introduces a blocking point in the script, making sure all processes have reached this point before continuing.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Make sure all processes will reach this instruction otherwise one of your processes will hang forever.
|
||||
|
||||
</Tip>
|
||||
"""
|
||||
if (
|
||||
AcceleratorState().distributed_type == DistributedType.MULTI_GPU
|
||||
or AcceleratorState().distributed_type == DistributedType.MULTI_CPU
|
||||
or AcceleratorState().distributed_type == DistributedType.DEEPSPEED
|
||||
):
|
||||
torch.distributed.barrier()
|
||||
elif AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
xm.rendezvous("accelerate.utils.wait_for_everyone")
|
||||
|
||||
|
||||
def save(obj, f):
|
||||
"""
|
||||
Save the data to disk. Use in place of `torch.save()`.
|
||||
Recursively converts the elements of a nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.
|
||||
|
||||
Args:
|
||||
obj: The data to save
|
||||
f: The file (or file-like object) to use to save the data
|
||||
tensor (nested list/tuple/dictionary of `torch.Tensor`):
|
||||
The data to convert from FP16/BF16 to FP32.
|
||||
|
||||
Returns:
|
||||
The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
xm.save(obj, f)
|
||||
elif AcceleratorState().local_process_index == 0:
|
||||
torch.save(obj, f)
|
||||
|
||||
def _convert_to_fp32(tensor):
|
||||
return tensor.float()
|
||||
|
||||
def _is_fp16_bf16_tensor(tensor):
|
||||
return hasattr(tensor, "dtype") and (
|
||||
tensor.dtype == torch.float16 or (is_torch_version(">=", "1.10") and tensor.dtype == torch.bfloat16)
|
||||
)
|
||||
|
||||
return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)
|
||||
|
||||
|
||||
class PrepareForLaunch:
|
||||
class ConvertOutputsToFp32:
|
||||
"""
|
||||
Prepare a function that will be launched in a distributed setup.
|
||||
Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
precision will be converted back to FP32.
|
||||
|
||||
Use a class instead of a decorator because otherwise, the prepared model can no longer be pickled (issue #273).
|
||||
|
||||
Args:
|
||||
launcher (`Callable`):
|
||||
The function to launch.
|
||||
distributed_type ([`~state.DistributedType`]):
|
||||
The distributed type to prepare for.
|
||||
debug (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not this is a debug launch.
|
||||
model_forward (`Callable`):
|
||||
The function which outputs we want to treat.
|
||||
|
||||
Returns:
|
||||
The same function as `model_forward` but with converted outputs.
|
||||
"""
|
||||
|
||||
def __init__(self, launcher, distributed_type="NO", debug=False):
|
||||
self.launcher = launcher
|
||||
self.distributed_type = DistributedType(distributed_type)
|
||||
self.debug = debug
|
||||
def __init__(self, model_forward):
|
||||
self.model_forward = model_forward
|
||||
update_wrapper(self, model_forward)
|
||||
|
||||
def __call__(self, index, *args):
|
||||
if self.debug:
|
||||
world_size = int(os.environ.get("WORLD_SIZE"))
|
||||
rdv_file = os.environ.get("ACCELERATE_DEBUG_RDV_FILE")
|
||||
torch.distributed.init_process_group(
|
||||
"gloo",
|
||||
rank=index,
|
||||
store=torch.distributed.FileStore(rdv_file, world_size),
|
||||
world_size=world_size,
|
||||
)
|
||||
elif self.distributed_type == DistributedType.MULTI_GPU or self.distributed_type == DistributedType.MULTI_CPU:
|
||||
# Prepare the environment for torch.distributed
|
||||
os.environ["LOCAL_RANK"] = str(index)
|
||||
os.environ["RANK"] = str(index)
|
||||
|
||||
self.launcher(*args)
|
||||
def __call__(self, *args, **kwargs):
|
||||
return convert_to_fp32(self.model_forward(*args, **kwargs))
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeepSpeedPlugin:
|
||||
|
||||
gradient_accumulation_steps: int = field(
|
||||
default=None, metadata={"help": "Number of steps to accumulate gradients before updating optimizer states"}
|
||||
)
|
||||
zero_stage: int = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are 0,1,2,3; Default will be taken from environment variable"},
|
||||
)
|
||||
is_train_batch_min: str = field(
|
||||
default=True,
|
||||
metadata={"help": "If both train & eval dataloaders are specified, this will decide the train_batch_size"},
|
||||
)
|
||||
|
||||
auto_opt_mapping: bool = field(
|
||||
default=True,
|
||||
metadata={"help": "whether to map torch.adam to deepspeed optimizer version of adam based on config"},
|
||||
)
|
||||
|
||||
offload_optimizer_device: bool = field(default=None, metadata={"help": "Possible options are none|cpu|nvme"})
|
||||
|
||||
def __post_init__(self):
|
||||
|
||||
if self.gradient_accumulation_steps is None:
|
||||
self.gradient_accumulation_steps = int(os.environ.get("GRADIENT_ACCUMULATION_STEPS", 1))
|
||||
|
||||
if self.zero_stage is None:
|
||||
self.zero_stage = int(os.environ.get("DEEPSPEED_ZERO_STAGE", 2))
|
||||
|
||||
if self.offload_optimizer_device is None:
|
||||
self.offload_optimizer_device = os.environ.get("DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE", "none")
|
||||
|
||||
self.deepspeed_config = {
|
||||
"train_batch_size": None,
|
||||
"gradient_accumulation_steps": self.gradient_accumulation_steps,
|
||||
"zero_optimization": {
|
||||
"stage": self.zero_stage,
|
||||
"offload_optimizer": {
|
||||
"device": self.offload_optimizer_device,
|
||||
},
|
||||
},
|
||||
"steps_per_print": float("inf"), # this will stop deepspeed from logging @ stdout
|
||||
"zero_allow_untested_optimizer": True,
|
||||
}
|
||||
convert_outputs_to_fp32 = ConvertOutputsToFp32
|
||||
|
||||
|
||||
@dataclass
|
||||
class FullyShardedDataParallelPlugin:
|
||||
def find_device(data):
|
||||
"""
|
||||
This plugin is used to enable fully sharded data parallelism.
|
||||
Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).
|
||||
|
||||
Args:
|
||||
data (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.
|
||||
"""
|
||||
|
||||
sharding_strategy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are [1] FULL_SHARD, [2] SHARD_GRAD_OP"},
|
||||
)
|
||||
backward_prefetch: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "Possible options are [1] BACKWARD_PRE, [2] BACKWARD_POST"},
|
||||
)
|
||||
auto_wrap_policy: "typing.Any" = field(
|
||||
default=None,
|
||||
metadata={"help": "A callable specifying a policy to recursively wrap layers with FSDP"},
|
||||
)
|
||||
cpu_offload: Optional[Callable] = field(
|
||||
default=None,
|
||||
metadata={"help": "Decides Whether to offload parameters and gradients to CPU."},
|
||||
)
|
||||
min_num_params: int = field(
|
||||
default=None, metadata={"help": "FSDP's minimum number of parameters for Default Auto Wrapping."}
|
||||
)
|
||||
ignored_modules: Optional[Iterable[torch.nn.Module]] = field(
|
||||
default=None,
|
||||
metadata={"help": "A list of modules to ignore for FSDP."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, ShardingStrategy
|
||||
from torch.distributed.fsdp.wrap import default_auto_wrap_policy
|
||||
|
||||
if self.sharding_strategy is None:
|
||||
self.sharding_strategy = ShardingStrategy(int(os.environ.get("FSDP_SHARDING_STRATEGY", 1)))
|
||||
|
||||
if self.cpu_offload is None:
|
||||
if os.environ.get("FSDP_OFFLOAD_PARAMS", "false") == "true":
|
||||
self.cpu_offload = CPUOffload(offload_params=True)
|
||||
else:
|
||||
self.cpu_offload = CPUOffload(offload_params=False)
|
||||
|
||||
if self.min_num_params is None:
|
||||
self.min_num_params = int(os.environ.get("FSDP_MIN_NUM_PARAMS", 0))
|
||||
|
||||
if self.auto_wrap_policy is None:
|
||||
if self.min_num_params > 0:
|
||||
self.auto_wrap_policy = functools.partial(default_auto_wrap_policy, min_num_params=self.min_num_params)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def patch_environment(**kwargs):
|
||||
"""
|
||||
A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.
|
||||
|
||||
Will convert the values in `kwargs` to strings and upper-case all the keys.
|
||||
"""
|
||||
for key, value in kwargs.items():
|
||||
os.environ[key.upper()] = str(value)
|
||||
|
||||
yield
|
||||
|
||||
for key in kwargs:
|
||||
del os.environ[key.upper()]
|
||||
|
||||
|
||||
def get_pretty_name(obj):
|
||||
"""
|
||||
Gets a pretty name from `obj`.
|
||||
"""
|
||||
if not hasattr(obj, "__qualname__") and not hasattr(obj, "__name__"):
|
||||
obj = getattr(obj, "__class__", obj)
|
||||
if hasattr(obj, "__qualname__"):
|
||||
return obj.__qualname__
|
||||
if hasattr(obj, "__name__"):
|
||||
return obj.__name__
|
||||
return str(obj)
|
||||
if isinstance(data, Mapping):
|
||||
for obj in data.values():
|
||||
device = find_device(obj)
|
||||
if device is not None:
|
||||
return device
|
||||
elif isinstance(data, (tuple, list)):
|
||||
for obj in data:
|
||||
device = find_device(obj)
|
||||
if device is not None:
|
||||
return device
|
||||
elif isinstance(data, torch.Tensor):
|
||||
return data.device
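# Quick sketch (illustrative): works on arbitrarily nested containers of tensors.
assert find_device({"inputs": [torch.zeros(1), torch.zeros(2)]}) == torch.device("cpu")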
|
||||
156
src/accelerate/utils/other.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
from ..commands.config.cluster import ClusterConfig
|
||||
from ..commands.config.config_args import default_json_config_file
|
||||
from ..state import AcceleratorState
|
||||
from .dataclasses import DistributedType
|
||||
from .imports import is_deepspeed_available, is_tpu_available
|
||||
|
||||
|
||||
if is_deepspeed_available():
|
||||
from deepspeed import DeepSpeedEngine
|
||||
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
def extract_model_from_parallel(model):
|
||||
"""
|
||||
Extract a model from its distributed containers.
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to extract.
|
||||
|
||||
Returns:
|
||||
`torch.nn.Module`: The extracted model.
|
||||
"""
|
||||
options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel)
|
||||
if is_deepspeed_available():
|
||||
options += (DeepSpeedEngine,)
|
||||
|
||||
while isinstance(model, options):
|
||||
model = model.module
|
||||
return model
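# Usage sketch (illustrative): unwrap a DataParallel/DistributedDataParallel container to
# reach the underlying module.
_wrapped = torch.nn.DataParallel(torch.nn.Linear(2, 2))
assert isinstance(extract_model_from_parallel(_wrapped), torch.nn.Linear)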
|
||||
|
||||
|
||||
def wait_for_everyone():
|
||||
"""
|
||||
Introduces a blocking point in the script, making sure all processes have reached this point before continuing.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Make sure all processes will reach this instruction otherwise one of your processes will hang forever.
|
||||
|
||||
</Tip>
|
||||
"""
|
||||
if (
|
||||
AcceleratorState().distributed_type == DistributedType.MULTI_GPU
|
||||
or AcceleratorState().distributed_type == DistributedType.MULTI_CPU
|
||||
or AcceleratorState().distributed_type == DistributedType.DEEPSPEED
|
||||
):
|
||||
torch.distributed.barrier()
|
||||
elif AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
xm.rendezvous("accelerate.utils.wait_for_everyone")
|
||||
|
||||
|
||||
def save(obj, f):
|
||||
"""
|
||||
Save the data to disk. Use in place of `torch.save()`.
|
||||
|
||||
Args:
|
||||
obj: The data to save
|
||||
f: The file (or file-like object) to use to save the data
|
||||
"""
|
||||
if AcceleratorState().distributed_type == DistributedType.TPU:
|
||||
xm.save(obj, f)
|
||||
elif AcceleratorState().local_process_index == 0:
|
||||
torch.save(obj, f)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def patch_environment(**kwargs):
|
||||
"""
|
||||
A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.
|
||||
|
||||
Will convert the values in `kwargs` to strings and upper-case all the keys.
|
||||
"""
|
||||
for key, value in kwargs.items():
|
||||
os.environ[key.upper()] = str(value)
|
||||
|
||||
yield
|
||||
|
||||
for key in kwargs:
|
||||
del os.environ[key.upper()]
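# Usage sketch (illustrative; assumes MASTER_PORT is not already set in the environment):
with patch_environment(master_addr="127.0.0.1", master_port=29500):
    assert os.environ["MASTER_PORT"] == "29500"  # values are stringified, keys upper-cased
assert "MASTER_PORT" not in os.environ           # removed again on exit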
|
||||
|
||||
|
||||
def get_pretty_name(obj):
|
||||
"""
|
||||
Gets a pretty name from `obj`.
|
||||
"""
|
||||
if not hasattr(obj, "__qualname__") and not hasattr(obj, "__name__"):
|
||||
obj = getattr(obj, "__class__", obj)
|
||||
if hasattr(obj, "__qualname__"):
|
||||
return obj.__qualname__
|
||||
if hasattr(obj, "__name__"):
|
||||
return obj.__name__
|
||||
return str(obj)
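# Quick sketch (illustrative):
assert get_pretty_name(torch.optim.AdamW) == "AdamW"       # classes resolve to their __qualname__
assert get_pretty_name(torch.nn.Linear(1, 1)) == "Linear"  # instances resolve to their class name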
|
||||
|
||||
|
||||
def write_basic_config(mixed_precision="no", save_location: str = default_json_config_file):
|
||||
"""
|
||||
Creates and saves a basic cluster config to be used on a local machine with potentially multiple GPUs. Will also
|
||||
fall back to a CPU-only configuration if no GPU is available.
|
||||
|
||||
Args:
|
||||
mixed_precision (`str`, *optional*, defaults to "no"):
|
||||
Mixed Precision to use. Should be one of "no", "fp16", or "bf16"
|
||||
save_location (`str`, *optional*, defaults to `default_json_config_file`):
|
||||
Optional custom save location. Should be passed to `--config_file` when using `accelerate launch`. Default
|
||||
location is inside the huggingface cache folder (`~/.cache/huggingface`) but can be overridden by setting
the `HF_HOME` environment variable, followed by `accelerate/default_config.yaml`.
|
||||
"""
|
||||
path = Path(save_location)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if path.exists():
|
||||
print(
|
||||
f"Configuration already exists at {save_location}, will not override. Run `accelerate config` manually or pass a different `save_location`."
|
||||
)
|
||||
return
|
||||
mixed_precision = mixed_precision.lower()
|
||||
if mixed_precision not in ["no", "fp16", "bf16"]:
|
||||
raise ValueError(f"`mixed_precision` should be one of 'no', 'fp16', or 'bf16'. Received {mixed_precision}")
|
||||
config = {"compute_environment": "LOCAL_MACHINE", "mixed_precision": mixed_precision}
|
||||
if torch.cuda.is_available():
|
||||
num_gpus = torch.cuda.device_count()
|
||||
config["num_processes"] = num_gpus
|
||||
config["use_cpu"] = False
|
||||
if num_gpus > 1:
|
||||
config["distributed_type"] = "MULTI_GPU"
|
||||
else:
|
||||
config["distributed_type"] = "NO"
|
||||
else:
|
||||
num_gpus = 0
|
||||
config["use_cpu"] = True
|
||||
config["num_processes"] = 1
|
||||
config["distributed_type"] = "NO"
|
||||
if not path.exists():
|
||||
config = ClusterConfig(**config)
|
||||
config.to_json_file(path)
|
||||
87
src/accelerate/utils/random.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import random
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from ..state import AcceleratorState
|
||||
from .dataclasses import DistributedType, RNGType
|
||||
from .imports import is_tpu_available
|
||||
|
||||
|
||||
if is_tpu_available(check_device=False):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
|
||||
def set_seed(seed: int, device_specific: bool = False):
|
||||
"""
|
||||
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
|
||||
|
||||
Args:
|
||||
seed (`int`): The seed to set.
|
||||
device_specific (`bool`, *optional*, defaults to `False`):
|
||||
Whether to slightly vary the seed on each device using `self.process_index`.
|
||||
"""
|
||||
if device_specific:
|
||||
seed += AcceleratorState().process_index
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
# ^^ safe to call this function even if cuda is not available
|
||||
if is_tpu_available():
|
||||
xm.set_rng_state(seed)
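# Quick sketch (illustrative): identical seeds give identical draws; `device_specific=True`
# additionally offsets the seed by the process index so each process differs reproducibly.
set_seed(42)
_first = torch.rand(3)
set_seed(42)
assert torch.equal(_first, torch.rand(3))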
|
||||
|
||||
|
||||
def synchronize_rng_state(rng_type: Optional[RNGType] = None, generator: Optional[torch.Generator] = None):
|
||||
# Get the proper rng state
|
||||
if rng_type == RNGType.TORCH:
|
||||
rng_state = torch.get_rng_state()
|
||||
elif rng_type == RNGType.CUDA:
|
||||
rng_state = torch.cuda.get_rng_state()
|
||||
elif rng_type == RNGType.XLA:
|
||||
assert is_tpu_available(), "Can't synchronize XLA seeds on an environment without TPUs."
|
||||
rng_state = torch.tensor(xm.get_rng_state())
|
||||
elif rng_type == RNGType.GENERATOR:
|
||||
assert generator is not None, "Need a generator to synchronize its seed."
|
||||
rng_state = generator.get_state()
|
||||
|
||||
# Broadcast the rng state from device 0 to other devices
|
||||
state = AcceleratorState()
|
||||
if state.distributed_type == DistributedType.TPU:
|
||||
rng_state = xm.mesh_reduce("random_seed", rng_state, lambda x: x[0])
|
||||
elif state.distributed_type in [DistributedType.DEEPSPEED, DistributedType.MULTI_GPU]:
|
||||
rng_state = rng_state.to(state.device)
|
||||
torch.distributed.broadcast(rng_state, 0)
|
||||
rng_state = rng_state.cpu()
|
||||
elif state.distributed_type == DistributedType.MULTI_CPU:
|
||||
torch.distributed.broadcast(rng_state, 0)
|
||||
|
||||
# Set the broadcast rng state
|
||||
if rng_type == RNGType.TORCH:
|
||||
torch.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.CUDA:
|
||||
torch.cuda.set_rng_state(rng_state)
|
||||
elif rng_type == RNGType.XLA:
|
||||
xm.set_rng_state(rng_state.item())
|
||||
elif rng_type == RNGType.GENERATOR:
|
||||
generator.set_state(rng_state)
|
||||
|
||||
|
||||
def synchronize_rng_states(rng_types: List[Union[str, RNGType]], generator: Optional[torch.Generator] = None):
|
||||
for rng_type in rng_types:
|
||||
synchronize_rng_state(RNGType(rng_type), generator=generator)
|
||||
61
src/accelerate/utils/versions.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
from typing import Union
|
||||
|
||||
from packaging.version import Version, parse
|
||||
|
||||
from .constants import STR_OPERATION_TO_FUNC
|
||||
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
import importlib_metadata
|
||||
else:
|
||||
import importlib.metadata as importlib_metadata
|
||||
|
||||
torch_version = parse(importlib_metadata.version("torch"))
|
||||
|
||||
|
||||
def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
|
||||
"""
|
||||
Compares a library version to some requirement using a given operation.
|
||||
|
||||
Args:
|
||||
library_or_version (`str` or `packaging.version.Version`):
|
||||
A library name or a version to check.
|
||||
operation (`str`):
|
||||
A string representation of an operator, such as `">"` or `"<="`.
|
||||
requirement_version (`str`):
|
||||
The version to compare the library version against.
|
||||
"""
|
||||
if operation not in STR_OPERATION_TO_FUNC.keys():
|
||||
raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
|
||||
operation = STR_OPERATION_TO_FUNC[operation]
|
||||
if isinstance(library_or_version, str):
|
||||
library_or_version = parse(importlib_metadata.version(library_or_version))
|
||||
return operation(library_or_version, parse(requirement_version))
|
||||
|
||||
|
||||
def is_torch_version(operation: str, version: str):
|
||||
"""
|
||||
Compares the current PyTorch version to a given reference with an operation.
|
||||
|
||||
Args:
|
||||
operation (`str`):
|
||||
A string representation of an operator, such as `">"` or `"<="`
|
||||
version (`str`):
|
||||
A string version of PyTorch
|
||||
"""
|
||||
return compare_versions(torch_version, operation, version)
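# Quick sketch (illustrative): gate code paths on the installed torch version.
if is_torch_version(">=", "1.10"):
    pass  # e.g. safe to rely on native bfloat16 support
assert compare_versions("numpy", ">", "1.0")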
|
||||
49
tests/deepspeed/ds_config_zero2.json
Normal file
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto",
|
||||
"torch_adam": true,
|
||||
"adam_w_mode": true
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"contiguous_gradients": true
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
56
tests/deepspeed/ds_config_zero3.json
Normal file
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto",
|
||||
"torch_adam": true,
|
||||
"adam_w_mode": true
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": "auto"
|
||||
},
|
||||
"gradient_accumulation_steps": 1,
|
||||
"gradient_clipping": "auto",
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
584
tests/deepspeed/test_deepspeed.py
Normal file
@@ -0,0 +1,584 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import io
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from accelerate.accelerator import Accelerator
|
||||
from accelerate.scheduler import AcceleratedScheduler
|
||||
from accelerate.state import AcceleratorState
|
||||
from accelerate.test_utils.testing import require_cuda, require_deepspeed
|
||||
from accelerate.test_utils.training import RegressionDataset
|
||||
from accelerate.utils.dataclasses import DeepSpeedPlugin
|
||||
from accelerate.utils.deepspeed import (
|
||||
DeepSpeedEngineWrapper,
|
||||
DeepSpeedOptimizerWrapper,
|
||||
DeepSpeedSchedulerWrapper,
|
||||
DummyOptim,
|
||||
DummyScheduler,
|
||||
)
|
||||
from parameterized import parameterized
|
||||
from transformers import AutoModel, AutoModelForCausalLM, get_scheduler
|
||||
from transformers.testing_utils import mockenv_context
|
||||
from transformers.trainer_utils import set_seed
|
||||
from transformers.utils import is_torch_bf16_available
|
||||
|
||||
|
||||
set_seed(42)
|
||||
|
||||
T5_SMALL = "t5-small"
|
||||
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
||||
GPT2_TINY = "sshleifer/tiny-gpt2"
|
||||
|
||||
ZERO2 = "zero2"
|
||||
ZERO3 = "zero3"
|
||||
|
||||
FP16 = "fp16"
|
||||
BF16 = "bf16"
|
||||
|
||||
CUSTOM_OPTIMIZER = "custom_optimizer"
|
||||
CUSTOM_SCHEDULER = "custom_scheduler"
|
||||
DS_OPTIMIZER = "deepspeed_optimizer"
|
||||
DS_SCHEDULER = "deepspeed_scheduler"
|
||||
|
||||
stages = [ZERO2, ZERO3]
|
||||
optims = [CUSTOM_OPTIMIZER, DS_OPTIMIZER]
|
||||
schedulers = [CUSTOM_SCHEDULER, DS_SCHEDULER]
|
||||
if is_torch_bf16_available():
|
||||
dtypes = [FP16, BF16]
|
||||
else:
|
||||
dtypes = [FP16]
|
||||
|
||||
|
||||
def parameterized_custom_name_func(func, param_num, param):
|
||||
# customize the test name generator function as we want both params to appear in the sub-test
|
||||
# name, as by default it shows only the first param
|
||||
param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
|
||||
return f"{func.__name__}_{param_based_name}"
|
||||
|
||||
|
||||
# Cartesian product of ZeRO stages and dtypes to test
|
||||
params = list(itertools.product(stages, dtypes))
|
||||
optim_scheduler_params = list(itertools.product(optims, schedulers))
|
||||
|
||||
|
||||
@require_deepspeed
|
||||
@require_cuda
|
||||
class DeepSpeedConfigIntegration(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
self._test_file_path = inspect.getfile(self.__class__)
|
||||
path = Path(self._test_file_path).resolve()
|
||||
self.test_file_dir_str = str(path.parents[0])
|
||||
|
||||
self.ds_config_file = dict(
|
||||
zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
|
||||
zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
|
||||
)
|
||||
|
||||
# use self.get_config_dict(stage) to access these so the originals are not modified
|
||||
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
|
||||
config_zero2 = json.load(f)
|
||||
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
|
||||
config_zero3 = json.load(f)
|
||||
# The following setting slows things down, so don't enable it by default unless needed by a test.
|
||||
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
|
||||
config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False
|
||||
|
||||
self.ds_config_dict = dict(zero2=config_zero2, zero3=config_zero3)
|
||||
|
||||
self.dist_env = dict(
|
||||
USE_DEEPSPEED="true",
|
||||
MASTER_ADDR="localhost",
|
||||
MASTER_PORT="10999",
|
||||
RANK="0",
|
||||
LOCAL_RANK="0",
|
||||
WORLD_SIZE="1",
|
||||
)
|
||||
|
||||
def get_config_dict(self, stage):
|
||||
# As some tests modify the dict, always make a copy
|
||||
return deepcopy(self.ds_config_dict[stage])
|
||||
|
||||
@parameterized.expand(stages, name_func=parameterized_custom_name_func)
|
||||
def test_deepspeed_plugin(self, stage):
|
||||
|
||||
# Test zero3_init_flag will be set to False when ZeRO stage != 3
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
gradient_accumulation_steps=1,
|
||||
gradient_clipping=1.0,
|
||||
zero_stage=2,
|
||||
offload_optimizer_device="cpu",
|
||||
offload_param_device="cpu",
|
||||
zero3_save_16bit_model=True,
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
self.assertFalse(deepspeed_plugin.zero3_init_flag)
|
||||
deepspeed_plugin.deepspeed_config = None
|
||||
|
||||
# Test zero3_init_flag will be set to True only when ZeRO stage == 3
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
gradient_accumulation_steps=1,
|
||||
gradient_clipping=1.0,
|
||||
zero_stage=3,
|
||||
offload_optimizer_device="cpu",
|
||||
offload_param_device="cpu",
|
||||
zero3_save_16bit_model=True,
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
self.assertTrue(deepspeed_plugin.zero3_init_flag)
|
||||
deepspeed_plugin.deepspeed_config = None
|
||||
|
||||
# Test config files are loaded correctly
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[stage], zero3_init_flag=True)
|
||||
if stage == ZERO2:
|
||||
self.assertFalse(deepspeed_plugin.zero3_init_flag)
|
||||
elif stage == ZERO3:
|
||||
self.assertTrue(deepspeed_plugin.zero3_init_flag)
|
||||
|
||||
# Test `gradient_accumulation_steps` is set to 1 if unavailable in config file
|
||||
with tempfile.TemporaryDirectory() as dirpath:
|
||||
ds_config = self.get_config_dict(stage)
|
||||
del ds_config["gradient_accumulation_steps"]
|
||||
with open(os.path.join(dirpath, "ds_config.json"), "w") as out_file:
|
||||
json.dump(ds_config, out_file)
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
|
||||
self.assertEqual(deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"], 1)
|
||||
deepspeed_plugin.deepspeed_config = None
|
||||
|
||||
# Test `ValueError` is raised if `zero_optimization` is unavailable in config file
|
||||
with tempfile.TemporaryDirectory() as dirpath:
|
||||
ds_config = self.get_config_dict(stage)
|
||||
del ds_config["zero_optimization"]
|
||||
with open(os.path.join(dirpath, "ds_config.json"), "w") as out_file:
|
||||
json.dump(ds_config, out_file)
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=os.path.join(dirpath, "ds_config.json"))
|
||||
self.assertTrue(
|
||||
"Please specify the ZeRO optimization config in the DeepSpeed config." in str(cm.exception)
|
||||
)
|
||||
deepspeed_plugin.deepspeed_config = None
|
||||
|
||||
# Test `deepspeed_config_process`
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[stage])
|
||||
kwargs = {
|
||||
"fp16.enabled": True,
|
||||
"bf16.enabled": False,
|
||||
"optimizer.params.lr": 5e-5,
|
||||
"optimizer.params.weight_decay": 0.0,
|
||||
"scheduler.params.warmup_min_lr": 0.0,
|
||||
"scheduler.params.warmup_max_lr": 5e-5,
|
||||
"scheduler.params.warmup_num_steps": 0,
|
||||
"train_micro_batch_size_per_gpu": 16,
|
||||
"gradient_clipping": 1.0,
|
||||
"train_batch_size": 16,
|
||||
"zero_optimization.reduce_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_param_persistence_threshold": 5e5,
|
||||
"zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
|
||||
}
|
||||
deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
for ds_key_long, value in kwargs.items():
|
||||
config, ds_key = deepspeed_plugin.hf_ds_config.find_config_node(ds_key_long)
|
||||
if config.get(ds_key) is not None:
|
||||
self.assertEqual(config.get(ds_key), value)
|
||||
|
||||
# Test mismatches
|
||||
mismatches = {
|
||||
"optimizer.params.lr": 1e-5,
|
||||
"optimizer.params.weight_decay": 1e-5,
|
||||
"gradient_accumulation_steps": 2,
|
||||
}
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
new_kwargs = deepcopy(kwargs)
|
||||
new_kwargs.update(mismatches)
|
||||
deepspeed_plugin.deepspeed_config_process(**new_kwargs)
|
||||
for key in mismatches.keys():
|
||||
self.assertTrue(
|
||||
key in str(cm.exception),
|
||||
f"{key} is not in the exception message:\n{cm.exception}",
|
||||
)
|
||||
|
||||
# Test `ValueError` is raised if some config file fields with `auto` value is missing in `kwargs`
|
||||
deepspeed_plugin.deepspeed_config["optimizer"]["params"]["lr"] = "auto"
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
del kwargs["optimizer.params.lr"]
|
||||
deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
self.assertTrue("`optimizer.params.lr` not found in kwargs." in str(cm.exception))
|
||||
|
||||
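A minimal sketch of the API surface the test above exercises, for readers skimming the diff. The config path is a placeholder and the snippet assumes DeepSpeed is installed; everything else mirrors calls made in test_deepspeed_plugin.

# Sketch, not part of the test suite: filling the `auto` entries of a DeepSpeed
# config through DeepSpeedPlugin.deepspeed_config_process.
# "ds_config_zero2.json" is a placeholder for a ZeRO-2 config that uses `auto` values.
from accelerate.utils import DeepSpeedPlugin

plugin = DeepSpeedPlugin(hf_ds_config="ds_config_zero2.json", zero3_init_flag=True)
# zero3_init_flag is only honoured for ZeRO stage 3, as asserted above.

# Dot-separated keys address nested config nodes. Every field left as `auto`
# in the file must be covered here, and values that conflict with non-`auto`
# entries raise the ValueError checked by the mismatch test above.
plugin.deepspeed_config_process(
    **{
        "optimizer.params.lr": 5e-5,
        "optimizer.params.weight_decay": 0.0,
        "scheduler.params.warmup_min_lr": 0.0,
        "scheduler.params.warmup_max_lr": 5e-5,
        "scheduler.params.warmup_num_steps": 0,
        "train_micro_batch_size_per_gpu": 16,
        "gradient_clipping": 1.0,
        "train_batch_size": 16,
    }
)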
@parameterized.expand([FP16, BF16], name_func=parameterized_custom_name_func)
|
||||
def test_accelerate_state_deepspeed(self, dtype):
|
||||
state = AcceleratorState(_from_accelerator=True)
|
||||
if state.initialized:
|
||||
state.initialized = False
|
||||
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
gradient_accumulation_steps=1,
|
||||
gradient_clipping=1.0,
|
||||
zero_stage=ZERO2,
|
||||
offload_optimizer_device="cpu",
|
||||
offload_param_device="cpu",
|
||||
zero3_save_16bit_model=True,
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
with mockenv_context(**self.dist_env):
|
||||
state = Accelerator(mixed_precision=dtype, deepspeed_plugin=deepspeed_plugin).state
|
||||
self.assertTrue(state.deepspeed_plugin.deepspeed_config[dtype]["enabled"])
|
||||
state.initialized = False
|
||||
|
||||
def test_init_zero3(self):
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
gradient_accumulation_steps=1,
|
||||
gradient_clipping=1.0,
|
||||
zero_stage=3,
|
||||
offload_optimizer_device="cpu",
|
||||
offload_param_device="cpu",
|
||||
zero3_save_16bit_model=True,
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
from transformers.deepspeed import is_deepspeed_zero3_enabled
|
||||
|
||||
self.assertTrue(is_deepspeed_zero3_enabled())
|
||||
accelerator.state.initialized = False
|
||||
|
||||
@parameterized.expand(optim_scheduler_params, name_func=parameterized_custom_name_func)
|
||||
def test_prepare_deepspeed(self, optim_type, scheduler_type):
|
||||
# 1. Testing with one of the ZeRO Stages is enough to test the `_prepare_deepspeed` function.
|
||||
# Here we test using ZeRO Stage 2 with FP16 enabled.
|
||||
from deepspeed.runtime.engine import DeepSpeedEngine
|
||||
|
||||
kwargs = {
|
||||
"fp16.enabled": True,
|
||||
"bf16.enabled": False,
|
||||
"optimizer.params.lr": 5e-5,
|
||||
"optimizer.params.weight_decay": 0.0,
|
||||
"scheduler.params.warmup_min_lr": 0.0,
|
||||
"scheduler.params.warmup_max_lr": 5e-5,
|
||||
"scheduler.params.warmup_num_steps": 0,
|
||||
"train_micro_batch_size_per_gpu": 16,
|
||||
"gradient_clipping": 1.0,
|
||||
"train_batch_size": 16,
|
||||
"zero_optimization.reduce_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_param_persistence_threshold": 5e5,
|
||||
"zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
|
||||
}
|
||||
|
||||
if optim_type == CUSTOM_OPTIMIZER and scheduler_type == CUSTOM_SCHEDULER:
|
||||
# Test custom optimizer + custom scheduler
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
gradient_accumulation_steps=1,
|
||||
gradient_clipping=1.0,
|
||||
zero_stage=2,
|
||||
offload_optimizer_device="cpu",
|
||||
offload_param_device="cpu",
|
||||
zero3_save_16bit_model=False,
|
||||
zero3_init_flag=False,
|
||||
)
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(mixed_precision="fp16", deepspeed_plugin=deepspeed_plugin)
|
||||
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=32, shuffle=False)
|
||||
model = AutoModel.from_pretrained(GPT2_TINY)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
|
||||
lr_scheduler = get_scheduler(
|
||||
name="linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=1000,
|
||||
)
|
||||
dummy_optimizer = DummyOptim(params=model.parameters())
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
|
||||
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You cannot create a `DummyOptim` without specifying an optimizer in the config file."
|
||||
in str(cm.exception)
|
||||
)
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You cannot create a `DummyScheduler` without specifying a scheduler in the config file."
|
||||
in str(cm.exception)
|
||||
)
|
||||
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
|
||||
self.assertTrue(
|
||||
"You must specify a training or evaluation dataloader in `accelerate.prepare()` when using DeepSpeed."
|
||||
in str(cm.exception)
|
||||
)
|
||||
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
self.assertTrue(accelerator.deepspeed_config["zero_allow_untested_optimizer"])
|
||||
self.assertEqual(accelerator.deepspeed_config["train_batch_size"], 16)
|
||||
self.assertEqual(type(model), DeepSpeedEngine)
|
||||
self.assertEqual(type(optimizer), DeepSpeedOptimizerWrapper)
|
||||
self.assertEqual(type(lr_scheduler), AcceleratedScheduler)
|
||||
self.assertEqual(type(accelerator.deepspeed_engine_wrapped), DeepSpeedEngineWrapper)
|
||||
|
||||
elif optim_type == DS_OPTIMIZER and scheduler_type == DS_SCHEDULER:
|
||||
# Test DeepSpeed optimizer + DeepSpeed scheduler
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
|
||||
model = AutoModel.from_pretrained(GPT2_TINY)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
|
||||
lr_scheduler = get_scheduler(
|
||||
name="linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=1000,
|
||||
)
|
||||
dummy_optimizer = DummyOptim(params=model.parameters())
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
|
||||
kwargs["train_batch_size"] = (
|
||||
kwargs["train_micro_batch_size_per_gpu"]
|
||||
* deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
|
||||
* accelerator.num_processes
|
||||
)
|
||||
accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You cannot specify an optimizer in the config file and in the code at the same time"
|
||||
in str(cm.exception)
|
||||
)
|
||||
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You cannot specify a scheduler in the config file and in the code at the same time"
|
||||
in str(cm.exception)
|
||||
)
|
||||
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You cannot specify a scheduler in the config file and in the code at the same time"
|
||||
in str(cm.exception)
|
||||
)
|
||||
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
self.assertEqual(type(model), DeepSpeedEngine)
|
||||
self.assertEqual(type(optimizer), DeepSpeedOptimizerWrapper)
|
||||
self.assertEqual(type(lr_scheduler), DeepSpeedSchedulerWrapper)
|
||||
self.assertEqual(type(accelerator.deepspeed_engine_wrapped), DeepSpeedEngineWrapper)
|
||||
|
||||
elif optim_type == CUSTOM_OPTIMIZER and scheduler_type == DS_SCHEDULER:
|
||||
# Test custom optimizer + DeepSpeed scheduler
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
|
||||
model = AutoModel.from_pretrained(GPT2_TINY)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
|
||||
lr_scheduler = get_scheduler(
|
||||
name="linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=1000,
|
||||
)
|
||||
dummy_optimizer = DummyOptim(params=model.parameters())
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
|
||||
kwargs["train_batch_size"] = (
|
||||
kwargs["train_micro_batch_size_per_gpu"]
|
||||
* deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
|
||||
* accelerator.num_processes
|
||||
)
|
||||
accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
del accelerator.state.deepspeed_plugin.deepspeed_config["optimizer"]
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
self.assertEqual(type(model), DeepSpeedEngine)
|
||||
self.assertEqual(type(optimizer), DeepSpeedOptimizerWrapper)
|
||||
self.assertEqual(type(lr_scheduler), DeepSpeedSchedulerWrapper)
|
||||
self.assertEqual(type(accelerator.deepspeed_engine_wrapped), DeepSpeedEngineWrapper)
|
||||
elif optim_type == DS_OPTIMIZER and scheduler_type == CUSTOM_SCHEDULER:
|
||||
# Test deepspeed optimizer + custom scheduler
|
||||
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.ds_config_file[ZERO2])
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=10, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=5, shuffle=False)
|
||||
model = AutoModel.from_pretrained(GPT2_TINY)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
|
||||
lr_scheduler = get_scheduler(
|
||||
name="linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=1000,
|
||||
)
|
||||
dummy_optimizer = DummyOptim(params=model.parameters())
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
|
||||
kwargs["train_batch_size"] = (
|
||||
kwargs["train_micro_batch_size_per_gpu"]
|
||||
* deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
|
||||
* accelerator.num_processes
|
||||
)
|
||||
accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
del accelerator.state.deepspeed_plugin.deepspeed_config["scheduler"]
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||
)
|
||||
self.assertTrue(
|
||||
"You can only specify `accelerate.utils.DummyScheduler` in the code when using `accelerate.utils.DummyOptim`."
|
||||
in str(cm.exception)
|
||||
)
|
||||
accelerator.state.initialized = False
|
||||
|
||||
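For contrast with the error paths asserted above, a hedged sketch of the accepted pairing when the DeepSpeed config file defines both optimizer and scheduler: placeholder objects on the Python side, real definitions in the JSON. The config path and the toy model/data are placeholders, and the snippet assumes a DeepSpeed-capable process launched with `accelerate launch`.

# Sketch only, mirroring the DS_OPTIMIZER + DS_SCHEDULER branch above.
# Assumes deepspeed is installed and the script runs under `accelerate launch`;
# "ds_config_zero2.json" is a placeholder config defining optimizer + scheduler.
import torch
from torch.utils.data import DataLoader, TensorDataset

from accelerate import Accelerator
from accelerate.utils import DeepSpeedPlugin, DummyOptim, DummyScheduler

plugin = DeepSpeedPlugin(hf_ds_config="ds_config_zero2.json")
accelerator = Accelerator(deepspeed_plugin=plugin)

model = torch.nn.Linear(3, 1)
train_dataloader = DataLoader(TensorDataset(torch.randn(80, 3), torch.randn(80, 1)), batch_size=16)

# Passing a real optimizer or scheduler here would raise the ValueErrors
# asserted above; the Dummy* placeholders defer to the config file instead.
optimizer = DummyOptim(params=model.parameters())
lr_scheduler = DummyScheduler(optimizer)

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)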
def test_save_checkpoints(self):
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
hf_ds_config=self.ds_config_file[ZERO3],
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
del deepspeed_plugin.deepspeed_config["bf16"]
|
||||
kwargs = {
|
||||
"fp16.enabled": True,
|
||||
"bf16.enabled": False,
|
||||
"optimizer.params.lr": 5e-5,
|
||||
"optimizer.params.weight_decay": 0.0,
|
||||
"scheduler.params.warmup_min_lr": 0.0,
|
||||
"scheduler.params.warmup_max_lr": 5e-5,
|
||||
"scheduler.params.warmup_num_steps": 0,
|
||||
"train_micro_batch_size_per_gpu": 16,
|
||||
"gradient_clipping": 1.0,
|
||||
"train_batch_size": 16,
|
||||
"zero_optimization.reduce_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_prefetch_bucket_size": 5e5,
|
||||
"zero_optimization.stage3_param_persistence_threshold": 5e5,
|
||||
"zero_optimization.stage3_gather_16bit_weights_on_model_save": False,
|
||||
}
|
||||
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
kwargs["train_batch_size"] = (
|
||||
kwargs["train_micro_batch_size_per_gpu"]
|
||||
* deepspeed_plugin.deepspeed_config["gradient_accumulation_steps"]
|
||||
* accelerator.num_processes
|
||||
)
|
||||
accelerator.state.deepspeed_plugin.deepspeed_config_process(**kwargs)
|
||||
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=32, shuffle=False)
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
dummy_optimizer = DummyOptim(params=model.parameters())
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer)
|
||||
|
||||
model, _, train_dataloader, eval_dataloader, _ = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
with self.assertRaises(ValueError) as cm:
|
||||
accelerator.get_state_dict(model)
|
||||
msg = (
|
||||
"Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
|
||||
"To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
|
||||
"set `zero3_save_16bit_model` to True when using `accelerate config`. "
|
||||
"To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
|
||||
)
|
||||
self.assertTrue(msg in str(cm.exception))
|
||||
accelerator.state.initialized = False
|
||||
|
||||
def test_autofill_dsconfig(self):
|
||||
deepspeed_plugin = DeepSpeedPlugin(
|
||||
hf_ds_config=self.ds_config_file[ZERO3],
|
||||
zero3_init_flag=True,
|
||||
)
|
||||
del deepspeed_plugin.deepspeed_config["bf16"]
|
||||
del deepspeed_plugin.deepspeed_config["fp16"]
|
||||
|
||||
with mockenv_context(**self.dist_env):
|
||||
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
|
||||
train_set = RegressionDataset(length=80)
|
||||
eval_set = RegressionDataset(length=20)
|
||||
train_dataloader = DataLoader(train_set, batch_size=16, shuffle=True)
|
||||
eval_dataloader = DataLoader(eval_set, batch_size=32, shuffle=False)
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
dummy_optimizer = DummyOptim(params=model.parameters(), lr=5e-5, weight_decay=1e-4)
|
||||
dummy_lr_scheduler = DummyScheduler(dummy_optimizer, warmup_num_steps=10, total_num_steps=1000)
|
||||
hidden_size = model.config.hidden_size
|
||||
model, _, train_dataloader, eval_dataloader, _ = accelerator.prepare(
|
||||
model, dummy_optimizer, train_dataloader, eval_dataloader, dummy_lr_scheduler
|
||||
)
|
||||
self.assertEqual(accelerator.deepspeed_config["train_micro_batch_size_per_gpu"], 16)
|
||||
self.assertEqual(accelerator.deepspeed_config["train_batch_size"], 16)
|
||||
|
||||
self.assertEqual(accelerator.deepspeed_config["optimizer"]["params"]["lr"], 5e-5)
|
||||
self.assertEqual(accelerator.deepspeed_config["optimizer"]["params"]["weight_decay"], 1e-4)
|
||||
|
||||
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_min_lr"], 0.0)
|
||||
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_max_lr"], 5e-5)
|
||||
self.assertEqual(accelerator.deepspeed_config["scheduler"]["params"]["warmup_num_steps"], 10)
|
||||
|
||||
self.assertEqual(accelerator.deepspeed_config["gradient_clipping"], 1.0)
|
||||
self.assertEqual(
|
||||
accelerator.deepspeed_config["zero_optimization"]["reduce_bucket_size"], hidden_size * hidden_size
|
||||
)
|
||||
self.assertEqual(
|
||||
accelerator.deepspeed_config["zero_optimization"]["stage3_prefetch_bucket_size"],
|
||||
0.9 * hidden_size * hidden_size,
|
||||
)
|
||||
self.assertEqual(
|
||||
accelerator.deepspeed_config["zero_optimization"]["stage3_param_persistence_threshold"],
|
||||
10 * hidden_size,
|
||||
)
|
||||
self.assertFalse(
|
||||
accelerator.deepspeed_config["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"]
|
||||
)
|
||||
accelerator.state.initialized = False
|
||||
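The `auto` values asserted in test_autofill_dsconfig follow a simple rule of thumb tied to the model's hidden size (768 for gpt2) and the dataloader batch size; the arithmetic below just restates those assertions and is not library code.

# Illustrative arithmetic only: how the autofilled ZeRO-3 values above are derived
# for gpt2 (hidden_size = 768) and a per-device batch size of 16.
hidden_size = 768
micro_batch_size, grad_accum_steps, num_processes = 16, 1, 1

train_batch_size = micro_batch_size * grad_accum_steps * num_processes  # 16
reduce_bucket_size = hidden_size * hidden_size                          # 589824
stage3_prefetch_bucket_size = 0.9 * hidden_size * hidden_size           # 530841.6
stage3_param_persistence_threshold = 10 * hidden_size                   # 7680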
448
tests/test_big_modeling.py
Normal file
@@ -0,0 +1,448 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from accelerate.big_modeling import (
|
||||
cpu_offload,
|
||||
disk_offload,
|
||||
dispatch_model,
|
||||
init_empty_weights,
|
||||
load_checkpoint_and_dispatch,
|
||||
)
|
||||
from accelerate.hooks import remove_hook_from_submodules
|
||||
from accelerate.test_utils import require_cuda, require_multi_gpu, slow
|
||||
from accelerate.utils import offload_state_dict
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
class ModelForTest(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.linear1 = nn.Linear(3, 4)
|
||||
self.batchnorm = nn.BatchNorm1d(4)
|
||||
self.linear2 = nn.Linear(4, 5)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(self.batchnorm(self.linear1(x)))
|
||||
|
||||
|
||||
class BiggerModelForTest(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.linear1 = nn.Linear(3, 4)
|
||||
self.linear2 = nn.Linear(4, 5)
|
||||
self.batchnorm = nn.BatchNorm1d(5)
|
||||
self.linear3 = nn.Linear(5, 6)
|
||||
self.linear4 = nn.Linear(6, 5)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear4(self.linear3(self.batchnorm(self.linear2(self.linear1(x)))))
|
||||
|
||||
|
||||
# To test preload_module_classes
|
||||
class ModuleWithUnusedSubModules(nn.Module):
|
||||
def __init__(self, input_dim, output_dim):
|
||||
super().__init__()
|
||||
self.linear = nn.Linear(input_dim, output_dim)
|
||||
|
||||
def forward(self, x):
|
||||
return x @ self.linear.weight.t() + self.linear.bias
|
||||
|
||||
|
||||
class ModelWithUnusedSubModulesForTest(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.linear1 = ModuleWithUnusedSubModules(3, 4)
|
||||
self.linear2 = ModuleWithUnusedSubModules(4, 5)
|
||||
self.batchnorm = nn.BatchNorm1d(5)
|
||||
self.linear3 = ModuleWithUnusedSubModules(5, 6)
|
||||
self.linear4 = ModuleWithUnusedSubModules(6, 5)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear4(self.linear3(self.batchnorm(self.linear2(self.linear1(x)))))
|
||||
|
||||
|
||||
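Before the test class, a brief sketch of the call pattern these toy models exist for; it mirrors test_cpu_offload below and runs on CPU. Only the variable names are new; it relies on ModelForTest and the imports defined above.

# Sketch only: offload ModelForTest's parameters and run it on the execution
# device, exactly as test_cpu_offload below asserts. Runs on CPU if no GPU.
import torch
from accelerate.big_modeling import cpu_offload
from accelerate.hooks import remove_hook_from_submodules

model = ModelForTest()  # defined above in this file
x = torch.randn(2, 3)
reference = model(x)

device = torch.device(0 if torch.cuda.is_available() else "cpu")
cpu_offload(model, execution_device=device)
assert torch.allclose(reference, model(x).cpu(), atol=1e-5)

remove_hook_from_submodules(model)  # put the plain module back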
class BigModelingTester(unittest.TestCase):
|
||||
def test_init_empty_weights(self):
|
||||
# base use
|
||||
with init_empty_weights():
|
||||
module = nn.Linear(4, 5)
|
||||
self.assertEqual(module.weight.device, torch.device("meta"))
|
||||
|
||||
# base use with buffers, they are not touched
|
||||
with init_empty_weights():
|
||||
module = nn.BatchNorm1d(4)
|
||||
self.assertEqual(module.weight.device, torch.device("meta"))
|
||||
self.assertEqual(module.running_mean.device, torch.device("cpu"))
|
||||
|
||||
# Use with include_buffers=True
|
||||
with init_empty_weights(include_buffers=True):
|
||||
module = nn.BatchNorm1d(4)
|
||||
self.assertEqual(module.weight.device, torch.device("meta"))
|
||||
self.assertEqual(module.running_mean.device, torch.device("meta"))
|
||||
|
||||
# Double check we didn't break PyTorch
|
||||
module = nn.BatchNorm1d(4)
|
||||
self.assertEqual(module.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(module.running_mean.device, torch.device("cpu"))
|
||||
|
||||
def test_init_empty_weights_very_large_model(self):
|
||||
# This is a 100-billion-parameter model.
|
||||
with init_empty_weights():
|
||||
_ = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
|
||||
|
||||
def test_cpu_offload(self):
|
||||
model = ModelForTest()
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
device = torch.device(0 if torch.cuda.is_available() else "cpu")
|
||||
|
||||
cpu_offload(model, execution_device=device)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
# Clean up for next test.
|
||||
remove_hook_from_submodules(model)
|
||||
|
||||
cpu_offload(model, execution_device=device, offload_buffers=True)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
def test_cpu_offload_with_unused_submodules(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
device = torch.device(0 if torch.cuda.is_available() else "cpu")
|
||||
|
||||
cpu_offload(model, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"])
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
# Clean up for next test.
|
||||
remove_hook_from_submodules(model)
|
||||
|
||||
cpu_offload(
|
||||
model,
|
||||
execution_device=device,
|
||||
offload_buffers=True,
|
||||
preload_module_classes=["ModuleWithUnusedSubModules"],
|
||||
)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_cuda
|
||||
def test_cpu_offload_gpt2(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
|
||||
|
||||
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
cpu_offload(gpt2, execution_device=0)
|
||||
outputs = gpt2.generate(inputs["input_ids"])
|
||||
self.assertEqual(
|
||||
tokenizer.decode(outputs[0].tolist()),
|
||||
"Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo",
|
||||
)
|
||||
|
||||
def test_disk_offload(self):
|
||||
model = ModelForTest()
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
device = torch.device(0 if torch.cuda.is_available() else "cpu")
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
disk_offload(model, tmp_dir, execution_device=device)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
# Clean up for next test.
|
||||
remove_hook_from_submodules(model)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
disk_offload(model, tmp_dir, execution_device=device, offload_buffers=True)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
def test_disk_offload_with_unused_submodules(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
device = torch.device(0 if torch.cuda.is_available() else "cpu")
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
disk_offload(
|
||||
model, tmp_dir, execution_device=device, preload_module_classes=["ModuleWithUnusedSubModules"]
|
||||
)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
# Clean up for next test.
|
||||
remove_hook_from_submodules(model)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
disk_offload(
|
||||
model,
|
||||
tmp_dir,
|
||||
execution_device=device,
|
||||
offload_buffers=True,
|
||||
preload_module_classes=["ModuleWithUnusedSubModules"],
|
||||
)
|
||||
output = model(x)
|
||||
self.assertTrue(
|
||||
torch.allclose(expected, output.cpu(), 1e-4, 1e-5), msg=f"Expected: {expected}\nActual: {output.cpu()}"
|
||||
)
|
||||
|
||||
@slow
|
||||
@require_cuda
|
||||
def test_disk_offload_gpt2(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
|
||||
|
||||
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
disk_offload(gpt2, tmp_dir, execution_device=0)
|
||||
outputs = gpt2.generate(inputs["input_ids"])
|
||||
self.assertEqual(
|
||||
tokenizer.decode(outputs[0].tolist()),
|
||||
"Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo",
|
||||
)
|
||||
|
||||
@require_cuda
|
||||
def test_dispatch_model(self):
|
||||
model = ModelForTest()
|
||||
device_map = {"linear1": "disk", "batchnorm": "cpu", "linear2": 0}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
dispatch_model(model, device_map, offload_dir=tmp_dir)
|
||||
output = model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_multi_gpu
|
||||
def test_dispatch_model_multi_gpu(self):
|
||||
model = BiggerModelForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
dispatch_model(model, device_map, offload_dir=tmp_dir)
|
||||
output = model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@slow
|
||||
@require_multi_gpu
|
||||
def test_dispatch_model_gpt2_on_two_gpus(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
inputs = tokenizer("Hello world! My name is", return_tensors="pt").to(0)
|
||||
|
||||
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
# Dispatch on GPUs 0 and 1
|
||||
device_map = {
|
||||
"transformer.wte": 0,
|
||||
"transformer.wpe": 0,
|
||||
"transformer.ln_f": 1,
|
||||
"lm_head": 1,
|
||||
}
|
||||
for i in range(12):
|
||||
device_map[f"transformer.h.{i}"] = 0 if i <= 5 else 1
|
||||
|
||||
gpt2 = dispatch_model(gpt2, device_map)
|
||||
outputs = gpt2.generate(inputs["input_ids"])
|
||||
self.assertEqual(
|
||||
tokenizer.decode(outputs[0].tolist()),
|
||||
"Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo",
|
||||
)
|
||||
|
||||
# Dispatch with a bit of CPU offload
|
||||
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
for i in range(4):
|
||||
device_map[f"transformer.h.{i}"] = "cpu"
|
||||
gpt2 = dispatch_model(gpt2, device_map)
|
||||
outputs = gpt2.generate(inputs["input_ids"])
|
||||
self.assertEqual(
|
||||
tokenizer.decode(outputs[0].tolist()),
|
||||
"Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo",
|
||||
)
|
||||
# Dispatch with a bit of CPU and disk offload
|
||||
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
for i in range(2):
|
||||
device_map[f"transformer.h.{i}"] = "disk"
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
state_dict = {
|
||||
k: p for k, p in gpt2.state_dict().items() if "transformer.h.0" in k or "transformer.h.1" in k
|
||||
}
|
||||
offload_state_dict(tmp_dir, state_dict)
|
||||
gpt2 = dispatch_model(gpt2, device_map, offload_dir=tmp_dir)
|
||||
outputs = gpt2.generate(inputs["input_ids"])
|
||||
self.assertEqual(
|
||||
tokenizer.decode(outputs[0].tolist()),
|
||||
"Hello world! My name is Kiyoshi, and I'm a student at the University of Tokyo",
|
||||
)
|
||||
|
||||
@require_cuda
|
||||
def test_dispatch_model_with_unused_submodules(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 0}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
dispatch_model(
|
||||
model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
|
||||
)
|
||||
output = model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_multi_gpu
|
||||
def test_dispatch_model_with_unused_submodules_multi_gpu(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "disk", "batchnorm": "cpu", "linear3": 0, "linear4": 1}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
dispatch_model(
|
||||
model, device_map, offload_dir=tmp_dir, preload_module_classes=["ModuleWithUnusedSubModules"]
|
||||
)
|
||||
output = model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_cuda
|
||||
def test_load_checkpoint_and_dispatch(self):
|
||||
model = ModelForTest()
|
||||
device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
|
||||
torch.save(model.state_dict(), checkpoint)
|
||||
|
||||
new_model = ModelForTest()
|
||||
new_model = load_checkpoint_and_dispatch(new_model, checkpoint, device_map=device_map)
|
||||
|
||||
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
|
||||
self.assertEqual(new_model.linear1.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear2.weight.device, torch.device(0))
|
||||
|
||||
output = new_model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_multi_gpu
|
||||
def test_load_checkpoint_and_dispatch_multi_gpu(self):
|
||||
model = BiggerModelForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
|
||||
torch.save(model.state_dict(), checkpoint)
|
||||
|
||||
new_model = BiggerModelForTest()
|
||||
new_model = load_checkpoint_and_dispatch(new_model, checkpoint, device_map=device_map)
|
||||
|
||||
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
|
||||
self.assertEqual(new_model.linear1.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear2.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear3.weight.device, torch.device(0))
|
||||
self.assertEqual(new_model.linear4.weight.device, torch.device(1))
|
||||
|
||||
output = new_model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_cuda
|
||||
def test_load_checkpoint_and_dispatch_with_unused_submodules(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 0}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
|
||||
torch.save(model.state_dict(), checkpoint)
|
||||
|
||||
new_model = ModelWithUnusedSubModulesForTest()
|
||||
new_model = load_checkpoint_and_dispatch(
|
||||
new_model, checkpoint, device_map=device_map, preload_module_classes=["ModuleWithUnusedSubModules"]
|
||||
)
|
||||
|
||||
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
|
||||
self.assertEqual(new_model.linear1.linear.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear2.linear.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear3.linear.weight.device, torch.device(0))
|
||||
self.assertEqual(new_model.linear4.linear.weight.device, torch.device(0))
|
||||
|
||||
output = new_model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
|
||||
@require_multi_gpu
|
||||
def test_load_checkpoint_and_dispatch_multi_gpu_with_unused_submodules(self):
|
||||
model = ModelWithUnusedSubModulesForTest()
|
||||
device_map = {"linear1": "cpu", "linear2": "cpu", "batchnorm": 0, "linear3": 0, "linear4": 1}
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
expected = model(x)
|
||||
|
||||
with TemporaryDirectory() as tmp_dir:
|
||||
checkpoint = os.path.join(tmp_dir, "pt_model.bin")
|
||||
torch.save(model.state_dict(), checkpoint)
|
||||
|
||||
new_model = ModelWithUnusedSubModulesForTest()
|
||||
new_model = load_checkpoint_and_dispatch(
|
||||
new_model, checkpoint, device_map=device_map, preload_module_classes=["ModuleWithUnusedSubModules"]
|
||||
)
|
||||
|
||||
# CPU-offloaded weights are on the meta device while waiting for the forward pass.
|
||||
self.assertEqual(new_model.linear1.linear.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear2.linear.weight.device, torch.device("meta"))
|
||||
self.assertEqual(new_model.linear3.linear.weight.device, torch.device(0))
|
||||
self.assertEqual(new_model.linear4.linear.weight.device, torch.device(1))
|
||||
|
||||
output = new_model(x)
|
||||
self.assertTrue(torch.allclose(expected, output.cpu(), atol=1e-5))
|
||||
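The tests above exercise init_empty_weights and load_checkpoint_and_dispatch separately; in practice they are usually combined as below. This is a hedged sketch: "pt_model.bin" and the device_map are placeholders, and it reuses the ModelForTest class defined earlier in this file.

# Sketch only: instantiate the model without allocating weights, then load and
# dispatch a saved checkpoint. "pt_model.bin" is a placeholder path.
import torch
from accelerate.big_modeling import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    empty_model = ModelForTest()  # parameters live on the meta device, no memory used

device_map = {"linear1": "cpu", "batchnorm": "cpu", "linear2": 0 if torch.cuda.is_available() else "cpu"}
model = load_checkpoint_and_dispatch(empty_model, "pt_model.bin", device_map=device_map)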
@@ -15,9 +15,10 @@
|
||||
import unittest
|
||||
|
||||
from accelerate import debug_launcher
|
||||
from accelerate.test_utils import test_script
|
||||
from accelerate.test_utils import require_cpu, test_script
|
||||
|
||||
|
||||
class MultiTPUTester(unittest.TestCase):
|
||||
@require_cpu
|
||||
class MultiCPUTester(unittest.TestCase):
|
||||
def test_cpu(self):
|
||||
debug_launcher(test_script.main)
|
||||
|
||||
@@ -12,72 +12,33 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ast
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
import torch
|
||||
|
||||
from accelerate import DistributedType
|
||||
from accelerate.test_utils.examples import compare_against_test
|
||||
from accelerate.test_utils.testing import TempDirTestCase, slow
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoTokenizer
|
||||
from accelerate.test_utils.testing import TempDirTestCase, require_trackers, run_command, slow
|
||||
from accelerate.utils import write_basic_config
|
||||
|
||||
|
||||
SRC_DIRS = [os.path.abspath(os.path.join("examples", "by_feature"))]
|
||||
sys.path.extend(SRC_DIRS)
|
||||
|
||||
if SRC_DIRS is not None:
|
||||
import checkpointing
|
||||
import cross_validation
|
||||
import multi_process_metrics
|
||||
import tracking
|
||||
|
||||
# DataLoaders built from `test_samples/MRPC` for quick testing
|
||||
# Should mock `{script_name}.get_dataloaders` via:
|
||||
# @mock.patch("{script_name}.get_dataloaders", mocked_dataloaders)
|
||||
|
||||
EXCLUDE_EXAMPLES = ["cross_validation.py", "multi_process_metrics.py", "memory.py"]
|
||||
|
||||
|
||||
def mocked_dataloaders(accelerator, batch_size: int = 16):
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
|
||||
datasets = load_dataset("csv", data_files=data_files)
|
||||
label_list = datasets["train"].unique("label")
|
||||
|
||||
label_to_id = {v: i for i, v in enumerate(label_list)}
|
||||
|
||||
def tokenize_function(examples):
|
||||
# max_length=None => use the model max length (it's actually the default)
|
||||
outputs = tokenizer(
|
||||
examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
|
||||
)
|
||||
if "label" in examples:
|
||||
outputs["labels"] = [label_to_id[l] for l in examples["label"]]
|
||||
return outputs
|
||||
|
||||
# Apply the method we just defined to all the examples in all the splits of the dataset
|
||||
tokenized_datasets = datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
remove_columns=["sentence1", "sentence2", "label"],
|
||||
)
|
||||
|
||||
def collate_fn(examples):
|
||||
# On TPU it's best to pad everything to the same length or training will be very slow.
|
||||
if accelerator.distributed_type == DistributedType.TPU:
|
||||
return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
|
||||
return tokenizer.pad(examples, padding="longest", return_tensors="pt")
|
||||
|
||||
# Instantiate dataloaders.
|
||||
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
|
||||
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
|
||||
|
||||
return train_dataloader, eval_dataloader
|
||||
EXCLUDE_EXAMPLES = [
|
||||
"cross_validation.py",
|
||||
"gradient_accumulation.py",
|
||||
"multi_process_metrics.py",
|
||||
"memory.py",
|
||||
"fsdp_with_peak_mem_tracking.py",
|
||||
"deepspeed_with_config_support.py",
|
||||
]
|
||||
|
||||
|
||||
class ExampleDifferenceTests(unittest.TestCase):
|
||||
@@ -147,99 +108,109 @@ class ExampleDifferenceTests(unittest.TestCase):
|
||||
cv_path = os.path.abspath(os.path.join("examples", "cv_example.py"))
|
||||
special_strings = [
|
||||
" " * 16 + "{\n\n",
|
||||
" " * 18 + '"accuracy": eval_metric["accuracy"],\n\n',
|
||||
" " * 18 + '"f1": eval_metric["f1"],\n\n',
|
||||
" " * 18 + '"train_loss": total_loss,\n\n',
|
||||
" " * 18 + '"epoch": epoch,\n\n',
|
||||
" " * 16 + "}\n",
|
||||
" " * 8,
|
||||
" " * 20 + '"accuracy": eval_metric["accuracy"],\n\n',
|
||||
" " * 20 + '"f1": eval_metric["f1"],\n\n',
|
||||
" " * 20 + '"train_loss": total_loss.item() / len(train_dataloader),\n\n',
|
||||
" " * 20 + '"epoch": epoch,\n\n',
|
||||
" " * 16 + "},\n\n",
|
||||
" " * 16 + "step=epoch,\n",
|
||||
" " * 12,
|
||||
]
|
||||
self.one_complete_example("complete_cv_example.py", True, cv_path, special_strings)
|
||||
self.one_complete_example("complete_cv_example.py", False, cv_path, special_strings)
|
||||
|
||||
|
||||
@mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "1"})
|
||||
class FeatureExamplesTests(TempDirTestCase):
|
||||
clear_on_setup = False
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
super().setUpClass()
|
||||
cls._tmpdir = tempfile.mkdtemp()
|
||||
cls.configPath = os.path.join(cls._tmpdir, "default_config.yml")
|
||||
|
||||
write_basic_config(save_location=cls.configPath)
|
||||
cls._launch_args = ["accelerate", "launch", "--config_file", cls.configPath]
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
super().tearDownClass()
|
||||
shutil.rmtree(cls._tmpdir)
|
||||
|
||||
def test_checkpointing_by_epoch(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
examples/by_feature/checkpointing.py
|
||||
--checkpointing_steps epoch
|
||||
--output_dir {self.tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "epoch_1")))
|
||||
run_command(self._launch_args + testargs)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "epoch_1")))
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_checkpointing_by_steps(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--checkpointing_steps 2
|
||||
examples/by_feature/checkpointing.py
|
||||
--checkpointing_steps 1
|
||||
--output_dir {self.tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "step_4")))
|
||||
_ = run_command(self._launch_args + testargs)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.tmpdir, "step_5")))
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_load_states_by_epoch(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
examples/by_feature/checkpointing.py
|
||||
--resume_from_checkpoint {os.path.join(self.tmpdir, "epoch_1")}
|
||||
""".split()
|
||||
dummy_results = {"accuracy": mock.ANY, "f1": mock.ANY}
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 0:", dummy_results)
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 1:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 2:", dummy_results)
|
||||
output = run_command(self._launch_args + testargs, return_stdout=True)
|
||||
self.assertNotIn("epoch 0:", output)
|
||||
self.assertNotIn("epoch 1:", output)
|
||||
self.assertIn("epoch 2:", output)
|
||||
|
||||
@mock.patch("checkpointing.get_dataloaders", mocked_dataloaders)
|
||||
def test_load_states_by_steps(self):
|
||||
testargs = f"""
|
||||
checkpointing.py
|
||||
--resume_from_checkpoint {os.path.join(self.tmpdir, "step_4")}
|
||||
examples/by_feature/checkpointing.py
|
||||
--resume_from_checkpoint {os.path.join(self.tmpdir, "step_5")}
|
||||
""".split()
|
||||
dummy_results = {"accuracy": mock.ANY, "f1": mock.ANY}
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
checkpointing.main()
|
||||
with self.assertRaises(AssertionError):
|
||||
mocked_print.assert_any_call("epoch 0:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 1:", dummy_results)
|
||||
mocked_print.assert_any_call("epoch 2:", dummy_results)
|
||||
output = run_command(self._launch_args + testargs, return_stdout=True)
|
||||
if torch.cuda.is_available():
|
||||
num_processes = torch.cuda.device_count()
|
||||
else:
|
||||
num_processes = 1
|
||||
if num_processes > 1:
|
||||
self.assertNotIn("epoch 0:", output)
|
||||
self.assertNotIn("epoch 1:", output)
|
||||
else:
|
||||
self.assertNotIn("epoch 0:", output)
|
||||
self.assertIn("epoch 1:", output)
|
||||
self.assertIn("epoch 2:", output)
|
||||
|
||||
@slow
|
||||
def test_cross_validation(self):
|
||||
testargs = """
|
||||
cross_validation.py
|
||||
examples/by_feature/cross_validation.py
|
||||
--num_folds 2
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
with mock.patch("accelerate.Accelerator.print") as mocked_print:
|
||||
cross_validation.main()
|
||||
call = mocked_print.mock_calls[-1]
|
||||
self.assertGreaterEqual(call.args[1]["accuracy"], 0.75)
|
||||
with mock.patch.dict(os.environ, {"TESTING_MOCKED_DATALOADERS": "0"}):
|
||||
output = run_command(self._launch_args + testargs, return_stdout=True)
|
||||
results = ast.literal_eval(re.findall("({.+})", output)[-1])
|
||||
self.assertGreaterEqual(results["accuracy"], 0.75)
|
||||
|
||||
@mock.patch("multi_process_metrics.get_dataloaders", mocked_dataloaders)
|
||||
def test_multi_process_metrics(self):
|
||||
testargs = ["multi_process_metrics.py"]
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
multi_process_metrics.main()
|
||||
testargs = ["examples/by_feature/multi_process_metrics.py"]
|
||||
run_command(self._launch_args + testargs)
|
||||
|
||||
@mock.patch("tracking.get_dataloaders", mocked_dataloaders)
|
||||
@require_trackers
|
||||
@mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
|
||||
def test_tracking(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
testargs = f"""
|
||||
tracking.py
|
||||
examples/by_feature/tracking.py
|
||||
--with_tracking
|
||||
--logging_dir {tmpdir}
|
||||
""".split()
|
||||
with mock.patch.object(sys, "argv", testargs):
|
||||
tracking.main()
|
||||
self.assertTrue(os.path.exists(os.path.join(tmpdir, "tracking")))
|
||||
run_command(self._launch_args + testargs)
|
||||
self.assertTrue(os.path.exists(os.path.join(tmpdir, "tracking")))
|
||||
|
||||
def test_gradient_accumulation(self):
|
||||
testargs = ["examples/by_feature/gradient_accumulation.py"]
|
||||
run_command(self._launch_args + testargs)
|
||||
|
||||
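The launch plumbing used by the class above, spelled out once as a hedged sketch. The example script and output directory are placeholders; write_basic_config and run_command are the helpers imported at the top of this file.

# Sketch only: generate a minimal machine config and launch an example script
# with it, as FeatureExamplesTests does in setUpClass. Paths are placeholders.
import os
import tempfile

from accelerate.test_utils.testing import run_command
from accelerate.utils import write_basic_config

tmpdir = tempfile.mkdtemp()
config_path = os.path.join(tmpdir, "default_config.yml")
write_basic_config(save_location=config_path)

launch_args = ["accelerate", "launch", "--config_file", config_path]
script_args = ["examples/by_feature/checkpointing.py", "--checkpointing_steps", "epoch", "--output_dir", tmpdir]
run_command(launch_args + script_args)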
55
tests/test_grad_sync.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
import accelerate
|
||||
from accelerate import debug_launcher
|
||||
from accelerate.test_utils import (
|
||||
execute_subprocess_async,
|
||||
require_cpu,
|
||||
require_multi_gpu,
|
||||
require_single_gpu,
|
||||
test_sync,
|
||||
)
|
||||
from accelerate.utils import get_launch_prefix, patch_environment
|
||||
|
||||
|
||||
class SyncScheduler(unittest.TestCase):
|
||||
def setUp(self):
|
||||
mod_file = inspect.getfile(accelerate.test_utils)
|
||||
self.test_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_sync.py"])
|
||||
|
||||
@require_cpu
|
||||
def test_gradient_sync_cpu_noop(self):
|
||||
debug_launcher(test_sync.main, num_processes=1)
|
||||
|
||||
@require_cpu
|
||||
def test_gradient_sync_cpu_multi(self):
|
||||
debug_launcher(test_sync.main)
|
||||
|
||||
@require_single_gpu
|
||||
def test_gradient_sync_gpu(self):
|
||||
test_sync.main()
|
||||
|
||||
@require_multi_gpu
|
||||
def test_gradient_sync_gpu_multi(self):
|
||||
print(f"Found {torch.cuda.device_count()} devices.")
|
||||
cmd = get_launch_prefix() + [f"--nproc_per_node={torch.cuda.device_count()}", self.test_file_path]
|
||||
with patch_environment(omp_num_threads=1):
|
||||
execute_subprocess_async(cmd, env=os.environ.copy())
|
||||
330
tests/test_hooks.py
Normal file
@@ -0,0 +1,330 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from accelerate.hooks import (
|
||||
AlignDevicesHook,
|
||||
ModelHook,
|
||||
SequentialHook,
|
||||
add_hook_to_module,
|
||||
attach_align_device_hook,
|
||||
remove_hook_from_module,
|
||||
remove_hook_from_submodules,
|
||||
)
|
||||
from accelerate.test_utils import require_multi_gpu
|
||||
|
||||
|
||||
class ModelForTest(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.linear1 = nn.Linear(3, 4)
|
||||
self.batchnorm = nn.BatchNorm1d(4)
|
||||
self.linear2 = nn.Linear(4, 5)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(self.batchnorm(self.linear1(x)))
|
||||
|
||||
|
||||
class PreForwardHook(ModelHook):
|
||||
def pre_forward(self, module, *args, **kwargs):
|
||||
return (args[0] + 1,) + args[1:], kwargs
|
||||
|
||||
|
||||
class PostForwardHook(ModelHook):
|
||||
def post_forward(self, module, output):
|
||||
return output + 1
|
||||
|
||||
|
||||
class HooksModelTester(unittest.TestCase):
|
||||
def test_add_and_remove_hooks(self):
|
||||
test_model = ModelForTest()
|
||||
test_hook = ModelHook()
|
||||
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
self.assertEqual(test_model._hf_hook, test_hook)
|
||||
self.assertTrue(hasattr(test_model, "_old_forward"))
|
||||
|
||||
# Check adding the hook did not change the name or the signature
|
||||
self.assertEqual(test_model.forward.__name__, "forward")
|
||||
self.assertListEqual(list(inspect.signature(test_model.forward).parameters), ["x"])
|
||||
|
||||
remove_hook_from_module(test_model)
|
||||
self.assertFalse(hasattr(test_model, "_hf_hook"))
|
||||
self.assertFalse(hasattr(test_model, "_old_forward"))
|
||||
|
||||
def test_pre_forward_hook_is_executed(self):
|
||||
test_model = ModelForTest()
|
||||
x = torch.randn(2, 3)
|
||||
expected = test_model(x + 1)
|
||||
expected2 = test_model(x + 2)
|
||||
|
||||
test_hook = PreForwardHook()
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
output1 = test_model(x)
|
||||
self.assertTrue(torch.allclose(output1, expected, atol=1e-5))
|
||||
|
||||
# Attaching a hook to a model that already has one replaces it; hooks do not chain
|
||||
test_hook = PreForwardHook()
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
output1 = test_model(x)
|
||||
self.assertTrue(torch.allclose(output1, expected, atol=1e-5))
|
||||
|
||||
# You need to use the sequential hook to chain two or more hooks
|
||||
test_hook = SequentialHook(PreForwardHook(), PreForwardHook())
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
|
||||
output2 = test_model(x)
|
||||
assert torch.allclose(output2, expected2, atol=1e-5)
|
||||
|
||||
def test_post_forward_hook_is_executed(self):
|
||||
test_model = ModelForTest()
|
||||
x = torch.randn(2, 3)
|
||||
output = test_model(x)
|
||||
|
||||
test_hook = PostForwardHook()
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
output1 = test_model(x)
|
||||
self.assertTrue(torch.allclose(output1, output + 1, atol=1e-5))
|
||||
|
||||
# Attaching a hook to a model that already has one replaces it; hooks do not chain
|
||||
test_hook = PostForwardHook()
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
output1 = test_model(x)
|
||||
self.assertTrue(torch.allclose(output1, output + 1, atol=1e-5))
|
||||
|
||||
# You need to use the sequential hook to chain two or more hooks
|
||||
test_hook = SequentialHook(PostForwardHook(), PostForwardHook())
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
|
||||
output2 = test_model(x)
|
||||
assert torch.allclose(output2, output + 2, atol=1e-5)
|
||||
|
||||
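A compact recap of the hook contract the two tests above establish. DoubleOutputHook is a made-up example class, not an accelerate API.

# Sketch only: a user-defined hook following the pre_forward/post_forward
# contract shown above. DoubleOutputHook is hypothetical, not part of accelerate.
import torch
import torch.nn as nn

from accelerate.hooks import ModelHook, add_hook_to_module, remove_hook_from_module

class DoubleOutputHook(ModelHook):
    def pre_forward(self, module, *args, **kwargs):
        # May rewrite the inputs before the wrapped forward runs.
        return args, kwargs

    def post_forward(self, module, output):
        # The returned value replaces the module's output.
        return output * 2

layer = nn.Linear(3, 4)
x = torch.randn(2, 3)
plain = layer(x)

add_hook_to_module(layer, DoubleOutputHook())  # re-attaching replaces the hook, it does not chain
assert torch.allclose(layer(x), plain * 2, atol=1e-5)

remove_hook_from_module(layer)  # restores the original forward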
def test_no_grad_in_hook(self):
|
||||
test_model = ModelForTest()
|
||||
x = torch.randn(2, 3)
|
||||
output = test_model(x)
|
||||
|
||||
test_hook = PostForwardHook()
|
||||
add_hook_to_module(test_model, test_hook)
|
||||
output1 = test_model(x)
|
||||
self.assertTrue(torch.allclose(output1, output + 1))
|
||||
self.assertTrue(output1.requires_grad)
|
||||
|
||||
test_hook.no_grad = True
|
||||
output1 = test_model(x)
|
||||
self.assertFalse(output1.requires_grad)
|
||||
|
||||
@require_multi_gpu
|
||||
def test_align_devices_as_model_parallelism(self):
|
||||
model = ModelForTest()
|
||||
# Everything is on CPU
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("cpu"))
|
||||
|
||||
# This will move each submodule on different devices
|
||||
add_hook_to_module(model.linear1, AlignDevicesHook(execution_device=0))
|
||||
add_hook_to_module(model.batchnorm, AlignDevicesHook(execution_device=0))
|
||||
add_hook_to_module(model.linear2, AlignDevicesHook(execution_device=1))
|
||||
|
||||
self.assertEqual(model.linear1.weight.device, torch.device(0))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device(0))
|
||||
self.assertEqual(model.batchnorm.running_mean.device, torch.device(0))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device(1))
|
||||
|
||||
# We can still make a forward pass. The input does not need to be on any particular device
|
||||
x = torch.randn(2, 3)
|
||||
output = model(x)
|
||||
self.assertEqual(output.device, torch.device(1))
|
||||
|
||||
# We can add a general hook to put back output on same device as input.
|
||||
add_hook_to_module(model, AlignDevicesHook(io_same_device=True))
|
||||
x = torch.randn(2, 3).to(0)
|
||||
output = model(x)
|
||||
self.assertEqual(output.device, torch.device(0))
|
||||
|
||||
def test_align_devices_as_cpu_offload(self):
|
||||
model = ModelForTest()
|
||||
|
||||
# Everything is on CPU
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("cpu"))
|
||||
|
||||
# This will move each submodule on different devices
|
||||
hook_kwargs = {"execution_device": 0 if torch.cuda.is_available() else "cpu", "offload": True}
|
||||
|
||||
add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
|
||||
add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
|
||||
add_hook_to_module(model.linear2, AlignDevicesHook(**hook_kwargs))
|
||||
|
||||
# Parameters have been offloaded, so they are on the meta device
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("meta"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("meta"))
|
||||
# Buffers are not included in the offload by default, so they stay on the execution device
|
||||
device = torch.device(hook_kwargs["execution_device"])
|
||||
self.assertEqual(model.batchnorm.running_mean.device, device)
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
output = model(x)
|
||||
self.assertEqual(output.device, device)
|
||||
|
||||
# Removing hooks loads back the weights in the model.
|
||||
remove_hook_from_module(model.linear1)
|
||||
remove_hook_from_module(model.batchnorm)
|
||||
remove_hook_from_module(model.linear2)
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("cpu"))
|
||||
|
||||
# Now test with buffers included in the offload
|
||||
hook_kwargs = {
|
||||
"execution_device": 0 if torch.cuda.is_available() else "cpu",
|
||||
"offload": True,
|
||||
"offload_buffers": True,
|
||||
}
|
||||
|
||||
add_hook_to_module(model.linear1, AlignDevicesHook(**hook_kwargs))
|
||||
add_hook_to_module(model.batchnorm, AlignDevicesHook(**hook_kwargs))
|
||||
add_hook_to_module(model.linear2, AlignDevicesHook(**hook_kwargs))
|
||||
|
||||
# Parameters have been offloaded, so on the meta device, buffers included
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("meta"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("meta"))
|
||||
self.assertEqual(model.batchnorm.running_mean.device, torch.device("meta"))
|
||||
|
||||
x = torch.randn(2, 3)
|
||||
output = model(x)
|
||||
self.assertEqual(output.device, device)
|
||||
|
||||
# Removing hooks loads back the weights in the model.
|
||||
remove_hook_from_module(model.linear1)
|
||||
remove_hook_from_module(model.batchnorm)
|
||||
remove_hook_from_module(model.linear2)
|
||||
self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
|
||||
self.assertEqual(model.linear2.weight.device, torch.device("cpu"))
|
||||
|
||||
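    # `attach_align_device_hook` is the recursive counterpart of the calls above: it walks the
    # model and attaches an `AlignDevicesHook` to (roughly) every submodule that directly holds
    # weights, here `linear1`, `batchnorm` and `linear2`. `remove_hook_from_submodules` undoes
    # this recursively.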
    def test_attach_align_device_hook_as_cpu_offload(self):
        model = ModelForTest()

        # Everything is on CPU
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))

        # This will offload each submodule and set its execution device
        execution_device = 0 if torch.cuda.is_available() else "cpu"
        attach_align_device_hook(model, execution_device=execution_device, offload=True)

        # Parameters have been offloaded, so they are on the meta device
        self.assertEqual(model.linear1.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
        self.assertEqual(model.linear2.weight.device, torch.device("meta"))
        # Buffers are not included in the offload by default, so they stay on the execution device
        device = torch.device(execution_device)
        self.assertEqual(model.batchnorm.running_mean.device, device)

        x = torch.randn(2, 3)
        output = model(x)
        self.assertEqual(output.device, device)

        # Removing the hooks loads the weights back into the model.
        remove_hook_from_submodules(model)
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))

        # Now test with buffers included in the offload
        attach_align_device_hook(model, execution_device=execution_device, offload=True, offload_buffers=True)

        # Parameters have been offloaded, so they are on the meta device, buffers included
        self.assertEqual(model.linear1.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
        self.assertEqual(model.linear2.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.running_mean.device, torch.device("meta"))

        x = torch.randn(2, 3)
        output = model(x)
        self.assertEqual(output.device, device)

        # Removing the hooks loads the weights back into the model.
        remove_hook_from_submodules(model)
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))

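    # `weights_map` lets the caller supply the offloaded copy of the weights explicitly, as a
    # mapping from parameter name to tensor. The test below simply reuses `model.state_dict()`,
    # but any mapping with the same keys (for instance one backed by weights saved to disk)
    # could presumably serve as well.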
    def test_attach_align_device_hook_as_cpu_offload_with_weight_map(self):
        model = ModelForTest()

        # Everything is on CPU
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))

        # This will offload each submodule and set its execution device
        execution_device = 0 if torch.cuda.is_available() else "cpu"
        attach_align_device_hook(
            model, execution_device=execution_device, offload=True, weights_map=model.state_dict()
        )

        # Parameters have been offloaded, so they are on the meta device
        self.assertEqual(model.linear1.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
        self.assertEqual(model.linear2.weight.device, torch.device("meta"))
        # Buffers are not included in the offload by default, so they stay on the execution device
        device = torch.device(execution_device)
        self.assertEqual(model.batchnorm.running_mean.device, device)

        x = torch.randn(2, 3)
        output = model(x)
        self.assertEqual(output.device, device)

        # Removing the hooks loads the weights back into the model.
        remove_hook_from_submodules(model)
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))

        # Now test with buffers included in the offload
        attach_align_device_hook(
            model,
            execution_device=execution_device,
            offload=True,
            weights_map=model.state_dict(),
            offload_buffers=True,
        )

        # Parameters have been offloaded, so they are on the meta device, buffers included
        self.assertEqual(model.linear1.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("meta"))
        self.assertEqual(model.linear2.weight.device, torch.device("meta"))
        self.assertEqual(model.batchnorm.running_mean.device, torch.device("meta"))

        x = torch.randn(2, 3)
        output = model(x)
        self.assertEqual(output.device, device)

        # Removing the hooks loads the weights back into the model.
        remove_hook_from_submodules(model)
        self.assertEqual(model.linear1.weight.device, torch.device("cpu"))
        self.assertEqual(model.batchnorm.weight.device, torch.device("cpu"))
        self.assertEqual(model.linear2.weight.device, torch.device("cpu"))